Example No. 1
    def _run_tst_fromBinary(self, useConfJson=False):
        # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
        # data will be a sequence of test data
        # all keys and all values in a test data item must be of the same length
        # keys get converted to ints regardless of raw input format
        DATA = [
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
            SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        ]

        for itemidx, item in enumerate(DATA):
            outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
            os.mkdir(outSubdir)

            fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
            with open(fname, 'wb') as f:
                item.writeToFile(f)

            loader = SeriesLoader(self.sc)
            if not useConfJson:
                series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals, keyType=str(item.keyDtype),
                                           valueType=str(item.valDtype))
            else:
                # write configuration file
                conf = {'input': outSubdir,
                        'nkeys': item.nkeys, 'nvalues': item.nvals,
                        'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
                with open(os.path.join(outSubdir, "conf.json"), 'w') as f:
                    json.dump(conf, f, indent=2)
                series = loader.fromBinary(outSubdir)

            seriesData = series.rdd.collect()

            expectedData = item.data
            assert_equals(len(expectedData), len(seriesData),
                          "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                          (itemidx, len(expectedData), len(seriesData)))

            for expected, actual in zip(expectedData, seriesData):
                expectedKeys = tuple(expected[0])
                expectedType = smallestFloatType(item.valDtype)
                expectedVals = array(expected[1], dtype=expectedType)
                assert_equals(expectedKeys, actual[0],
                              "Key mismatch in item %d; expected %s, got %s" %
                              (itemidx, str(expectedKeys), str(actual[0])))
                assert_true(allclose(expectedVals, actual[1]),
                            "Value mismatch in item %d; expected %s, got %s" %
                            (itemidx, str(expectedVals), str(actual[1])))
                assert_equals(expectedType, str(actual[1].dtype),
                              "Value type mismatch in item %d; expected %s, got %s" %
                              (itemidx, expectedType, str(actual[1].dtype)))
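
The conf.json branch of this test relies on fromBinary discovering the record layout from a configuration file stored alongside the binary data. As a reference, here is a minimal hedged sketch of writing such a file before calling fromBinary; the field names ('input', 'nkeys', 'nvalues', 'keytype', 'valuetype') mirror the conf dict built in the test above, while the directory path and the concrete counts and dtypes are illustrative assumptions only.

import json
import os

# Illustrative directory that would hold the flat binary series files.
dataDir = '/tmp/series-data'
if not os.path.isdir(dataDir):
    os.makedirs(dataDir)

# Same fields as the conf dict assembled in the test above:
# record layout (key/value counts) plus numpy dtypes for keys and values.
conf = {
    'input': dataDir,
    'nkeys': 3,              # e.g. (x, y, z) coordinate keys
    'nvalues': 12,           # values per record
    'keytype': 'int16',
    'valuetype': 'float32',
}

# Written next to the data so that loader.fromBinary(dataDir) can pick it up
# without nkeys/nvalues/keyType/valueType being passed explicitly.
with open(os.path.join(dataDir, 'conf.json'), 'w') as f:
    json.dump(conf, f, indent=2)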
Example No. 2
    def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
                   maxPartitionSize='32mb', confFilename='conf.json', keyType=None, valueType=None, keyPath=None,
                   varName=None):
        """
        Loads a Series object from data stored as binary, text, npy, or mat.

        For binary and text, supports single files or multiple files stored on a local file system,
        a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
        For local formats (npy and mat), only local file systems are currently supported.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack', 's3n:///my-s3-bucket/data/', or 'file:///mnt/another/directory/'.

        nkeys: int, optional (required if `inputFormat` is 'text'), default = None
            Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
            text data; can be specified here or in a configuration file for binary data.

        nvalues: int, optional (required if `inputFormat` is 'text'), default = None
            Number of values per record. Must be specified here or in a configuration file for binary data.

        inputFormat: {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
            Format of the data to be read.

        minPartitions: int, optional, default = SparkContext.minParallelism
            Minimum number of Spark partitions to use. Applies only to text data.

        maxPartitionSize: string, optional, default = '32mb'
            Maximum size of partitions as a Java-style memory string, e.g. '32mb' or '64mb';
            indirectly controls the number of Spark partitions. Applies only to binary data.

        confFilename: string, optional, default = 'conf.json'
            Path to a JSON file with configuration options including 'nkeys', 'nvalues',
            'keyType', and 'valueType'. If a file is not found at the given path, the base
            directory of 'dataPath' will be checked. Parameters passed explicitly to this method
            override values found in the conf file.

        keyType: string or numpy dtype, optional, default = None
            Numerical type of keys, will override conf file.

        valueType: string or numpy dtype, optional, default = None
            Numerical type of values, will override conf file.

        keyPath: string, optional, default = None
            Path to file with keys when loading from npy or mat.

        varName: string, optional, default = None
            Variable name to load (MAT files only).

        Returns
        -------
        data: lambdaimage.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
        """
        checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

        from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'binary':
            data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                     keyType=keyType, valueType=valueType, maxPartitionSize=maxPartitionSize)
        elif inputFormat.lower() == 'text':
            if nkeys is None:
                raise Exception('Must provide number of keys per record for loading from text')
            data = loader.fromText(dataPath, nkeys=nkeys)
        elif inputFormat.lower() == 'npy':
            data = loader.fromNpyLocal(dataPath, keyPath)
        else:
            if varName is None:
                raise Exception('Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataPath, varName, keyPath)

        return data
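
As a quick reference for the parameters documented above, here is a hedged usage sketch. It assumes a context object (called tsc here) exposing this loadSeries method; all paths, key/value counts, and dtypes below are illustrative assumptions rather than values from the original source.

# `tsc` is assumed to be an instance of the context class that defines loadSeries.

# Binary input: layout may come from a conf.json in the data directory, or be
# given explicitly as keyword arguments (explicit arguments override the conf file).
series = tsc.loadSeries('hdfs:///data/series/', inputFormat='binary',
                        nkeys=3, nvalues=240, keyType='int16', valueType='float32')

# Text input: nkeys must be provided.
seriesTxt = tsc.loadSeries('file:///mnt/data/series/*.txt', inputFormat='text', nkeys=3)

# MAT input (local file system only): varName is required; keyPath optionally supplies keys.
seriesMat = tsc.loadSeries('/local/path/data.mat', inputFormat='mat',
                           varName='ts', keyPath='/local/path/keys.npy')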