def _run_tst_fromBinary(self, useConfJson=False):
    # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
    # data will be a sequence of test data
    # all keys and all values in a test data item must be of the same length
    # keys get converted to ints regardless of raw input format
    DATA = [
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
    ]

    for itemidx, item in enumerate(DATA):
        outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
        os.mkdir(outSubdir)

        fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
        with open(fname, 'wb') as f:
            item.writeToFile(f)

        loader = SeriesLoader(self.sc)
        if not useConfJson:
            series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals,
                                       keyType=str(item.keyDtype), valueType=str(item.valDtype))
        else:
            # write configuration file
            conf = {'input': outSubdir,
                    'nkeys': item.nkeys, 'nvalues': item.nvals,
                    'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
            with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                json.dump(conf, f, indent=2)
            series = loader.fromBinary(outSubdir)

        seriesData = series.rdd.collect()

        expectedData = item.data
        assert_equals(len(expectedData), len(seriesData),
                      "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                      (itemidx, len(expectedData), len(seriesData)))

        for expected, actual in zip(expectedData, seriesData):
            expectedKeys = tuple(expected[0])
            expectedType = smallestFloatType(item.valDtype)
            expectedVals = array(expected[1], dtype=expectedType)
            assert_equals(expectedKeys, actual[0],
                          "Key mismatch in item %d; expected %s, got %s" %
                          (itemidx, str(expectedKeys), str(actual[0])))
            assert_true(allclose(expectedVals, actual[1]),
                        "Value mismatch in item %d; expected %s, got %s" %
                        (itemidx, str(expectedVals), str(actual[1])))
            assert_equals(expectedType, str(actual[1].dtype),
                          "Value type mismatch in item %d; expected %s, got %s" %
                          (itemidx, expectedType, str(actual[1].dtype)))
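
# For reference, the conf.json written in the useConfJson branch above comes out
# roughly as follows (values shown correspond to the first test item, which has
# one record with a 3-part key and 3 values; the 'input' path is a placeholder):
#
#   {
#     "input": "<outputdir>/input0",
#     "nkeys": 3,
#     "nvalues": 3,
#     "valuetype": "int16",
#     "keytype": "int16"
#   }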

def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)

    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)
    # os.mkdir(outsubdir)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes the data out flat in C (row-major) order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but the stack reader interprets the flat file in column-major order,
    # so reverse the dims here (and flip the keys back below)
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))

    roundtripped = underTest.fromBinary(outsubdir).collect()
    for serieskeys, seriesvalues in roundtripped:
        for seriesidx, seriesval in enumerate(seriesvalues):
            # print "seriesidx: %d; serieskeys: %s; seriesval: %g" % (seriesidx, serieskeys, seriesval)
            # flip indices again for the row- vs column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
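
# A minimal, self-contained illustration of the index flip exercised above
# (plain numpy; not part of the test): bytes dumped from a C-ordered array,
# when re-read with reversed dims in Fortran (column-major) order, come back
# with their indices reversed.
import numpy as np

a = np.arange(24, dtype='int16').reshape(2, 3, 4)                # C (row-major) order
b = np.fromstring(a.tostring(), dtype='int16').reshape((4, 3, 2), order='F')
assert a[1, 2, 3] == b[3, 2, 1]                                  # indices come back reversed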

def loadSeries(self, datapath, nkeys=None, nvalues=None, inputformat='binary', minPartitions=None,
               conffile='conf.json', keytype=None, valuetype=None):
    """
    Loads a Series object from data stored as text or binary files.

    Supports single files or multiple files stored on a local file system, a networked file system
    (mounted and available on all cluster nodes), Amazon S3, or HDFS.

    Parameters
    ----------
    datapath: string
        Path to data files or directory, specified as either a local filesystem path or in a URI-like
        format, including scheme. A datapath argument may include a single '*' wildcard character in the
        filename. Examples of valid datapaths include 'a/local/relative/directory/*.stack',
        's3n:///my-s3-bucket/data/mydatafile.tif', '/mnt/my/absolute/data/directory/', or
        'file:///mnt/another/data/directory/'.

    nkeys: int, optional (but required if `inputformat` is 'text')
        Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image
        timeseries data.) For text data, the number of keys must be specified in this parameter;
        for binary data, the number of keys must be specified either in this parameter or in a
        configuration file named by the `conffile` argument.

    nvalues: int, optional (but required if `inputformat` is 'text')
        Number of values expected to be read. For binary data, nvalues must be specified either in
        this parameter or in a configuration file named by the `conffile` argument.

    inputformat: {'text', 'binary'}, optional, default 'binary'
        Format of data to be read.

    minPartitions: int, optional
        Explicitly specify minimum number of Spark partitions to be generated from this data.
        Used only for text data. Default is to use the minParallelism attribute of the Spark
        context object.

    conffile: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and
        'valuetype'. If a file is not found at the given path, then the base directory given in
        `datapath` will also be checked. Parameters `nkeys` or `nvalues` that are specified as
        explicit arguments to this method will take priority over those found in conffile if both
        are present.

    keytype: string or numpy dtype, optional, default None
        Numerical type of keys; takes priority over the conf file.

    valuetype: string or numpy dtype, optional, default None
        Numerical type of values; takes priority over the conf file.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys
        an n-tuple of int, with n given by `nkeys` or the configuration passed in `conffile`.
        RDD values will be a numpy array of length `nvalues` (or as specified in the passed
        configuration file).
    """
    checkparams(inputformat, ['text', 'binary'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputformat.lower() == 'text':
        data = loader.fromText(datapath, nkeys=nkeys)
    else:
        # must be 'binary'
        data = loader.fromBinary(datapath, conffilename=conffile, nkeys=nkeys, nvalues=nvalues,
                                 keytype=keytype, valuetype=valuetype)

    return data
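
# Illustrative calls to loadSeries above (hedged sketch: the ThunderContext
# instance name `tsc` and all paths are placeholders, not from the source):
#
#   series = tsc.loadSeries('file:///mnt/data/series-dir/')   # binary; conf.json found in dir
#   series = tsc.loadSeries('hdfs:///data/series.txt', inputformat='text', nkeys=3)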

def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)

    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)
    # os.mkdir(outsubdir)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes the data out flat in C (row-major) order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but the stack reader interprets the flat file in column-major order,
    # so reverse the dims here (and flip the keys back below)
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))
    series = underTest.fromStack(insubdir, dims, datatype=str(arrays[0].dtype))

    roundtripped_series = underTest.fromBinary(outsubdir)
    roundtripped = roundtripped_series.collect()
    direct = series.collect()

    expecteddtype = str(smallest_float_type(arrays[0].dtype))
    assert_equals(expecteddtype, roundtripped_series.dtype)
    assert_equals(expecteddtype, series.dtype)
    assert_equals(expecteddtype, str(roundtripped[0][1].dtype))
    assert_equals(expecteddtype, str(direct[0][1].dtype))

    with open(os.path.join(outsubdir, "conf.json"), 'r') as fp:
        # check that the binary series file data type matches the input stack data type
        # (not yet converted to float), at least according to conf.json
        conf = json.load(fp)
        assert_equals(str(arrays[0].dtype), conf["valuetype"])

    for (serieskeys, seriesvalues), (directkeys, directvalues) in zip(roundtripped, direct):
        assert_equals(directkeys, serieskeys)
        # compare the numpy value arrays elementwise with numpy's array_equal;
        # a bare assert_equals on multi-element arrays raises an ambiguous-truth-value error
        assert_true(array_equal(directvalues, seriesvalues))
        for seriesidx, seriesval in enumerate(seriesvalues):
            # print "seriesidx: %d; serieskeys: %s; seriesval: %g" % (seriesidx, serieskeys, seriesval)
            # flip indices again for the row- vs column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
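
# Hedged sketch of the dtype expectation asserted above: one common way to pick
# a "smallest float type" is numpy's promote_types (whether smallest_float_type
# is implemented this way is an assumption), which yields the values the
# assertions rely on for small integer inputs.
import numpy as np

assert np.promote_types('int16', 'float32') == np.dtype('float32')   # int16 fits in float32
assert np.promote_types('int32', 'float32') == np.dtype('float64')   # int32 needs float64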

def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec, npartitions):
    testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(nimages, aryShape, dtypeSpec)
    loader = SeriesLoader(self.sc)
    series = loader.fromArrays(testArrays)

    saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
    series.repartition(npartitions)  # note: this does an elementwise shuffle! won't be in sorted order
    series.saveAsBinarySeries(saveDirPath)

    nnonemptyPartitions = 0
    for partitionList in series.rdd.glom().collect():
        if partitionList:
            nnonemptyPartitions += 1
    del partitionList
    nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

    roundtrippedSeries = loader.fromBinary(saveDirPath)

    with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
        conf = json.load(fp)

    # sorting is required here b/c of the randomization induced by the repartition.
    # orig and roundtripped will in general be different from each other, since roundtripped
    # will have (0, 0, 0) index as first element (since it will be the lexicographically first
    # file) while orig has only a 1 in npartitions chance of starting with (0, 0, 0) after repartition.
    expectedPackedAry = series.pack(sorting=True)
    actualPackedAry = roundtrippedSeries.pack(sorting=True)

    assert_true(array_equal(expectedPackedAry, actualPackedAry))
    assert_equals(nnonemptyPartitions, nsaveFiles)
    assert_equals(len(aryShape), conf["nkeys"])
    assert_equals(nimages, conf["nvalues"])
    assert_equals("int16", conf["keytype"])
    assert_equals(str(series.dtype), conf["valuetype"])

    # check that we have converted ourselves to an appropriate float after reloading
    assert_equals(str(smallestFloatType(series.dtype)), str(roundtrippedSeries.dtype))
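
# A minimal sketch of the glom()-based partition count used above (assumes a
# live SparkContext `sc`; not part of the test). glom() turns each partition
# into a list of its records, so counting non-empty lists counts non-empty
# partitions, which the test expects to match the number of .bin files written:
#
#   parts = sc.parallelize(range(4), 8).glom().collect()   # 4 records spread over 8 partitions
#   nonempty = sum(1 for p in parts if p)                  # == 4; the other 4 partitions are empty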

def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None):
    """
    Loads a Series object from data stored as binary, text, npy, or mat.

    For binary and text, supports single files or multiple files stored on a local file system,
    a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
    For local formats (npy and mat), only local file systems are currently supported.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, as either a local filesystem path or a URI.
        May include a single '*' wildcard in the filename. Examples of valid dataPaths include
        'local/directory/*.stack', 's3n:///my-s3-bucket/data/', or 'file:///mnt/another/directory/'.

    nkeys: int, optional (required if `inputFormat` is 'text'), default = None
        Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
        text data; can be specified here or in a configuration file for binary data.

    nvalues: int, optional (required if `inputFormat` is 'text')
        Number of values per record. Must be specified here or in a configuration file for
        binary data.

    inputFormat: {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
        Format of data to be read.

    minPartitions: int, optional, default = SparkContext.minParallelism
        Minimum number of Spark partitions to use; only for text.

    confFilename: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keyType', and
        'valueType'. If a file is not found at the given path, then the base directory in
        `dataPath` will be checked. Explicit parameters will override the conf file.

    keyType: string or numpy dtype, optional, default = None
        Numerical type of keys; will override the conf file.

    valueType: string or numpy dtype, optional, default = None
        Numerical type of values; will override the conf file.

    keyPath: string, optional, default = None
        Path to file with keys when loading from npy or mat.

    varName: str, optional, default = None
        Variable name to load (for MAT files only).

    Returns
    -------
    data: thunder.rdds.Series
        A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
    """
    checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'binary':
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)
    elif inputFormat.lower() == 'text':
        if nkeys is None:
            raise Exception('Must provide number of keys per record for loading from text')
        data = loader.fromText(dataPath, nkeys=nkeys)
    elif inputFormat.lower() == 'npy':
        data = loader.fromNpyLocal(dataPath, keyPath)
    else:
        if varName is None:
            raise Exception('Must provide variable name for loading MAT files')
        data = loader.fromMatLocal(dataPath, varName, keyPath)

    return data
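
# Illustrative calls for the local-format paths above (hedged sketch: `tsc`,
# the file paths, and the MAT variable name 'ts' are placeholders, not from
# the source):
#
#   data = tsc.loadSeries('/local/path/values.npy', inputFormat='npy',
#                         keyPath='/local/path/keys.npy')
#   data = tsc.loadSeries('/local/path/data.mat', inputFormat='mat', varName='ts')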

def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None):
    """
    Loads a Series object from data stored as text or binary files.

    Supports single files or multiple files stored on a local file system, a networked file system
    (mounted and available on all cluster nodes), Amazon S3, or HDFS.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local filesystem path or in a URI-like
        format, including scheme. A dataPath argument may include a single '*' wildcard character in the
        filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack',
        's3n:///my-s3-bucket/data/mydatafile.tif', '/mnt/my/absolute/data/directory/', or
        'file:///mnt/another/data/directory/'.

    nkeys: int, optional (but required if `inputFormat` is 'text')
        Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image
        timeseries data.) For text data, the number of keys must be specified in this parameter;
        for binary data, the number of keys must be specified either in this parameter or in a
        configuration file named by the `confFilename` argument.

    nvalues: int, optional (but required if `inputFormat` is 'text')
        Number of values expected to be read. For binary data, nvalues must be specified either in
        this parameter or in a configuration file named by the `confFilename` argument.

    inputFormat: {'text', 'binary'}, optional, default 'binary'
        Format of data to be read.

    minPartitions: int, optional
        Explicitly specify minimum number of Spark partitions to be generated from this data.
        Used only for text data. Default is to use the minParallelism attribute of the Spark
        context object.

    confFilename: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and
        'valuetype'. If a file is not found at the given path, then the base directory given in
        `dataPath` will also be checked. Parameters `nkeys` or `nvalues` that are specified as
        explicit arguments to this method will take priority over those found in the conf file
        if both are present.

    keyType: string or numpy dtype, optional, default None
        Numerical type of keys; takes priority over the conf file.

    valueType: string or numpy dtype, optional, default None
        Numerical type of values; takes priority over the conf file.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys
        an n-tuple of int, with n given by `nkeys` or the configuration passed in `confFilename`.
        RDD values will be a numpy array of length `nvalues` (or as specified in the passed
        configuration file).
    """
    checkParams(inputFormat, ['text', 'binary'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'text':
        data = loader.fromText(dataPath, nkeys=nkeys)
    else:
        # must be 'binary'
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)

    return data