def _run_tst_fromBinary(self, useConfJson=False):
    # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
    # data will be a sequence of test data
    # all keys and all values in a test data item must be of the same length
    # keys get converted to ints regardless of raw input format
    DATA = [
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
    ]

    for itemidx, item in enumerate(DATA):
        outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
        os.mkdir(outSubdir)

        fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
        with open(fname, 'wb') as f:
            item.writeToFile(f)

        loader = SeriesLoader(self.sc)
        if not useConfJson:
            series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals,
                                       keyType=str(item.keyDtype), valueType=str(item.valDtype))
        else:
            # write configuration file
            conf = {'input': outSubdir,
                    'nkeys': item.nkeys, 'nvalues': item.nvals,
                    'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
            with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                json.dump(conf, f, indent=2)
            series = loader.fromBinary(outSubdir)

        seriesData = series.rdd.collect()

        expectedData = item.data
        assert_equals(len(expectedData), len(seriesData),
                      "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                      (itemidx, len(expectedData), len(seriesData)))

        for expected, actual in zip(expectedData, seriesData):
            expectedKeys = tuple(expected[0])
            expectedType = smallestFloatType(item.valDtype)
            expectedVals = array(expected[1], dtype=expectedType)
            assert_equals(expectedKeys, actual[0],
                          "Key mismatch in item %d; expected %s, got %s" %
                          (itemidx, str(expectedKeys), str(actual[0])))
            assert_true(allclose(expectedVals, actual[1]),
                        "Value mismatch in item %d; expected %s, got %s" %
                        (itemidx, str(expectedVals), str(actual[1])))
            assert_equals(expectedType, str(actual[1].dtype),
                          "Value type mismatch in item %d; expected %s, got %s" %
                          (itemidx, expectedType, str(actual[1].dtype)))
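
# For reference, the conf.json written in the useConfJson branch above comes out
# roughly as follows (values shown correspond to the first test item, which has
# one record with a 3-part key and 3 values; the 'input' path is a placeholder):
#
#   {
#     "input": "<outputdir>/input0",
#     "nkeys": 3,
#     "nvalues": 3,
#     "valuetype": "int16",
#     "keytype": "int16"
#   }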

def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)

    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)
    # os.mkdir(outsubdir)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes the data out flat in C (row-major) order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but the stack reader interprets the flat file in column-major order,
    # so reverse the dims here (and flip the keys back below)
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))

    roundtripped = underTest.fromBinary(outsubdir).collect()
    for serieskeys, seriesvalues in roundtripped:
        for seriesidx, seriesval in enumerate(seriesvalues):
            # print "seriesidx: %d; serieskeys: %s; seriesval: %g" % (seriesidx, serieskeys, seriesval)
            # flip indices again for the row- vs column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
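
# A minimal, self-contained illustration of the index flip exercised above
# (plain numpy; not part of the test): bytes dumped from a C-ordered array,
# when re-read with reversed dims in Fortran (column-major) order, come back
# with their indices reversed.
import numpy as np

a = np.arange(24, dtype='int16').reshape(2, 3, 4)                # C (row-major) order
b = np.fromstring(a.tostring(), dtype='int16').reshape((4, 3, 2), order='F')
assert a[1, 2, 3] == b[3, 2, 1]                                  # indices come back reversed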

def loadSeries(self, datapath, nkeys=None, nvalues=None, inputformat='binary', minPartitions=None,
               conffile='conf.json', keytype=None, valuetype=None):
    """
    Loads a Series object from data stored as text or binary files.

    Supports single files or multiple files stored on a local file system, a networked file system
    (mounted and available on all cluster nodes), Amazon S3, or HDFS.

    Parameters
    ----------
    datapath: string
        Path to data files or directory, specified as either a local filesystem path or in a URI-like
        format, including scheme. A datapath argument may include a single '*' wildcard character in the
        filename. Examples of valid datapaths include 'a/local/relative/directory/*.stack',
        's3n:///my-s3-bucket/data/mydatafile.tif', '/mnt/my/absolute/data/directory/', or
        'file:///mnt/another/data/directory/'.

    nkeys: int, optional (but required if `inputformat` is 'text')
        Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image
        timeseries data.) For text data, the number of keys must be specified in this parameter;
        for binary data, the number of keys must be specified either in this parameter or in a
        configuration file named by the `conffile` argument.

    nvalues: int, optional (but required if `inputformat` is 'text')
        Number of values expected to be read. For binary data, nvalues must be specified either in
        this parameter or in a configuration file named by the `conffile` argument.

    inputformat: {'text', 'binary'}, optional, default 'binary'
        Format of data to be read.

    minPartitions: int, optional
        Explicitly specify minimum number of Spark partitions to be generated from this data.
        Used only for text data. Default is to use the minParallelism attribute of the Spark
        context object.

    conffile: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and
        'valuetype'. If a file is not found at the given path, then the base directory given in
        `datapath` will also be checked. Parameters `nkeys` or `nvalues` that are specified as
        explicit arguments to this method will take priority over those found in conffile if both
        are present.

    keytype: string or numpy dtype, optional, default None
        Numerical type of keys; takes priority over the conf file.

    valuetype: string or numpy dtype, optional, default None
        Numerical type of values; takes priority over the conf file.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys
        an n-tuple of int, with n given by `nkeys` or the configuration passed in `conffile`.
        RDD values will be a numpy array of length `nvalues` (or as specified in the passed
        configuration file).
    """
    checkparams(inputformat, ['text', 'binary'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputformat.lower() == 'text':
        data = loader.fromText(datapath, nkeys=nkeys)
    else:
        # must be 'binary'
        data = loader.fromBinary(datapath, conffilename=conffile, nkeys=nkeys, nvalues=nvalues,
                                 keytype=keytype, valuetype=valuetype)

    return data
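
# Illustrative calls to loadSeries above (hedged sketch: the ThunderContext
# instance name `tsc` and all paths are placeholders, not from the source):
#
#   series = tsc.loadSeries('file:///mnt/data/series-dir/')   # binary; conf.json found in dir
#   series = tsc.loadSeries('hdfs:///data/series.txt', inputformat='text', nkeys=3)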

def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)

    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)
    # os.mkdir(outsubdir)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes the data out flat in C (row-major) order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but the stack reader interprets the flat file in column-major order,
    # so reverse the dims here (and flip the keys back below)
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))
    series = underTest.fromStack(insubdir, dims, datatype=str(arrays[0].dtype))

    roundtripped_series = underTest.fromBinary(outsubdir)
    roundtripped = roundtripped_series.collect()
    direct = series.collect()

    expecteddtype = str(smallest_float_type(arrays[0].dtype))
    assert_equals(expecteddtype, roundtripped_series.dtype)
    assert_equals(expecteddtype, series.dtype)
    assert_equals(expecteddtype, str(roundtripped[0][1].dtype))
    assert_equals(expecteddtype, str(direct[0][1].dtype))

    with open(os.path.join(outsubdir, "conf.json"), 'r') as fp:
        # check that the binary series file data type matches the input stack data type
        # (not yet converted to float), at least according to conf.json
        conf = json.load(fp)
        assert_equals(str(arrays[0].dtype), conf["valuetype"])

    for (serieskeys, seriesvalues), (directkeys, directvalues) in zip(roundtripped, direct):
        assert_equals(directkeys, serieskeys)
        # compare the numpy value arrays elementwise with numpy's array_equal;
        # a bare assert_equals on multi-element arrays raises an ambiguous-truth-value error
        assert_true(array_equal(directvalues, seriesvalues))
        for seriesidx, seriesval in enumerate(seriesvalues):
            # print "seriesidx: %d; serieskeys: %s; seriesval: %g" % (seriesidx, serieskeys, seriesval)
            # flip indices again for the row- vs column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
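
# Hedged sketch of the dtype expectation asserted above: one common way to pick
# a "smallest float type" is numpy's promote_types (whether smallest_float_type
# is implemented this way is an assumption), which yields the values the
# assertions rely on for small integer inputs.
import numpy as np

assert np.promote_types('int16', 'float32') == np.dtype('float32')   # int16 fits in float32
assert np.promote_types('int32', 'float32') == np.dtype('float64')   # int32 needs float64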

def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec, npartitions):
    testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(nimages, aryShape, dtypeSpec)
    loader = SeriesLoader(self.sc)
    series = loader.fromArrays(testArrays)

    saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
    series.repartition(npartitions)  # note: this does an elementwise shuffle! won't be in sorted order
    series.saveAsBinarySeries(saveDirPath)

    nnonemptyPartitions = 0
    for partitionList in series.rdd.glom().collect():
        if partitionList:
            nnonemptyPartitions += 1
    del partitionList
    nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

    roundtrippedSeries = loader.fromBinary(saveDirPath)

    with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
        conf = json.load(fp)

    # sorting is required here b/c of the randomization induced by the repartition.
    # orig and roundtripped will in general be different from each other, since roundtripped
    # will have (0, 0, 0) index as first element (since it will be the lexicographically first
    # file) while orig has only a 1 in npartitions chance of starting with (0, 0, 0) after repartition.
    expectedPackedAry = series.pack(sorting=True)
    actualPackedAry = roundtrippedSeries.pack(sorting=True)

    assert_true(array_equal(expectedPackedAry, actualPackedAry))
    assert_equals(nnonemptyPartitions, nsaveFiles)
    assert_equals(len(aryShape), conf["nkeys"])
    assert_equals(nimages, conf["nvalues"])
    assert_equals("int16", conf["keytype"])
    assert_equals(str(series.dtype), conf["valuetype"])

    # check that we have converted ourselves to an appropriate float after reloading
    assert_equals(str(smallestFloatType(series.dtype)), str(roundtrippedSeries.dtype))
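
# A minimal sketch of the glom()-based partition count used above (assumes a
# live SparkContext `sc`; not part of the test). glom() turns each partition
# into a list of its records, so counting non-empty lists counts non-empty
# partitions, which the test expects to match the number of .bin files written:
#
#   parts = sc.parallelize(range(4), 8).glom().collect()   # 4 records spread over 8 partitions
#   nonempty = sum(1 for p in parts if p)                  # == 4; the other 4 partitions are empty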

def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None):
    """
    Loads a Series object from data stored as binary, text, npy, or mat.

    For binary and text, supports single files or multiple files stored on a local file system,
    a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
    For local formats (npy and mat), only local file systems are currently supported.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, as either a local filesystem path or a URI.
        May include a single '*' wildcard in the filename. Examples of valid dataPaths include
        'local/directory/*.stack', 's3n:///my-s3-bucket/data/', or 'file:///mnt/another/directory/'.

    nkeys: int, optional (required if `inputFormat` is 'text'), default = None
        Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
        text data; can be specified here or in a configuration file for binary data.

    nvalues: int, optional (required if `inputFormat` is 'text')
        Number of values per record. Must be specified here or in a configuration file for
        binary data.

    inputFormat: {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
        Format of data to be read.

    minPartitions: int, optional, default = SparkContext.minParallelism
        Minimum number of Spark partitions to use; only for text.

    confFilename: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keyType', and
        'valueType'. If a file is not found at the given path, then the base directory in
        `dataPath` will be checked. Explicit parameters will override the conf file.

    keyType: string or numpy dtype, optional, default = None
        Numerical type of keys; will override the conf file.

    valueType: string or numpy dtype, optional, default = None
        Numerical type of values; will override the conf file.

    keyPath: string, optional, default = None
        Path to file with keys when loading from npy or mat.

    varName: str, optional, default = None
        Variable name to load (for MAT files only).

    Returns
    -------
    data: thunder.rdds.Series
        A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
    """
    checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'binary':
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)
    elif inputFormat.lower() == 'text':
        if nkeys is None:
            raise Exception('Must provide number of keys per record for loading from text')
        data = loader.fromText(dataPath, nkeys=nkeys)
    elif inputFormat.lower() == 'npy':
        data = loader.fromNpyLocal(dataPath, keyPath)
    else:
        if varName is None:
            raise Exception('Must provide variable name for loading MAT files')
        data = loader.fromMatLocal(dataPath, varName, keyPath)

    return data
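
# Illustrative calls for the local-format paths above (hedged sketch: `tsc`,
# the file paths, and the MAT variable name 'ts' are placeholders, not from
# the source):
#
#   data = tsc.loadSeries('/local/path/values.npy', inputFormat='npy',
#                         keyPath='/local/path/keys.npy')
#   data = tsc.loadSeries('/local/path/data.mat', inputFormat='mat', varName='ts')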

def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None):
    """
    Loads a Series object from data stored as text or binary files.

    Supports single files or multiple files stored on a local file system, a networked file system
    (mounted and available on all cluster nodes), Amazon S3, or HDFS.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local filesystem path or in a URI-like
        format, including scheme. A dataPath argument may include a single '*' wildcard character in the
        filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack',
        's3n:///my-s3-bucket/data/mydatafile.tif', '/mnt/my/absolute/data/directory/', or
        'file:///mnt/another/data/directory/'.

    nkeys: int, optional (but required if `inputFormat` is 'text')
        Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image
        timeseries data.) For text data, the number of keys must be specified in this parameter;
        for binary data, the number of keys must be specified either in this parameter or in a
        configuration file named by the `confFilename` argument.

    nvalues: int, optional (but required if `inputFormat` is 'text')
        Number of values expected to be read. For binary data, nvalues must be specified either in
        this parameter or in a configuration file named by the `confFilename` argument.

    inputFormat: {'text', 'binary'}, optional, default 'binary'
        Format of data to be read.

    minPartitions: int, optional
        Explicitly specify minimum number of Spark partitions to be generated from this data.
        Used only for text data. Default is to use the minParallelism attribute of the Spark
        context object.

    confFilename: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and
        'valuetype'. If a file is not found at the given path, then the base directory given in
        `dataPath` will also be checked. Parameters `nkeys` or `nvalues` that are specified as
        explicit arguments to this method will take priority over those found in the conf file
        if both are present.

    keyType: string or numpy dtype, optional, default None
        Numerical type of keys; takes priority over the conf file.

    valueType: string or numpy dtype, optional, default None
        Numerical type of values; takes priority over the conf file.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys
        an n-tuple of int, with n given by `nkeys` or the configuration passed in `confFilename`.
        RDD values will be a numpy array of length `nvalues` (or as specified in the passed
        configuration file).
    """
    checkParams(inputFormat, ['text', 'binary'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'text':
        data = loader.fromText(dataPath, nkeys=nkeys)
    else:
        # must be 'binary'
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)

    return data