def test_toBlocksBySlices(self):
    narys = 3
    arys, sh, sz = _generate_test_arrays(narys)
    imagedata = ImagesLoader(self.sc).fromArrays(arys)

    test_params = [
        (1, 1, 1), (1, 1, 2), (1, 1, 3),
        (1, 2, 1), (1, 2, 2), (1, 2, 3),
        (1, 3, 1), (1, 3, 2), (1, 3, 3),
        (2, 1, 1), (2, 1, 2), (2, 1, 3),
        (2, 2, 1), (2, 2, 2), (2, 2, 3),
        (2, 3, 1), (2, 3, 2), (2, 3, 3)]
    for bpd in test_params:
        blocks = imagedata._toBlocksBySplits(bpd).collect()

        expectednuniquekeys = reduce(mul, bpd)
        expectedvalsperkey = narys

        keystocounts = Counter([kv[0] for kv in blocks])
        assert_equals(expectednuniquekeys, len(keystocounts))
        assert_equals([expectedvalsperkey] * expectednuniquekeys, keystocounts.values())

        gatheredary = None
        for _, block in blocks:
            if gatheredary is None:
                gatheredary = zeros(block.origshape, dtype='int16')
            gatheredary[block.origslices] = block.values

        for i in xrange(narys):
            assert_true(array_equal(arys[i], gatheredary[i]))
def test_toSeriesWithInefficientSplitAndSortedPack(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((4, 2))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks((2, 1), units="s").toSeries()
    seriesVals = series.collect()
    seriesAry = series.pack(sorting=True)

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((0, 1), seriesVals[2][0])
    assert_equals((1, 1), seriesVals[3][0])  # end of first block
    # beginning of second block
    assert_equals((2, 0), seriesVals[4][0])
    assert_equals((3, 0), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(ary.shape, series.dims.count)

    # check that values are in expected order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
    assert_true(array_equal(ary[:2, :].ravel(order="F"), collectedVals[:4]))  # first block
    assert_true(array_equal(ary[2:4, :].ravel(order="F"), collectedVals[4:]))  # second block

    # check that packing returns original array (after sort)
    assert_true(array_equal(ary, seriesAry))
def test_castToFloat(self):
    arys, shape, size = _generateTestArrays(2, "uint8")
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    castData = imageData.astype("smallfloat")

    assert_equals("float16", str(castData.dtype))
    assert_equals("float16", str(castData.first()[1].dtype))
def test_fromMultiTimepointStacks(self):
    ary = arange(16, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
    ary2 = arange(16, 32, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
    ary.tofile(os.path.join(self.outputdir, "test01.stack"))
    ary2.tofile(os.path.join(self.outputdir, "test02.stack"))

    image = ImagesLoader(self.sc).fromStack(self.outputdir, dtype="uint8", dims=(2, 2, 4), nplanes=2)
    collectedImage = image.collect()

    # we don't expect to have nrecords cached, since we get an unknown number of images per file
    assert_true(image._nrecords is None)
    assert_equals(4, image.nrecords)
    assert_equals(4, len(collectedImage))
    # check keys:
    assert_equals(0, collectedImage[0][0])
    assert_equals(1, collectedImage[1][0])
    assert_equals(2, collectedImage[2][0])
    assert_equals(3, collectedImage[3][0])
    # check values:
    assert_true(array_equal(ary[:2].T, collectedImage[0][1]))
    assert_true(array_equal(ary[2:].T, collectedImage[1][1]))
    assert_true(array_equal(ary2[:2].T, collectedImage[2][1]))
    assert_true(array_equal(ary2[2:].T, collectedImage[3][1]))

    # 3 planes does not divide 4
    assert_raises(ValueError, ImagesLoader(self.sc).fromStack, self.outputdir, dtype="uint8",
                  dims=(2, 2, 4), nplanes=3)
def loadImagesFromArray(self, values, npartitions=None):
    """
    Load Images data from a local array

    Parameters
    ----------
    values : list or ndarray
        A list of 2d or 3d numpy arrays, or a single 3d or 4d numpy array

    npartitions : positive int, optional, default = None
        Number of partitions for RDD, if unspecified will use default parallelism.
    """
    from numpy import ndarray, asarray

    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(self._sc)

    if isinstance(values, list):
        values = asarray(values)

    if isinstance(values, ndarray) and values.ndim > 2:
        values = list(values)

    if not npartitions:
        npartitions = self._sc.defaultParallelism

    return loader.fromArrays(values, npartitions=npartitions)
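# Usage sketch for loadImagesFromArray above. Assumes a ThunderContext instance
# named `tsc`; the variable name and the asserted record count are illustrative.
def _example_loadImagesFromArray(tsc):
    from numpy import arange
    # a single 3d array yields one Images record per slice along the first axis
    imgs = tsc.loadImagesFromArray(arange(24, dtype='int16').reshape((3, 2, 4)))
    assert imgs.nrecords == 3
    return imgs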
def _run_tst_toSeriesWithSplitsAndPack(self, strategy):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((4, 2))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks(strategy).toSeries()
    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0), seriesVals[2][0])
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((1, 1), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(ary.shape, series.dims.count)

    # check that values are in Fortran-convention order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(order='F'), collectedVals))

    # check that packing returns original array
    assert_true(array_equal(ary, seriesAry))
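# The Fortran-order convention checked above, in miniature: series keys advance
# fastest along the first axis, matching numpy's order='F' ravel. A standalone
# illustration, independent of Spark.
def _example_fortranOrder():
    from numpy import arange, array_equal
    ary = arange(8).reshape((4, 2))
    # columns are read out one at a time: [0, 2, 4, 6] then [1, 3, 5, 7]
    assert array_equal(ary.ravel(order='F'), [0, 2, 4, 6, 1, 3, 5, 7])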
def test_toSeriesBySlices(self):
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    imageData.cache()

    testParams = [
        (1, 1, 1), (1, 1, 2), (1, 1, 3),
        (1, 2, 1), (1, 2, 2), (1, 2, 3),
        (1, 3, 1), (1, 3, 2), (1, 3, 3),
        (2, 1, 1), (2, 1, 2), (2, 1, 3),
        (2, 2, 1), (2, 2, 2), (2, 2, 3),
        (2, 3, 1), (2, 3, 2), (2, 3, 3)]
    for bpd in testParams:
        series = imageData.toBlocks(bpd, units="s").toSeries().collect()
        self.evaluateSeries(arys, series, sz)
def setUp(self):
    super(TestBlockKeys, self).setUp()
    shape = (30, 30)
    arys = [ones(shape) for _ in range(0, 3)]
    data = ImagesLoader(self.sc).fromArrays(arys)
    self.blocks = data.toBlocks(size=(10, 10)).collect()
    self.keys = [k for k, v in self.blocks]
def test_toSeriesWithPack(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks("150M").toSeries()
    seriesVals = series.collect()
    seriesAry = series.pack()
    seriesAry_xpose = series.pack(transpose=True)

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((0, 1), seriesVals[2][0])
    assert_equals((1, 1), seriesVals[3][0])
    assert_equals((0, 2), seriesVals[4][0])
    assert_equals((1, 2), seriesVals[5][0])
    assert_equals((0, 3), seriesVals[6][0])
    assert_equals((1, 3), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(image.dims.count, series.dims.count)
    assert_equals(ary.shape, series.dims.count)

    # check that values are in Fortran-convention order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
    assert_true(array_equal(ary.ravel(order="F"), collectedVals))

    # check that packing returns original array
    assert_true(array_equal(ary, seriesAry))
    assert_true(array_equal(ary.T, seriesAry_xpose))
def test_fromStackToSeriesWithPack(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))
    filename = os.path.join(self.outputdir, "test.stack")
    ary.tofile(filename)

    image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))
    strategy = SimpleBlockingStrategy.generateFromBlockSize(image, "150M")
    series = image.toBlocks(strategy).toSeries()
    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0), seriesVals[2][0])
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((1, 1), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple is reversed from numpy shape
    assert_equals(ary.shape[::-1], series.dims.count)

    # check that values are in original order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
    assert_true(array_equal(ary.ravel(), collectedVals))

    # check that packing returns transpose of original array
    assert_true(array_equal(ary.T, seriesAry))
def test_min(self):
    from numpy import minimum
    arys, shape, size = _generateTestArrays(2, "uint8")
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    minVal = imageData.min()
    assert_true(array_equal(reduce(minimum, arys), minVal))
def test_castToFloat(self):
    arys, shape, size = _generate_test_arrays(2, 'uint8')
    imagedata = ImagesLoader(self.sc).fromArrays(arys)
    castdata = imagedata.astype("smallfloat")

    assert_equals('float16', str(castdata.dtype))
    assert_equals('float16', str(castdata.first()[1].dtype))
def test_mean(self):
    from test_utils import elementwiseMean
    arys, shape, size = _generateTestArrays(2, 'uint8')
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    meanVal = imageData.mean()

    expected = elementwiseMean(arys).astype('float16')
    assert_true(allclose(expected, meanVal))
    assert_equals('float64', str(meanVal.dtype))
def _run_tstSaveAsBinarySeries(self, testIdx, narys_, valDtype, groupingDim_):
    """Pseudo-parameterized test fixture, allows reusing existing spark context
    """
    paramStr = "(groupingdim=%d, valuedtype='%s')" % (groupingDim_, valDtype)
    arys, aryShape, arySize = _generateTestArrays(narys_, dtype_=valDtype)
    dims = aryShape[:]
    outdir = os.path.join(self.outputdir, "anotherdir%02d" % testIdx)

    images = ImagesLoader(self.sc).fromArrays(arys)

    slicesPerDim = [1] * arys[0].ndim
    slicesPerDim[groupingDim_] = arys[0].shape[groupingDim_]
    images.toBlocks(slicesPerDim, units="splits").saveAsBinarySeries(outdir)

    ndims = len(aryShape)
    # prevent padding to 4-byte boundaries: "=" specifies no alignment
    unpacker = struct.Struct('=' + 'h' * ndims + dtypeFunc(valDtype).char * narys_)

    def calcExpectedNKeys():
        tmpShape = list(dims[:])
        del tmpShape[groupingDim_]
        return prod(tmpShape)
    expectedNKeys = calcExpectedNKeys()

    def byrec(f_, unpacker_, nkeys_):
        rec = True
        while rec:
            rec = f_.read(unpacker_.size)
            if rec:
                allRecVals = unpacker_.unpack(rec)
                yield allRecVals[:nkeys_], allRecVals[nkeys_:]

    outFilenames = glob.glob(os.path.join(outdir, "*.bin"))
    assert_equals(dims[groupingDim_], len(outFilenames))
    for outFilename in outFilenames:
        with open(outFilename, 'rb') as f:
            nkeys = 0
            for keys, vals in byrec(f, unpacker, ndims):
                nkeys += 1
                assert_equals(narys_, len(vals))
                for valIdx, val in enumerate(vals):
                    assert_equals(arys[valIdx][keys], val,
                                  "Expected %g, got %g, for test %d %s" %
                                  (arys[valIdx][keys], val, testIdx, paramStr))
            assert_equals(expectedNKeys, nkeys)

    confName = os.path.join(outdir, "conf.json")
    assert_true(os.path.isfile(confName))
    with open(os.path.join(outdir, "conf.json"), 'r') as fconf:
        import json
        conf = json.load(fconf)
    assert_equals(outdir, conf['input'])
    assert_equals(len(aryShape), conf['nkeys'])
    assert_equals(narys_, conf['nvalues'])
    assert_equals(valDtype, conf['valuetype'])
    assert_equals('int16', conf['keytype'])

    assert_true(os.path.isfile(os.path.join(outdir, 'SUCCESS')))
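# A minimal illustration of why the Struct format above begins with '=':
# standard sizes with no alignment keep each on-disk record tightly packed,
# whereas native alignment (the default) may insert padding between the int16
# keys and a wider value field. The native size is platform-dependent; the
# helper name here is illustrative.
def _example_structAlignment():
    import struct
    packed = struct.Struct('=hhd')  # 2 int16 keys + 1 float64 value, no padding
    native = struct.Struct('hhd')   # native alignment; often 16 bytes on x86-64
    assert packed.size == 2 + 2 + 8
    return packed.size, native.size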
def test_stdev(self):
    from test_utils import elementwiseStdev
    arys, shape, size = _generateTestArrays(2, 'uint8')
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    stdval = imageData.stdev()

    expected = elementwiseStdev([ary.astype('float16') for ary in arys])
    assert_true(allclose(expected, stdval))
    assert_equals('float64', str(stdval.dtype))
def test_toTimeSeries(self):
    # create 3 arrays of 4x3x3 images (C-order), containing sequential integers
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    series = imageData.toTimeSeries()
    assert isinstance(series, TimeSeries)
def test_toSeries(self):
    # create 3 arrays of 4x3x3 images (C-order), containing sequential integers
    narys = 3
    arys, sh, sz = _generate_test_arrays(narys)

    imagedata = ImagesLoader(self.sc).fromArrays(arys)
    series = imagedata.toSeries(groupingDim=0).collect()

    self.evaluate_series(arys, series, sz)
def test_toSeries(self):
    # create 3 arrays of 4x3x3 images (C-order), containing sequential integers
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    series = imageData.toBlocks((4, 1, 1), units="s").toSeries().collect()

    self.evaluateSeries(arys, series, sz)
def test_variance(self):
    from test_utils import elementwise_var
    arys, shape, size = _generate_test_arrays(2, 'uint8')
    imagedata = ImagesLoader(self.sc).fromArrays(arys)
    varval = imagedata.variance()

    expected = elementwise_var([ary.astype('float16') for ary in arys])
    assert_true(allclose(expected, varval))
    assert_equals('float16', str(varval.dtype))
def test_fromArrays(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)

    collectedImage = image.collect()
    assert_equals(1, len(collectedImage))
    assert_equals(ary.shape, image.dims.count)
    assert_equals(0, collectedImage[0][0])  # check key
    assert_true(array_equal(ary, collectedImage[0][1]))  # check value
def _run_tst_roundtripThroughBlocks(self, strategy):
    imagepath = findSourceTreeDir("utils/data/fish/images")
    images = ImagesLoader(self.sc).fromTif(imagepath)
    blockedimages = images.toBlocks(strategy)
    recombinedimages = blockedimages.toImages()

    collectedimages = images.collect()
    roundtrippedimages = recombinedimages.collect()
    for orig, roundtripped in zip(collectedimages, roundtrippedimages):
        assert_true(array_equal(orig[1], roundtripped[1]))
def test_sum(self):
    from numpy import add
    arys, shape, size = _generateTestArrays(2, 'uint8')
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    sumVal = imageData.sum(dtype='uint32')

    arys = [ary.astype('uint32') for ary in arys]
    expected = reduce(add, arys)
    assert_true(array_equal(expected, sumVal))
    assert_equals('uint32', str(sumVal.dtype))
def test_variance(self):
    from test_utils import elementwiseVar
    arys, shape, size = _generateTestArrays(2, "uint8")
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    varVal = imageData.variance()

    expected = elementwiseVar([ary.astype("float16") for ary in arys])
    assert_true(allclose(expected, varVal))
    assert_equals("float64", str(varVal.dtype))
def loadImagesOCP(self, bucketName, resolution, server="ocp.me", startIdx=None, stopIdx=None,
                  minBound=None, maxBound=None):
    """
    Load Images from OCP (Open Connectome Project).

    The OCP is a web service for access to EM brain images and other neural image data.
    The web service can be accessed at http://www.openconnectomeproject.org/.

    Parameters
    ----------
    bucketName: string
        Token name for the project in OCP. This name should exist on the server from which
        data is loaded.

    resolution: nonnegative int
        Resolution of the data in OCP

    server: string, optional, default = 'ocp.me'
        Name of the OCP server with the specified token.

    startIdx: nonnegative int, optional, default = None
        Convenience parameter to read only a subset of input files. Uses python slice
        conventions (zero-based indexing with exclusive final position).

    stopIdx: nonnegative int, optional
        See startIdx.

    minBound, maxBound: tuple of nonnegative int, optional, default = None
        X, Y, Z bounds of the data to fetch from OCP. minBound contains (xMin, yMin, zMin)
        while maxBound contains (xMax, yMax, zMax).

    Returns
    -------
    data: thunder.rdds.Images
        An Images object, wrapping an RDD of (int, numpy array) pairs
    """
    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(self._sc)

    # check that startIdx is smaller than or equal to stopIdx
    if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
        raise Exception("Error. startIdx {} is larger than stopIdx {}".format(startIdx, stopIdx))

    data = loader.fromOCP(bucketName, resolution=resolution, server=server,
                          startIdx=startIdx, stopIdx=stopIdx,
                          minBound=minBound, maxBound=maxBound)

    return data
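# Usage sketch for loadImagesOCP above. Assumes a ThunderContext `tsc` and a
# token that actually exists on the server; the token name and bounds here are
# illustrative only.
def _example_loadImagesOCP(tsc):
    imgs = tsc.loadImagesOCP('mytoken', resolution=5,
                             minBound=(0, 0, 0), maxBound=(128, 128, 16))
    return imgs.first()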
def test_crop(self):
    dims = (2, 2, 4)
    sz = reduce(lambda x, y: x * y, dims)
    origAry = arange(sz, dtype='int16').reshape(dims)
    imageData = ImagesLoader(self.sc).fromArrays([origAry])
    croppedData = imageData.crop((0, 0, 0), (2, 2, 2))
    crop = croppedData.collect()[0][1]

    expected = squeeze(origAry[slice(0, 2), slice(0, 2), slice(0, 2)])
    assert_true(array_equal(expected, crop))
    assert_equals(tuple(expected.shape), croppedData._dims.count)
    assert_equals(str(expected.dtype), croppedData._dtype)
def test_meanWithSingleRegionIndices3D(self):
    ary1 = array([[[3, 5, 3], [6, 8, 6]], [[3, 5, 3], [6, 8, 6]]], dtype='int32')
    ary2 = array([[[13, 15, 13], [16, 18, 16]], [[13, 15, 13], [16, 18, 16]]], dtype='int32')
    images = ImagesLoader(self.sc).fromArrays([ary1, ary2])

    indices = [[(1, 1, 1), (0, 0, 0)]]  # one region with two indices
    regionMeanImages = images.meanByRegions(indices)
    self.__checkAttrPropagation(regionMeanImages, (1, 1))
    collected = regionMeanImages.collect()
    # check values
    assert_equals(5, collected[0][1][0])
    assert_equals(15, collected[1][1][0])
def test_subtract(self):
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)
    subVals = [1, arange(sz, dtype=dtypeFunc("int16")).reshape(sh)]

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    for subVal in subVals:
        subData = imageData.subtract(subVal)
        subtracted = subData.collect()
        expectedArys = map(lambda ary: ary - subVal, arys)
        for actual, expected in zip(subtracted, expectedArys):
            assert_true(allclose(expected, actual[1]))
def test_toBlocksWithSplit(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)
    groupedblocks = image.toBlocks((1, 2), units="s")

    collectedgroupedblocks = groupedblocks.collect()
    assert_equals((0, 0), collectedgroupedblocks[0][0].spatialKey)
    assert_true(array_equal(ary[:, :2].ravel(), collectedgroupedblocks[0][1].ravel()))
    assert_equals((0, 2), collectedgroupedblocks[1][0].spatialKey)
    assert_true(array_equal(ary[:, 2:].ravel(), collectedgroupedblocks[1][1].ravel()))
def test_stats(self):
    from test_utils import elementwiseMean, elementwiseVar
    arys, shape, size = _generateTestArrays(2, 'uint8')
    imageData = ImagesLoader(self.sc).fromArrays(arys)
    statsval = imageData.stats()

    floatarys = [ary.astype('float16') for ary in arys]
    # StatsCounter contains a few different measures, only test a couple:
    expectedMean = elementwiseMean(floatarys)
    expectedVar = elementwiseVar(floatarys)
    assert_true(allclose(expectedMean, statsval.mean()))
    assert_true(allclose(expectedVar, statsval.variance()))
def test_planes(self):
    dims = (2, 2, 4)
    sz = reduce(lambda x, y: x * y, dims)
    origAry = arange(sz, dtype=dtypeFunc("int16")).reshape(dims)
    imageData = ImagesLoader(self.sc).fromArrays([origAry])
    planedData = imageData.planes(0, 2)
    planed = planedData.collect()[0][1]

    expected = squeeze(origAry[slice(None), slice(None), slice(0, 2)])
    assert_true(array_equal(expected, planed))
    assert_equals(tuple(expected.shape), planedData._dims.count)
    assert_equals(str(expected.dtype), planedData._dtype)
def test_crosscorrVolume(self):
    random.seed(42)
    ref = random.randn(25, 25, 3)
    im = shift(ref, [2, -2, 0], mode='constant', order=0)
    imIn = ImagesLoader(self.sc).fromArrays(im)

    # use 3D cross correlation
    paramOut = Registration('crosscorr').prepare(ref).fit(imIn).transformations[0].delta
    imOut = Registration('crosscorr').prepare(ref).run(imIn).first()[1]
    assert_true(allclose(paramOut, [2, -2, 0]))
    assert_true(allclose(ref[:-2, 2:, :], imOut[:-2, 2:, :]))

    # use 2D cross correlation on each plane
    paramOut = Registration('planarcrosscorr').prepare(ref).fit(imIn).transformations[0].delta
    imOut = Registration('planarcrosscorr').prepare(ref).run(imIn).first()[1]
    assert_true(allclose(paramOut, [[2, -2], [2, -2], [2, -2]]))
    assert_true(allclose(ref[:-2, 2:, :], imOut[:-2, 2:, :]))
def test_saveAsBinarySeries(self):
    narys = 3
    arys, aryShape, _ = _generateTestArrays(narys)

    outdir = os.path.join(self.outputdir, "anotherdir")
    os.mkdir(outdir)
    assert_raises(ValueError,
                  ImagesLoader(self.sc).fromArrays(arys).toBlocks((1, 1, 1), units="s").saveAsBinarySeries,
                  outdir)

    groupingDims = xrange(len(aryShape))
    dtypes = ('int16', 'int32', 'float32')
    paramIters = itertools.product(groupingDims, dtypes)

    for idx, params in enumerate(paramIters):
        gd, dt = params
        self._run_tstSaveAsBinarySeries(idx, narys, dt, gd)
def _run_tst_filter(self, dataFunc, filterFunc):
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)
    sigma = 2

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    filteredData = dataFunc(imageData, sigma)
    filtered = filteredData.collect()
    expectedArys = map(lambda ary: TestImagesMethods._run_filter(ary, filterFunc, sigma), arys)
    for actual, expected in zip(filtered, expectedArys):
        assert_true(allclose(expected, actual[1]))

    assert_equals(tuple(expectedArys[0].shape), filtered[0][1].shape)
    assert_equals(tuple(expectedArys[0].shape), filteredData._dims.count)
    assert_equals(str(arys[0].dtype), str(filtered[0][1].dtype))
    assert_equals(str(filtered[0][1].dtype), filteredData._dtype)
def test_reference_2d(self):
    random.seed(42)
    im0 = random.random_integers(0, high=127, size=(25, 25)).astype('uint16')
    im1 = random.random_integers(0, high=127, size=(25, 25)).astype('uint16')
    im2 = random.random_integers(0, high=127, size=(25, 25)).astype('uint16')
    imgIn = ImagesLoader(self.sc).fromArrays([im0, im1, im2])

    ref = Register.reference(imgIn)
    assert (allclose(ref, (im0 + im1 + im2) / 3))

    ref = Register.reference(imgIn, startIdx=0, stopIdx=2)
    assert (allclose(ref, (im0 + im1) / 2))

    ref = Register.reference(imgIn, startIdx=1, stopIdx=2)
    assert (allclose(ref, im1))
def test_reference_2d(self):
    # test default reference calculation in 2D
    random.seed(42)
    im0 = random.rand(25, 25).astype('float')
    im1 = random.rand(25, 25).astype('float')
    im2 = random.rand(25, 25).astype('float')
    imin = ImagesLoader(self.sc).fromArrays([im0, im1, im2])

    reg = Registration('crosscorr').prepare(imin)
    assert (allclose(reg.reference, (im0 + im1 + im2) / 3))

    reg = Registration('crosscorr').prepare(imin, startidx=0, stopidx=2)
    assert (allclose(reg.reference, (im0 + im1) / 2))

    reg = Registration('crosscorr').prepare(imin, startidx=1, stopidx=2)
    assert (allclose(reg.reference, im1))
def _run_tst_multitif(self, filename, expectedDtype):
    imagePath = os.path.join(self.testResourcesDir, "multilayer_tif", filename)
    tiffImages = ImagesLoader(self.sc).fromTif(imagePath).collect()

    expectedNum = 1
    expectedShape = (70, 75, 3)  # 3 concatenated pages, each with single luminance channel
    # 3 images have increasing #s of black dots, so lower luminance overall
    expectedSums = [1140006, 1119161, 1098917]
    expectedKey = 0

    assert_equals(expectedNum, len(tiffImages),
                  "Expected %d images, got %d" % (expectedNum, len(tiffImages)))
    tiffImage = tiffImages[0]
    assert_equals(expectedKey, tiffImage[0],
                  "Expected key %s, got %s" % (str(expectedKey), str(tiffImage[0])))
    assert_true(isinstance(tiffImage[1], ndarray),
                "Value type error; expected image value to be numpy ndarray, was " +
                str(type(tiffImage[1])))
    assert_equals(expectedDtype, str(tiffImage[1].dtype))
    assert_equals(expectedShape, tiffImage[1].shape)
    for channelidx in xrange(0, expectedShape[2]):
        assert_equals(expectedSums[channelidx],
                      tiffImage[1][:, :, channelidx].flatten().sum())
def test_fromMultipageTif(self):
    imagepath = os.path.join(self.testresourcesdir, "multilayer_tif", "dotdotdot_lzw.tif")
    tifimages = ImagesLoader(self.sc).fromMultipageTif(imagepath).collect()

    expectednum = 1
    expectedshape = (70, 75, 3)  # 3 concatenated pages, each with single luminance channel
    # 3 images have increasing #s of black dots, so lower luminance overall
    expectedsums = [1140006, 1119161, 1098917]
    expectedkey = 0

    assert_equals(expectednum, len(tifimages),
                  "Expected %d images, got %d" % (expectednum, len(tifimages)))
    tifimage = tifimages[0]
    assert_equals(expectedkey, tifimage[0],
                  "Expected key %s, got %s" % (str(expectedkey), str(tifimage[0])))
    assert_true(isinstance(tifimage[1], ndarray),
                "Value type error; expected image value to be numpy ndarray, was " +
                str(type(tifimage[1])))
    assert_equals('uint8', str(tifimage[1].dtype))
    assert_equals(expectedshape, tifimage[1].shape)
    for channelidx in xrange(0, expectedshape[2]):
        assert_equals(expectedsums[channelidx],
                      tifimage[1][:, :, channelidx].flatten().sum())
def test_save_load(self):
    # test basic saving and loading functionality;
    # new registration methods should add tests for loading and saving
    random.seed(42)
    ref = random.randn(25, 25)
    im = shift(ref, [2, 0], mode='constant', order=0)
    imin = ImagesLoader(self.sc).fromArrays(im)
    reg = Registration('crosscorr')
    reg.prepare(ref)
    model1 = reg.fit(imin)

    t = tempfile.mkdtemp()
    model1.save(t + '/test.json')
    model2 = Registration.load(t + '/test.json')

    out1 = model1.transform(imin).first()[1]
    out2 = model2.transform(imin).first()[1]
    assert (allclose(out1, out2))
def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None,
               startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None,
               renumber=False, confFilename='conf.json'):
    """
    Loads an Images object from data stored as a binary image stack, tif, or png files.

    Supports single files or multiple files, stored on a local file system, a networked file
    system (mounted and available on all nodes), or Amazon S3. HDFS is not currently
    supported for image file data.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, as either a local filesystem path or a URI.
        May include a single '*' wildcard in the filename. Examples of valid dataPaths
        include 'local/directory/*.stack', 's3n:///my-s3-bucket/data/', or
        'file:///mnt/another/directory/'.

    dims: tuple of positive int, optional (required if inputFormat is 'stack')
        Image dimensions. Binary stack data will be interpreted as a multidimensional array
        with the given dimensions, and should be stored in row-major order (Fortran or
        Matlab convention), where the first dimension changes most rapidly. For 'png' or
        'tif' data, dimensions will be read from the image file headers.

    inputFormat: str, optional, default = 'stack'
        Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat
        binary stacks. 'png' or 'tif' indicate image formats. Pages of a multipage tif file
        will be extended along the third dimension. Separate files are interpreted as
        distinct records, with ordering given by lexicographic sorting of file names.

    ext: string, optional, default = None
        File extension; default will be 'bin' if inputFormat=='stack', 'tif' if
        inputFormat=='tif', and 'png' if inputFormat=='png'.

    dtype: string or numpy dtype, optional, default = 'int16'
        Data type of the image files to be loaded, specified as a numpy "dtype" string.
        Ignored for 'tif' or 'png' (data type will be inferred from the image formats).

    startIdx: nonnegative int, optional, default = None
        Convenience parameter to read only a subset of input files. Uses python slice
        conventions (zero-based indexing with exclusive final position). startIdx and
        stopIdx give the starting and final index after lexicographic sorting.

    stopIdx: nonnegative int, optional, default = None
        See startIdx.

    recursive: boolean, optional, default = False
        If true, will recursively descend directories rooted at dataPath, loading all files
        in the tree with an appropriate extension.

    nplanes: positive integer, optional, default = None
        Subdivide individual image files. Every `nplanes` planes from each file will be
        considered a new record. With nplanes=None (the default), a single file will be
        considered as representing a single record. If the number of records per file is
        not the same across all files, then `renumber` should be set to True to ensure
        consistent keys.

    npartitions: positive int, optional, default = None
        Specify number of partitions for the RDD; if unspecified, one partition will be
        used per image.

    renumber: boolean, optional, default = False
        Recalculate keys for records after images are loaded. Only necessary if different
        files contain different numbers of records (e.g. due to specifying nplanes).
        See Images.renumber().

    confFilename : string, optional, default = 'conf.json'
        Name of conf file, if used, to specify parameters for binary stack data.

    Returns
    -------
    data: thunder.rdds.Images
        An Images object, wrapping an RDD of (int, numpy array) pairs
    """
    checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(self._sc)

    # check that startIdx is smaller than or equal to stopIdx
    if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
        raise Exception("Error. startIdx {} is larger than stopIdx {}".format(startIdx, stopIdx))

    if not ext:
        ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

    if inputFormat.lower() == 'stack':
        data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx,
                                stopIdx=stopIdx, recursive=recursive, nplanes=nplanes,
                                npartitions=npartitions, confFilename=confFilename)
    elif inputFormat.lower().startswith('tif'):
        data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                              recursive=recursive, nplanes=nplanes, npartitions=npartitions)
    else:
        if nplanes:
            raise NotImplementedError("nplanes argument is not supported for png files")
        data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                              recursive=recursive, npartitions=npartitions)

    if not renumber:
        return data
    else:
        return data.renumber()
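# Usage sketch for loadImages above. Assumes a ThunderContext `tsc` and data at
# the paths shown; paths, dims, and dtype are illustrative.
def _example_loadImages(tsc):
    # flat binary stack: dims are required and use row-major (Fortran) ordering
    stack = tsc.loadImages('/mnt/data/stacks/*.bin', dims=(512, 512, 4), dtype='uint16')
    # tif: dims and dtype are read from the file headers; nplanes subdivides files
    tifs = tsc.loadImages('/mnt/data/tifs/', inputFormat='tif', nplanes=2, renumber=True)
    return stack, tifs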
def convertImagesToSeries(self, datapath, outputdirpath, dims=None, inputformat='stack',
                          dtype='int16', blocksize="150M", startidx=None, stopidx=None,
                          shuffle=False, overwrite=False):
    """
    Write out Images data as Series data, saved in a flat binary format.

    The resulting Series data files may subsequently be read in using the loadSeries()
    method. The Series data object that results will be equivalent to that which would be
    generated by loadImagesAsSeries(). It is expected that loading Series data directly
    from the series flat binary format, using loadSeries(), will be faster than converting
    image data to a Series object through loadImagesAsSeries().

    Parameters
    ----------
    datapath: string
        Path to data files or directory, specified as either a local filesystem path or in
        a URI-like format, including scheme. A datapath argument may include a single '*'
        wildcard character in the filename. Examples of valid datapaths include
        'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
        '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

    outputdirpath: string
        Path to a directory into which to write Series file output. An outputdir argument
        may be either a path on the local file system or a URI-like format, as in datapath.
        Examples of valid outputdirpaths include 'a/relative/directory/',
        's3n:///my-s3-bucket/data/myoutput/', or 'file:///mnt/a/new/directory/'. If the
        directory specified by outputdirpath already exists and the 'overwrite' parameter
        is False, this method will throw a ValueError. If the directory exists and
        'overwrite' is True, the existing directory and all its contents will be deleted
        and overwritten.

    dims: tuple of positive int, optional (but required if inputformat is 'stack')
        Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data
        will be interpreted as coming from a multidimensional array of the specified
        dimensions. The first dimension of the passed dims tuple should be the one that is
        changing most rapidly on disk. So for instance given dims of (x, y, z), the
        coordinates of the data in a binary stack file should be ordered as
        [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
        (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention
        from that used by numpy, which by default has the fastest-changing dimension listed
        last (column-major convention). Thus, if loading a numpy array `ary`, where
        `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the
        corresponding dims parameter should be (x, y, z). If inputformat is 'tif-stack',
        the dims parameter (if any) will be ignored; data dimensions will instead be read
        out from the tif file headers.

    inputformat: {'stack', 'tif-stack'}, optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of raw binary data,
        while 'tif-stack' indicates a sequence of multipage tif files, with each page of
        the tif corresponding to a separate z-plane. For both stacks and tif stacks,
        separate files are interpreted as distinct time points, with ordering given by
        lexicographic sorting of file names. This method assumes that stack data consists
        of signed 16-bit integers in native byte order. The lower-level API method
        SeriesLoader.saveFromStack() allows alternative data types to be read in.

    dtype: string or numpy dtype, optional, default 'int16'
        Data type of the image files to be loaded, specified as a numpy "dtype" string. If
        inputformat is 'tif-stack', the dtype parameter (if any) will be ignored; data type
        will instead be read out from the tif headers.

    blocksize: string formatted as e.g. "64M", "512k", "2G", or positive int, optional, default "150M"
        Requested size of individual output files in bytes (or kilobytes, megabytes,
        gigabytes). This parameter also indirectly controls the number of Spark partitions
        to be used, with one partition used per block created.

    startidx: nonnegative int, optional
        startidx and stopidx are convenience parameters to allow only a subset of input
        files to be read in. These parameters give the starting index (inclusive) and final
        index (exclusive) of the data files to be used after lexicographically sorting all
        input data files matching the datapath argument. For example, startidx=None (the
        default) and stopidx=10 will cause only the first 10 data files in datapath to be
        read in; startidx=2 and stopidx=3 will cause only the third file (zero-based index
        of 2) to be read in. startidx and stopidx use the python slice indexing convention
        (zero-based indexing with an exclusive final position).

    stopidx: nonnegative int, optional
        See startidx.

    shuffle: boolean, optional, default False
        Controls whether the conversion from Images to Series formats will make use of a
        Spark shuffle-based method. The default at present is not to use a shuffle. The
        shuffle-based method may lead to higher performance in some cases, but the default
        method appears to be more stable with larger data set sizes. This default may
        change in future releases.

    overwrite: boolean, optional, default False
        If true, the directory specified by outputdirpath will first be deleted, along with
        all its contents, if it already exists. (Use with caution.) If false, a ValueError
        will be thrown if outputdirpath is found to already exist.
    """
    checkparams(inputformat, ['stack', 'tif-stack'])

    if inputformat.lower() == 'stack' and not dims:
        raise ValueError("Dimensions ('dims' parameter) must be specified if loading from" +
                         " binary image stack ('stack' value for 'inputformat' parameter)")

    if shuffle:
        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        if inputformat.lower() == 'stack':
            loader.fromStack(datapath, dims, dtype=dtype, startidx=startidx, stopidx=stopidx) \
                .saveAsBinarySeries(outputdirpath, blockSize=blocksize, overwrite=overwrite)
        else:
            loader.fromMultipageTif(datapath, startidx=startidx, stopidx=stopidx) \
                .saveAsBinarySeries(outputdirpath, blockSize=blocksize, overwrite=overwrite)
    else:
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc)
        if inputformat.lower() == 'stack':
            loader.saveFromStack(datapath, outputdirpath, dims, datatype=dtype,
                                 blockSize=blocksize, overwrite=overwrite,
                                 startidx=startidx, stopidx=stopidx)
        else:
            loader.saveFromMultipageTif(datapath, outputdirpath, blockSize=blocksize,
                                        startidx=startidx, stopidx=stopidx,
                                        overwrite=overwrite)
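# Usage sketch for convertImagesToSeries above. Assumes a ThunderContext `tsc`;
# the paths and dims are illustrative. The written output can then be read back
# with loadSeries.
def _example_convertImagesToSeries(tsc):
    tsc.convertImagesToSeries('/mnt/data/stacks/*.stack', '/mnt/data/series-out/',
                              dims=(1024, 1024, 48), blocksize='150M', overwrite=True)
    return tsc.loadSeries('/mnt/data/series-out/')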
def test_max(self):
    from numpy import maximum
    arys, shape, size = _generate_test_arrays(2, 'uint8')
    imagedata = ImagesLoader(self.sc).fromArrays(arys)
    maxval = imagedata.max()
    assert_true(array_equal(reduce(maximum, arys), maxval))
def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None,
                       dtype='int16', blockSize="150M", blockSizeUnits="pixels",
                       startIdx=None, stopIdx=None, shuffle=True, recursive=False):
    """
    Load Images data as Series data.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local filesystem path or in
        a URI-like format, including scheme. A dataPath argument may include a single '*'
        wildcard character in the filename. Examples of valid dataPaths include
        'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
        '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

    dims: tuple of positive int, optional (but required if inputFormat is 'stack')
        Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data
        will be interpreted as coming from a multidimensional array of the specified
        dimensions. The first dimension of the passed dims tuple should be the one that is
        changing most rapidly on disk. So for instance given dims of (x, y, z), the
        coordinates of the data in a binary stack file should be ordered as
        [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
        (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention
        from that used by numpy, which by default has the fastest-changing dimension listed
        last (column-major convention). Thus, if loading a numpy array `ary`, where
        `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the
        corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims
        parameter (if any) will be ignored; data dimensions will instead be read out from
        the tif file headers.

    inputFormat: {'stack', 'tif'}, optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of raw binary data,
        while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage
        tif file will be interpreted as a separate z-plane. For both stacks and tif stacks,
        separate files are interpreted as distinct time points, with ordering given by
        lexicographic sorting of file names.

    ext: string, optional, default None
        Extension required on data files to be loaded. By default will be "stack" if
        inputFormat=="stack", "tif" for inputFormat=='tif'.

    dtype: string or numpy dtype, optional, default 'int16'
        Data type of the image files to be loaded, specified as a numpy "dtype" string. If
        inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will
        instead be read out from the tif headers.

    blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, optional, default "150M"
        Requested size of individual output files in bytes (or kilobytes, megabytes,
        gigabytes). If shuffle=True, blockSize can also be a tuple of int specifying either
        the number of pixels or of splits per dimension to apply to the loaded images, or
        an instance of BlockingStrategy. Whether a tuple of int is interpreted as pixels or
        as splits depends on the value of the blockSizeUnits parameter. blockSize also
        indirectly controls the number of Spark partitions to be used, with one partition
        used per block created.

    blockSizeUnits: string, either "pixels" or "splits" (or a unique prefix of each, such as "s"), default "pixels"
        Specifies units to be used in interpreting a tuple passed as blockSize when
        shuffle=True. If a string or a BlockingStrategy instance is passed as blockSize, or
        if shuffle=False, this parameter has no effect.

    startIdx: nonnegative int, optional
        startIdx and stopIdx are convenience parameters to allow only a subset of input
        files to be read in. These parameters give the starting index (inclusive) and final
        index (exclusive) of the data files to be used after lexicographically sorting all
        input data files matching the dataPath argument. For example, startIdx=None (the
        default) and stopIdx=10 will cause only the first 10 data files in dataPath to be
        read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index
        of 2) to be read in. startIdx and stopIdx use the python slice indexing convention
        (zero-based indexing with an exclusive final position).

    stopIdx: nonnegative int, optional
        See startIdx.

    shuffle: boolean, optional, default True
        Controls whether the conversion from Images to Series formats will make use of a
        Spark shuffle-based method.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath, loading all files
        in the tree that have an appropriate extension. Recursive loading is currently only
        implemented for local filesystems (not s3), and only with shuffle=True.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of timeseries data generated from
        the images in dataPath. This RDD will have as keys an n-tuple of int, with n given
        by the dimensionality of the original images. The keys will be the zero-based
        spatial index of the timeseries data in the RDD value. The value will be a numpy
        array of length equal to the number of image files loaded. Each loaded image file
        will contribute one point to this value array, with ordering as implied by the
        lexicographic ordering of image file names.
    """
    checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

    if inputFormat.lower() == 'stack' and not dims:
        raise ValueError("Dimensions ('dims' parameter) must be specified if loading from" +
                         " binary image stack ('stack' value for 'inputFormat' parameter)")

    if not ext:
        ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

    if shuffle:
        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        if inputFormat.lower() == 'stack':
            images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext,
                                      startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        else:
            # tif / tif stack
            images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx,
                                    stopIdx=stopIdx, recursive=recursive)
        return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()
    else:
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc)
        if inputFormat.lower() == 'stack':
            return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype,
                                    blockSize=blockSize, startIdx=startIdx,
                                    stopIdx=stopIdx, recursive=recursive)
        else:
            # tif / tif stack
            return loader.fromTif(dataPath, ext=ext, blockSize=blockSize,
                                  startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
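# Usage sketch for loadImagesAsSeries above. Assumes a ThunderContext `tsc`;
# the path and dims are illustrative. With shuffle=True (the default),
# blockSize may be a tuple interpreted according to blockSizeUnits.
def _example_loadImagesAsSeries(tsc):
    series = tsc.loadImagesAsSeries('/mnt/data/stacks/', dims=(1024, 1024, 48),
                                    blockSize=(256, 256, 48), blockSizeUnits='pixels')
    return series.first()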
def setUp(self):
    super(TestImagesMeanByRegions, self).setUp()
    self.ary1 = array([[3, 5], [6, 8]], dtype='int32')
    self.ary2 = array([[13, 15], [16, 18]], dtype='int32')
    self.images = ImagesLoader(self.sc).fromArrays([self.ary1, self.ary2])
def loadImages(self, datapath, dims=None, inputformat='stack', dtype='int16',
               startidx=None, stopidx=None):
    """
    Loads an Images object from data stored as a binary image stack, tif, tif-stack,
    or png files.

    Supports single files or multiple files, stored on a local file system, a networked
    file system (mounted and available on all nodes), or Amazon S3. HDFS is not currently
    supported for image file data.

    Parameters
    ----------
    datapath: string
        Path to data files or directory, specified as either a local filesystem path or in
        a URI-like format, including scheme. A datapath argument may include a single '*'
        wildcard character in the filename. Examples of valid datapaths include
        'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
        '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

    dims: tuple of positive int, optional (but required if inputformat is 'stack')
        Dimensions of input image data, similar to a numpy 'shape' parameter, for instance
        (1024, 1024, 48). Binary stack data will be interpreted as coming from a
        multidimensional array of the specified dimensions. Stack data should be stored in
        row-major order (Fortran or Matlab convention) rather than column-major order (C or
        python/numpy convention), where the first dimension corresponds to that which is
        changing most rapidly on disk. So for instance given dims of (x, y, z), the
        coordinates of the data in a binary stack file should be ordered as
        [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
        (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. If inputformat is 'png', 'tif', or
        'tif-stack', the dims parameter (if any) will be ignored; data dimensions will
        instead be read out from the image file headers.

    inputformat: {'stack', 'png', 'tif', 'tif-stack'}, optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of raw binary data.
        'png' or 'tif' indicate two-dimensional image files of the corresponding formats.
        'tif-stack' indicates a sequence of multipage tif files, with each page of the tif
        corresponding to a separate z-plane. For all formats, separate files are
        interpreted as distinct time points, with ordering given by lexicographic sorting
        of file names. This method assumes that stack data consists of signed 16-bit
        integers in native byte order. Data types of image file data will be as specified
        in the file headers.

    dtype: string or numpy dtype, optional, default 'int16'
        Data type of the image files to be loaded, specified as a numpy "dtype" string. If
        inputformat is 'tif-stack', the dtype parameter (if any) will be ignored; data type
        will instead be read out from the tif headers.

    startidx: nonnegative int, optional
        startidx and stopidx are convenience parameters to allow only a subset of input
        files to be read in. These parameters give the starting index (inclusive) and final
        index (exclusive) of the data files to be used after lexicographically sorting all
        input data files matching the datapath argument. For example, startidx=None (the
        default) and stopidx=10 will cause only the first 10 data files in datapath to be
        read in; startidx=2 and stopidx=3 will cause only the third file (zero-based index
        of 2) to be read in. startidx and stopidx use the python slice indexing convention
        (zero-based indexing with an exclusive final position).

    stopidx: nonnegative int, optional
        See startidx.

    Returns
    -------
    data: thunder.rdds.Images
        A newly-created Images object, wrapping an RDD of <int index, numpy array>
        key-value pairs.
    """
    checkparams(inputformat, ['stack', 'png', 'tif', 'tif-stack'])

    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(self._sc)

    if inputformat.lower() == 'stack':
        data = loader.fromStack(datapath, dims, dtype=dtype, startidx=startidx, stopidx=stopidx)
    elif inputformat.lower() == 'tif':
        data = loader.fromTif(datapath, startidx=startidx, stopidx=stopidx)
    elif inputformat.lower() == 'tif-stack':
        data = loader.fromMultipageTif(datapath, startidx=startidx, stopidx=stopidx)
    else:
        data = loader.fromPng(datapath)

    return data
def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None,
               startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None,
               renumber=False, confFilename='conf.json'):
    """
    Loads an Images object from data stored as a binary image stack, tif, or png files.

    Supports single files or multiple files, stored on a local file system, a networked
    file system (mounted and available on all nodes), or Amazon S3. HDFS is not currently
    supported for image file data.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local filesystem path or in
        a URI-like format, including scheme. A dataPath argument may include a single '*'
        wildcard character in the filename. Examples of valid dataPaths include
        'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
        '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

    dims: tuple of positive int, optional (but required if inputFormat is 'stack')
        Dimensions of input image data, similar to a numpy 'shape' parameter, for instance
        (1024, 1024, 48). Binary stack data will be interpreted as coming from a
        multidimensional array of the specified dimensions. Stack data should be stored in
        row-major order (Fortran or Matlab convention) rather than column-major order (C or
        python/numpy convention), where the first dimension corresponds to that which is
        changing most rapidly on disk. So for instance given dims of (x, y, z), the
        coordinates of the data in a binary stack file should be ordered as
        [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
        (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. If inputFormat is 'png' or 'tif',
        the dims parameter (if any) will be ignored; data dimensions will instead be read
        out from the image file headers.

    inputFormat: {'stack', 'png', 'tif'}, optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of raw binary data.
        'png' or 'tif' indicate image files of the corresponding formats. Each page of a
        multipage tif file will be interpreted as a separate z-plane. For all formats,
        separate files are interpreted as distinct time points, with ordering given by
        lexicographic sorting of file names.

    ext: string, optional, default None
        Extension required on data files to be loaded. By default will be "stack" if
        inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for
        inputFormat=="png".

    dtype: string or numpy dtype, optional, default 'int16'
        Data type of the image files to be loaded, specified as a numpy "dtype" string. If
        inputFormat is 'tif' or 'png', the dtype parameter (if any) will be ignored; data
        type will instead be read out from the file headers.

    startIdx: nonnegative int, optional
        startIdx and stopIdx are convenience parameters to allow only a subset of input
        files to be read in. These parameters give the starting index (inclusive) and final
        index (exclusive) of the data files to be used after lexicographically sorting all
        input data files matching the dataPath argument. For example, startIdx=None (the
        default) and stopIdx=10 will cause only the first 10 data files in dataPath to be
        read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index
        of 2) to be read in. startIdx and stopIdx use the python slice indexing convention
        (zero-based indexing with an exclusive final position).

    stopIdx: nonnegative int, optional
        See startIdx.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath, loading all files
        in the tree that have an appropriate extension. Recursive loading is currently only
        implemented for local filesystems (not s3).

    nplanes: positive integer, default None
        If passed, will cause a single image file to be subdivided into multiple records.
        Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a
        new record, with the first nplanes planes of the first file being record 0, the
        second nplanes planes being record 1, and so on, until the first file is exhausted
        and record ordering continues with the first nplanes planes of the second file.
        With nplanes=None (the default), a single file will be considered as representing a
        single record. Keys are calculated assuming that all input files contain the same
        number of records; if the number of records per file is not the same across all
        files, then `renumber` should be set to True to ensure consistent keys.

    npartitions: positive int, optional
        If specified, request a certain number of partitions for the underlying Spark RDD.
        Default is 1 partition per image file.

    renumber: boolean, optional, default False
        If renumber evaluates to True, then the keys for each record will be explicitly
        recalculated after all images are loaded. This should only be necessary at load
        time when different files contain different numbers of records. See
        Images.renumber().

    Returns
    -------
    data: thunder.rdds.Images
        A newly-created Images object, wrapping an RDD of <int index, numpy array>
        key-value pairs.
    """
    checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(self._sc)

    if not ext:
        ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

    if inputFormat.lower() == 'stack':
        data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx,
                                stopIdx=stopIdx, recursive=recursive, nplanes=nplanes,
                                npartitions=npartitions, confFilename=confFilename)
    elif inputFormat.lower().startswith('tif'):
        data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                              recursive=recursive, nplanes=nplanes, npartitions=npartitions)
    else:
        if nplanes:
            raise NotImplementedError("nplanes argument is not supported for png files")
        data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                              recursive=recursive, npartitions=npartitions)

    if not renumber:
        return data
    else:
        return data.renumber()
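# Usage sketch for the png branch above. Assumes a ThunderContext `tsc` and a
# directory of png files; the path is illustrative. Note that nplanes is
# rejected for png input.
def _example_loadImagesPng(tsc):
    images = tsc.loadImages('/mnt/data/pngs/', inputFormat='png', npartitions=8)
    return images.nrecords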
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Write out Images data as Series data, saved in a flat binary format. The resulting Series data files may subsequently be read in using the loadSeries() method. The Series data object that results will be equivalent to that which would be generated by loadImagesAsSeries(). It is expected that loading Series data directly from the series flat binary format, using loadSeries(), will be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". outputDirPath: string Path to a directory into which to write Series file output. An outputdir argument may be either a path on the local file system or a URI-like format, as in dataPath. Examples of valid outputDirPaths include "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/". If the directory specified by outputDirPath already exists and the 'overwrite' parameter is False, this method will throw a ValueError. If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. The first dimension of the passed dims tuple should be the one that is changing most rapidly on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy, which by default has the fastest-changing dimension listed last (column-major convention). Thus, if loading a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the tif file headers. inputFormat: {'stack', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif'. dtype: string or numpy dtype. 
optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int, or instance of BlockingStrategy, optional, default "150M" Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). blockSize can also be an instance of BlockingStrategy, or a tuple of int specifying either the number of pixels or of splits per dimension to apply to the loaded images. Whether a tuple of int is interpreted as pixels or as splits depends on the value of the blockSizeUnits parameter. This parameter also indirectly controls the number of Spark partitions to be used, with one partition used per block created. blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels" Specifies units to be used in interpreting a tuple passed as blockSize when shuffle=True. If a string or a BlockingStrategy instance is passed as blockSize, or if shuffle=False, this parameter has no effect. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). stopIdx: nonnegative int, optional See startIdx. shuffle: boolean, optional, default True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will first be deleted, along with all its contents, if it already exists. (Use with caution.) If false, a ValueError will be thrown if outputDirPath is found to already exist. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3), and only with shuffle=True. nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first `nplanes` planes of the first file being record 0, the second `nplanes` planes being record 1, and so on, until the first file is exhausted and record ordering continues with the first `nplanes` planes of the second file. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for shuffle=True (the default).
npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. Only applies when shuffle=True. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different numbers of records. renumber is only supported for shuffle=True (the default). See Images.renumber(). """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if inputFormat.lower() == 'stack' and not dims: raise ValueError( "Dimensions ('dims' parameter) must be specified if loading from binary image stack" + " ('stack' value for 'inputFormat' parameter)") if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries( outputDirPath, overwrite=overwrite) else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError( "nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError( "npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError( "renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, overwrite=overwrite, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # 'tif' or 'tif-stack' loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite, recursive=recursive)
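For concreteness, here is a minimal sketch of a conversion call using a per-dimension splits tuple rather than a byte-size string, per the blockSize and blockSizeUnits documentation above; `tsc` and all paths are hypothetical:

# split each (512, 512, 4) stack into 2 splits along x, 2 along y, 1 along z,
# i.e. 4 blocks per image, with one Spark partition per block
tsc.convertImagesToSeries("/mnt/data/stacks/", "/mnt/data/series-out/",
                          dims=(512, 512, 4), inputFormat='stack', dtype='uint16',
                          blockSize=(2, 2, 1), blockSizeUnits="splits",
                          overwrite=True)  # deletes any existing output directory first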
def test_roundtripConvertToSeries(self): imagepath = findSourceTreeDir("utils/data/fish/images") images = ImagesLoader(self.sc).fromTif(imagepath) strategy = SimpleBlockingStrategy.generateFromBlockSize(images, blockSize=76 * 20) self._run_tst_roundtripConvertToSeries(images, strategy)
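The test above constructs an explicit blocking strategy instead of passing a size string. A sketch of the same pattern follows; the module path in the import is an assumption, and passing a strategy instance to toBlocks() relies on the blockSize documentation above, which allows a BlockingStrategy in place of a size:

from thunder.rdds.imgblocks.strategy import SimpleBlockingStrategy  # assumed module path

# cap each block at roughly 76 * 20 bytes, mirroring the test above
strategy = SimpleBlockingStrategy.generateFromBlockSize(images, blockSize=76 * 20)
series = images.toBlocks(strategy).toSeries()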
class TestImagesMeanByRegions(PySparkTestCase): def setUp(self): super(TestImagesMeanByRegions, self).setUp() self.ary1 = array([[3, 5], [6, 8]], dtype='int32') self.ary2 = array([[13, 15], [16, 18]], dtype='int32') self.images = ImagesLoader(self.sc).fromArrays([self.ary1, self.ary2]) def __checkAttrPropagation(self, newImages, newDims): assert_equals(newDims, newImages._dims.count) assert_equals(self.images._nrecords, newImages._nrecords) assert_equals(self.images._dtype, newImages._dtype) def test_badMaskShapeThrowsValueError(self): mask = array([[1]], dtype='int16') assert_raises(ValueError, self.images.meanByRegions, mask) def test_meanWithFloatMask(self): mask = array([[1.0, 0.0], [0.0, 1.0]], dtype='float32') regionMeanImages = self.images.meanByRegions(mask) self.__checkAttrPropagation(regionMeanImages, (1, 1)) collected = regionMeanImages.collect() assert_equals(2, len(collected)) assert_equals((1, 1), collected[0][1].shape) # check keys assert_equals(0, collected[0][0]) assert_equals(1, collected[1][0]) # check values assert_equals(5, collected[0][1][0]) assert_equals(15, collected[1][1][0]) def test_meanWithIntMask(self): mask = array([[1, 0], [2, 1]], dtype='uint8') regionMeanImages = self.images.meanByRegions(mask) self.__checkAttrPropagation(regionMeanImages, (1, 2)) collected = regionMeanImages.collect() assert_equals(2, len(collected)) assert_equals((1, 2), collected[0][1].shape) # check keys assert_equals(0, collected[0][0]) assert_equals(1, collected[1][0]) # check values assert_equals(5, collected[0][1].flat[0]) assert_equals(6, collected[0][1].flat[1]) assert_equals(15, collected[1][1].flat[0]) assert_equals(16, collected[1][1].flat[1]) def test_meanWithSingleRegionIndices(self): indices = [[(1, 1), (0, 0)]] # one region with two indices regionMeanImages = self.images.meanByRegions(indices) self.__checkAttrPropagation(regionMeanImages, (1, 1)) collected = regionMeanImages.collect() assert_equals(2, len(collected)) assert_equals((1, 1), collected[0][1].shape) # check keys assert_equals(0, collected[0][0]) assert_equals(1, collected[1][0]) # check values assert_equals(5, collected[0][1][0]) assert_equals(15, collected[1][1][0]) def test_meanWithMultipleRegionIndices(self): indices = [[(0, 0), (0, 1)], [(0, 1), (1, 0)]] # two regions with two indices each regionMeanImages = self.images.meanByRegions(indices) self.__checkAttrPropagation(regionMeanImages, (1, 2)) collected = regionMeanImages.collect() assert_equals(2, len(collected)) assert_equals((1, 2), collected[0][1].shape) # check keys assert_equals(0, collected[0][0]) assert_equals(1, collected[1][0]) # check values assert_equals(4, collected[0][1].flat[0]) assert_equals(5, collected[0][1].flat[1]) assert_equals(14, collected[1][1].flat[0]) assert_equals(15, collected[1][1].flat[1]) def test_badIndexesThrowErrors(self): indices = [[(0, 0), (-1, 0)]] # index too small (-1) assert_raises(ValueError, self.images.meanByRegions, indices) indices = [[(0, 0), (2, 0)]] # index too large (2) assert_raises(ValueError, self.images.meanByRegions, indices) indices = [[(0, 0), (0, )]] # too few indices assert_raises(ValueError, self.images.meanByRegions, indices) indices = [[(0, 0), (0, 1, 0)]] # too many indices assert_raises(ValueError, self.images.meanByRegions, indices) def test_meanWithSingleRegionIndices3D(self): ary1 = array([[[3, 5, 3], [6, 8, 6]], [[3, 5, 3], [6, 8, 6]]], dtype='int32') ary2 = array( [[[13, 15, 13], [16, 18, 16]], [[13, 15, 13], [16, 18, 16]]], dtype='int32') images = ImagesLoader(self.sc).fromArrays([ary1, 
ary2]) indices = [[(1, 1, 1), (0, 0, 0)]] # one region with two indices regionMeanImages = images.meanByRegions(indices) self.__checkAttrPropagation(regionMeanImages, (1, 1)) collected = regionMeanImages.collect() # check values assert_equals(5, collected[0][1][0]) assert_equals(15, collected[1][1][0])
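The expected values in the integer-mask test above can be reproduced with plain numpy. This standalone sketch shows where 5 and 6 come from for the first image; the truncation from 5.5 to 5 reflects the int32 output dtype preserved by meanByRegions in the test:

from numpy import array

ary1 = array([[3, 5], [6, 8]], dtype='int32')
mask = array([[1, 0], [2, 1]], dtype='uint8')

# one mean per nonzero mask label; zeros mark background pixels
means = [ary1[mask == label].mean() for label in (1, 2)]
print(means)  # [5.5, 6.0] -> stored as 5 and 6 in the int32 result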
def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Load Images data as Series data. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include "local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in column-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'tif' data, dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: either 'stack' for flat binary stacks, or 'tif' for image files. Pages of a multipage tif file will extend along the third dimension. Separate files are interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension; default will be "bin" if inputFormat=="stack" and "tif" if inputFormat=='tif'. dtype: string or numpy dtype, optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' (data type will be read from the image headers). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g. "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameter to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). startIdx and stopIdx give the starting (inclusive) and final (exclusive) indices after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. shuffle: boolean, optional, default = True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` planes from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify the number of partitions for the underlying RDD; if unspecified, 1 partition per image will be used. renumber: boolean, optional, default = False Recalculate keys for records after images are loaded. Only necessary if different files contain different numbers of records (e.g. due to specifying nplanes).
See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of the conf file, if one is used to specify parameters for binary stack data. Returns ------- data: thunder.rdds.Series A Series object wrapping an RDD of (n-tuple of int, numpy array) key-value pairs. Keys will be n-tuples of int, with n given by the dimensionality of the images, and correspond to indices into the image arrays. Each value will have length equal to the number of image files, with every image contributing one point to the value array; ordering is given by the lexicographic ordering of image file names. """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # tif / tif stack images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() return images.toBlocks(blockSize, units=blockSizeUnits).toSeries() else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError("renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # tif / tif stack return loader.fromTif(dataPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
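To make the key/value layout of the returned Series concrete, a small sketch; `tsc` and the path are hypothetical:

# hypothetical: a directory of single-plane tif files, one time point each
series = tsc.loadImagesAsSeries("/mnt/data/tifs/", inputFormat='tif')
key, value = series.first()
# key is a coordinate n-tuple into the image arrays; value has one entry
# per image file, ordered by lexicographic file name
print(key)
print(len(value))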
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Write out Images data as Series data, saved in a flat binary format. The resulting files may subsequently be read in using ThunderContext.loadSeries(). Loading Series data directly will likely be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include "local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". outputDirPath: string Path to directory to write Series file output. May be either a path on the local file system or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in column-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'tif' data, dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: either 'stack' for flat binary stacks, or 'tif' for image files. Pages of a multipage tif file will extend along the third dimension. Separate files are interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension; default will be "bin" if inputFormat=="stack" and "tif" if inputFormat=='tif'. dtype: string or numpy dtype, optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' (data type will be read from the image headers). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g. "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameter to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). startIdx and stopIdx give the starting (inclusive) and final (exclusive) indices after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. shuffle: boolean, optional, default = True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will be deleted (recursively) if it already exists. (Use with caution.)
recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` planes from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify the number of partitions for the underlying RDD; if unspecified, 1 partition per image will be used. renumber: boolean, optional, default = False Recalculate keys for records after images are loaded. Only necessary if different files contain different numbers of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of the conf file, if one is used to specify parameters for binary stack data. """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite) else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError("renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, overwrite=overwrite, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # 'tif' or 'tif-stack' loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite, recursive=recursive)
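The converted output can then be read back directly, per the loadSeries() reference in the docstring above; a minimal sketch with hypothetical paths and a hypothetical ThunderContext `tsc`:

tsc.convertImagesToSeries("/mnt/data/tifs/", "/mnt/data/series-out/",
                          inputFormat='tif', overwrite=True)
series = tsc.loadSeries("/mnt/data/series-out/")  # faster than re-converting the images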
def roundTrip(images, dtype): outdir = os.path.join(self.outputdir, "binary-images-" + dtype) images.astype(dtype).saveAsBinaryImages(outdir) newimages = ImagesLoader(self.sc).fromStack(outdir, ext='bin') assert_true(array_equal(images.first()[1], newimages.first()[1]))