def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Writes out Series-formatted data.

    Subclasses are *not* expected to override this method.

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing
        directory. It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part of
        this call.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        overwrite = True  # prevent additional downstream checks for this path

    awsCredentialsOverride = AWSCredentials.fromContext(self.rdd.ctx)
    writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                     awsCredentialsOverride=awsCredentialsOverride)

    binseriesRdd = self.toBinarySeries()
    binseriesRdd.foreach(writer.writerFcn)
    writeSeriesConfig(outputDirPath, len(self.dims), self.nimages, keyType='int16',
                      valueType=self.dtype, overwrite=overwrite,
                      awsCredentialsOverride=awsCredentialsOverride)
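
# A minimal usage sketch for Images.saveAsBinarySeries. The `tsc` ThunderContext
# follows the convention used in the Series docstring below; both paths are
# assumptions for illustration, not part of the method above.
def exampleSaveImagesAsBinarySeries(tsc):
    images = tsc.loadImages("/hypothetical/input/images")
    images.saveAsBinarySeries("/hypothetical/output/series", overwrite=True)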
def __init__(self, sparkContext):
    """Initialize a new ImagesLoader object.

    Parameters
    ----------
    sparkContext : SparkContext
        The pyspark SparkContext object used by the current Thunder environment.
    """
    from thunder.utils.common import AWSCredentials
    self.sc = sparkContext
    self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)
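
# A minimal construction sketch for ImagesLoader: the constructor just captures
# the SparkContext and any AWS credentials attached to it. The `fromTif` call
# and the path are assumptions for illustration (Thunder's loaders expose
# format-specific from* methods); only the constructor appears in the source above.
def exampleLoadImages(sparkContext):
    from thunder.rdds.fileio.imagesloader import ImagesLoader
    loader = ImagesLoader(sparkContext)
    return loader.fromTif("/hypothetical/path/to/tif/files")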
def exportAsPngs(self, outputDirPath, filePrefix="export", overwrite=False, collectToDriver=True):
    """
    Write out basic png files for two-dimensional image data.

    Files will be written into a newly-created directory on the local file system given by
    outputDirPath. All workers must be able to see the output directory via an NFS share or similar.

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>00000.png,
        <filePrefix>00001.png, etc.

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.

    collectToDriver : bool, default True
        If true, images will be collect()'ed at the driver first before being written out,
        allowing for use of a local filesystem at the expense of network overhead. If false,
        images will be written in parallel by each executor, presumably to a distributed or
        networked filesystem.
    """
    dims = self.dims
    if not len(dims) == 2:
        raise ValueError("Only two-dimensional images can be exported as .png files; "
                         "image is %d-dimensional." % len(dims))

    from matplotlib.pyplot import imsave
    from io import BytesIO
    from thunder.rdds.fileio.writers import getParallelWriterForPath, getCollectedFileWriterForPath
    from thunder.utils.common import AWSCredentials

    def toFilenameAndPngBuf(kv):
        key, img = kv
        fname = filePrefix + "%05d.png" % int(key)
        bytebuf = BytesIO()
        imsave(bytebuf, img, format="png")
        return fname, bytebuf.getvalue()

    bufRdd = self.rdd.map(toFilenameAndPngBuf)

    awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    if collectToDriver:
        writer = getCollectedFileWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                              awsCredentialsOverride=awsCredentials)
        writer.writeCollectedFiles(bufRdd.collect())
    else:
        writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                         awsCredentialsOverride=awsCredentials)
        bufRdd.foreach(writer.writerFcn)
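
# A usage sketch for exportAsPngs, assuming `images` holds two-dimensional
# Images data; the path and prefix are hypothetical. With collectToDriver=True
# the PNGs are written from the driver, so no shared filesystem is required.
def exampleExportPngs(images):
    images.exportAsPngs("/hypothetical/output/pngs", filePrefix="frame",
                        overwrite=True, collectToDriver=True)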
def __init__(self, sparkContext, minPartitions=None):
    """Initialize a new SeriesLoader object.

    Parameters
    ----------
    sparkContext : SparkContext
        The pyspark SparkContext object used by the current Thunder environment.

    minPartitions : int
        Minimum number of partitions to use when loading data. (Used by fromText, fromMatLocal,
        and fromNpyLocal.)
    """
    from thunder.utils.common import AWSCredentials
    self.sc = sparkContext
    self.minPartitions = minPartitions
    self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)
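
# A minimal construction sketch for SeriesLoader. fromText is named in the
# docstring above as one of the loaders honoring minPartitions; the exact
# fromText arguments and the path are assumptions for illustration.
def exampleLoadSeries(sparkContext):
    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(sparkContext, minPartitions=4)
    return loader.fromText("/hypothetical/path/to/text/records")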
def saveAsBinaryImages(self, outputDirPath, filePrefix="export", overwrite=False):
    """
    Write out images or volumes as flat binary files.

    Files will be written into a newly-created directory given by outputDirPath.

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>-00000.bin,
        <filePrefix>-00001.bin, etc.

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.
    """
    from numpy import asarray
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.imagesloader import writeBinaryImagesConfig
    from thunder.utils.common import AWSCredentials

    dimsTotal = list(asarray(self.dims.max) - asarray(self.dims.min) + 1)

    def toFilenameAndBinaryBuf(kv):
        key, img = kv
        fname = filePrefix + "-" + "%05d.bin" % int(key)
        return fname, img.transpose().copy()

    bufRdd = self.rdd.map(toFilenameAndBinaryBuf)

    awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                     awsCredentialsOverride=awsCredentials)
    bufRdd.foreach(writer.writerFcn)
    writeBinaryImagesConfig(outputDirPath, dims=dimsTotal, dtype=self.dtype,
                            overwrite=overwrite, awsCredentialsOverride=awsCredentials)
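
# A usage sketch for saveAsBinaryImages, assuming `images` is an Images object;
# the output path is hypothetical. Each image record becomes one flat binary
# file, written in parallel by the executors.
def exampleSaveBinaryImages(images):
    images.saveAsBinaryImages("/hypothetical/output/binimages", filePrefix="export",
                              overwrite=True)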
def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Writes out Series-formatted data.

    This method (Series.saveAsBinarySeries) writes out binary series files using the current
    partitioning of this Series object. (That is, if mySeries.rdd.getNumPartitions() == 5, then 5
    files will be written out, one per partition.) The records will not be resorted; the file
    names for each partition will be taken from the key of the first Series record in that
    partition. If the Series object is already sorted and no records have been removed by
    filtering, then the resulting output should be equivalent to what one would get from calling
    myImages.saveAsBinarySeries().

    If all one wishes to do is to save out Images data in a binary series format, then
    tsc.convertImagesToSeries() will likely be more efficient than
    tsc.loadImages().toSeries().saveAsBinarySeries().

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing
        directory. It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part of
        this call.
    """
    import cStringIO as StringIO
    import struct
    from thunder.rdds.imgblocks.blocks import SimpleBlocks
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        overwrite = True  # prevent additional downstream checks for this path

    def partitionToBinarySeries(kvIter):
        """Collects all Series records in a partition into a single binary series record."""
        keypacker = None
        firstKey = None
        buf = StringIO.StringIO()
        for seriesKey, series in kvIter:
            if keypacker is None:
                keypacker = struct.Struct('h' * len(seriesKey))
                firstKey = seriesKey
            # print >> sys.stderr, seriesKey, series, series.tostring().encode('hex')
            buf.write(keypacker.pack(*seriesKey))
            buf.write(series.tostring())
        val = buf.getvalue()
        buf.close()
        # we might have an empty partition, in which case firstKey will still be None
        if firstKey is None:
            return iter([])
        else:
            label = SimpleBlocks.getBinarySeriesNameForKey(firstKey) + ".bin"
            return iter([(label, val)])

    awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                     awsCredentialsOverride=awsCredentials)

    binseriesrdd = self.rdd.mapPartitions(partitionToBinarySeries)
    binseriesrdd.foreach(writer.writerFcn)

    # TODO: all we really need here are the number of keys and number of values, which could in
    # principle be cached in _nkeys and _nvals attributes, removing the need for this .first()
    # call in most cases.
    firstKey, firstVal = self.first()
    writeSeriesConfig(outputDirPath, len(firstKey), len(firstVal), keyType='int16',
                      valueType=self.dtype, overwrite=overwrite,
                      awsCredentialsOverride=awsCredentials)
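
# A usage sketch for Series.saveAsBinarySeries, using the chain named in the
# docstring above; `tsc` and the paths are assumptions for illustration. One
# file is written per partition, named after the first key in that partition.
def exampleSaveSeries(tsc):
    series = tsc.loadImages("/hypothetical/input/images").toSeries()
    series.saveAsBinarySeries("/hypothetical/output/series", overwrite=True)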
def _checkOverwrite(self, outputDirPath):
    """Checks for existence of outputDirPath, raising ValueError if it already exists."""
    from thunder.utils.common import AWSCredentials, raiseErrorIfPathExists
    awsCredentialsOverride = AWSCredentials.fromContext(self.rdd.ctx)
    raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=awsCredentialsOverride)
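
# A behavior sketch for the overwrite guard: when overwrite=False the save
# methods above call _checkOverwrite, which raises ValueError if the target
# already exists. `series` and the path are assumptions for illustration.
def exampleOverwriteGuard(series):
    try:
        series.saveAsBinarySeries("/hypothetical/existing/dir", overwrite=False)
    except ValueError:
        pass  # directory already exists; pass overwrite=True to replace it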