def __init__(self, h5fFile, tablename='FitResults'):
    """Data source for use with h5r files as saved by the PYME analysis
    component.

    Takes either an open h5r file (a raw :mod:`tables` file or an
    ``h5rFile.H5RFile`` wrapper) or a string filename to be opened.
    """
    from PYME.IO import h5rFile

    self.tablename = tablename

    if isinstance(h5fFile, tables.file.File):
        # raw pytables file - pull the whole table off the root node
        try:
            self.fitResults = getattr(h5fFile.root, tablename)[:]
        except (AttributeError, tables.NoSuchNodeError):
            logger.exception('Was expecting to find a "%s" table' % tablename)
            raise

        # allow access using unnested original names
        self._keys = unNestNames(getattr(h5fFile.root, tablename).description._v_nested_names)
        return

    # otherwise we were given either an H5RFile instance or a filename
    h5f = h5fFile if isinstance(h5fFile, h5rFile.H5RFile) else h5rFile.openH5R(h5fFile)

    with h5f:
        self.fitResults = h5f.getTableData(tablename, slice(None))

        if len(self.fitResults) == 0:
            raise RuntimeError('Was expecting to find a "%s" table' % tablename)

        # allow access using unnested original names
        self._keys = unNestNames(getattr(h5f._h5file.root, tablename).description._v_nested_names)
def to_hdf(self, filename, tablename='Data', keys=None, metadata=None, keep_alive_timeout=0):
    """
    Write data to a table in an HDF5 file.

    Parameters
    ----------
    filename: string
        the name of the file to save to
    tablename: string [optional]
        the name of the table within the file to save to. Defaults to "Data"
    keys: list [optional]
        a list of column names to save (if keys == None, all columns are saved)
    metadata: a MetaDataHandler instance [optional]
        associated metadata to write to the file
    keep_alive_timeout: float
        a timeout in seconds. If non-zero, the file is held open after we have
        finished writing to it until the timeout elapses. Useful as a
        performance optimisation when making multiple writes to a single file,
        potentially across multiple threads.

        NOTE: the keep_alive_timeout is not guaranteed to be observed - it
        gets set by the first open call of a given session, so if the file is
        already open due to a previous openH5R call, the timeout requested by
        that call will be used.
    """
    from PYME.IO import h5rFile

    with h5rFile.openH5R(filename, 'a', keep_alive_timeout=keep_alive_timeout) as out:
        out.appendToTable(tablename, self.to_recarray(keys))

        if metadata is not None:
            out.updateMetadata(metadata)

        # block until the data has actually been written
        out.flush()
def to_hdf(self, filename, tablename='Data', keys=None, metadata=None):
    """Append this table (restricted to the columns in *keys* if given) to an
    HDF5 file, writing *metadata* alongside when provided."""
    from PYME.IO import h5rFile

    with h5rFile.openH5R(filename, 'a') as out:
        out.appendToTable(tablename, self.to_recarray(keys))

        if metadata is not None:
            out.updateMetadata(metadata)
def loadInput(self, filename, key='input'):
    """ Load input data from a file and inject into namespace

    Parameters
    ----------
    filename: str
        path/URI of the data to load. ``.h5r``/``.hdf`` files are loaded as
        tables; anything else falls through to ``ImageStack`` (``.csv`` and
        ``.xls(x)`` are explicitly not yet supported).
    key: str
        the namespace key under which to store the loaded data
    """
    from PYME.IO import unifiedIO
    import os
    extension = os.path.splitext(filename)[1]
    if extension in ['.h5r', '.hdf']:
        import tables
        from PYME.IO import h5rFile
        try:
            # fetch a local copy if the file is remote, then open read-only
            with unifiedIO.local_or_temp_filename(filename) as fn, \
                    h5rFile.openH5R(fn, mode='r')._h5file as h5f:
                self._inject_tables_from_hdf5(key, h5f, fn, extension)
        except tables.exceptions.HDF5ExtError:
            # access issue likely due to multiple processes
            if unifiedIO.is_cluster_uri(filename):
                # try again, this time forcing access through the dataserver
                # NOTE: it is unclear why this should work when local_or_temp_filename() doesn't
                # as this still opens / copies the file independently, albeit in the same process as is doing the writing.
                # The fact that this works is relying on one of a quirk of the GIL, a quirk in HDF5 locking, or the fact
                # that copying the file to a stream is much faster than opening it with pytables. The copy vs pytables open
                # scenario would match what has been observed with old style spooling analysis where copying a file
                # prior to opening in VisGUI would work more reliably than opening directly. This retains, however,
                # an inherent race condition so we risk replacing a predictable failure with a less frequent one.
                # TODO - consider whether h5r_part might be a better choice.
                # FIXME: (DB) I'm not comfortable with having this kind of special case retry logic here, and would
                # much prefer if we could find an alternative workaround, refactor into something like h5rFile.open_robust(),
                # or just let this fail). Leaving it for the meantime to get chained recipes working, but we should revisit.
                from PYME.IO import clusterIO
                relative_filename, server_filter = unifiedIO.split_cluster_url(filename)
                file_as_bytes = clusterIO.get_file(relative_filename, serverfilter=server_filter,
                                                   local_short_circuit=False)
                # open the fetched bytes as an in-memory HDF5 image (no disk backing)
                with tables.open_file('in-memory.h5', driver='H5FD_CORE', driver_core_image=file_as_bytes,
                                      driver_core_backing_store=0) as h5f:
                    self._inject_tables_from_hdf5(key, h5f, filename, extension)
            else:
                #not a cluster file, doesn't make sense to retry with cluster. Propagate exception to user.
                raise
    elif extension == '.csv':
        logger.error('loading .csv not supported yet')
        raise NotImplementedError
    elif extension in ['.xls', '.xlsx']:
        logger.error('loading .xls not supported yet')
        raise NotImplementedError
    else:
        # default: treat as image data
        self.namespace[key] = ImageStack(filename=filename, haveGUI=False)
def _aggregate_h5r(self, path, data):
    """
    Support for results aggregation into an HDF5 file, using pytables.

    We treat any path components after the .h5r as locations within the file
    (ie table names). e.g. /path/to/data.h5r/<tablename>

    A few special cases / Table names are accommodated:

    MetaData: assumes we have sent PYME metadata in json format and saves to
    the file using the appropriate metadatahandler

    No table name: assumes we have a fitResults object (as returned by
    remFitBuf) and saves to the appropriate tables (as HDF task queue would)
    """
    import numpy as np
    from io import BytesIO
    from six.moves import cPickle
    from PYME.IO import MetaDataHandler
    from PYME.IO import h5rFile

    # strip the aggregation prefix and split into on-disk filename and
    # in-file table path
    path = self.translate_path(path.lstrip('/')[len('__aggregate_h5r'):])
    filename, tablename = path.split('.h5r')
    filename += '.h5r'

    with h5rFile.openH5R(filename, 'a') as h5f:
        if tablename == '/MetaData':
            mdh_in = MetaDataHandler.CachingMDHandler(json.loads(data))
            h5f.updateMetadata(mdh_in)
        elif tablename == '':
            # legacy fitResults structure
            # NOTE(review): unpickling is unsafe on untrusted input - this
            # assumes the cluster runs on a trusted network
            fitResults = cPickle.loads(data)
            h5f.fileFitResult(fitResults)
        else:
            try:
                # try to read data as if it was numpy binary formatted
                data = np.load(BytesIO(data))
            except (IOError, ValueError):
                # it's not numpy formatted - try json.
                # NOTE: numpy >= 1.16.3 raises ValueError (not IOError) for
                # input which is not in .npy format, so we must catch both
                # or the json fallback is unreachable.
                import pandas as pd
                # FIXME!! - this will work, but will likely be really slow!
                data = pd.read_json(data).to_records(False)

            h5f.appendToTable(tablename.lstrip('/'), data)

    return ResponseOK()
def getQueueData(self, fieldName, *args):
    """Return data associated with the queue, selected by *fieldName* and
    (where applicable) additional field-specific arguments.

    Known fields: 'FitResults' (args: startingAt), 'PSF', 'MAP' (args: mapName).
    Unknown fields return None.
    """
    if fieldName == 'FitResults':
        startingAt, = args
        with h5rFile.openH5R(self.resultsFilename, 'a') as h5f:
            return h5f.getTableData('FitResults', slice(startingAt, None))

    elif fieldName == 'PSF':
        from PYME.IO.load_psf import load_psf
        psf_name = self.metaData.getEntry('PSFFile')
        return load_psf(getFullExistingFilename(psf_name))

    elif fieldName == 'MAP':
        from PYME.IO.image import ImageStack
        mapName, = args
        print('Serving map: %s' % mapName)
        fn = getFullExistingFilename(mapName)
        # ImageStack handles .tif, .h5, and a few other formats
        varmap = ImageStack(filename=fn, haveGUI=False).data[:, :, 0].squeeze()
        return varmap

    else:
        return None
def fileResults(self, ress):
    """
    File/save the results of fitting multiple frames

    Args:
        ress: list of fit results (each exposing .results and .driftResults
            record arrays); entries may be None for failed tasks

    Returns:
        None
    """
    with h5rFile.openH5R(self.resultsFilename, 'a') as h5f:
        for res in ress:
            if res is None:
                # logging.warn is a deprecated alias - use logging.warning
                logging.warning('got a none result')
                continue

            if len(res.results) > 0:
                h5f.appendToTable('FitResults', res.results)

            if len(res.driftResults) > 0:
                h5f.appendToTable('DriftResults', res.driftResults)

    # count everything we were handed (including None entries) as closed
    self.numClosedTasks += len(ress)
def _aggregate_h5r(self):
    """
    Support for results aggregation into an HDF5 file, using pytables.

    We treat any path components after the .h5r as locations within the file
    (ie table names). e.g. /path/to/data.h5r/<tablename>

    A few special cases / Table names are accommodated:

    MetaData: assumes we have sent PYME metadata in json format and saves to
    the file using the appropriate metadatahandler

    No table name: assumes we have a fitResults object (as returned by
    remFitBuf) and saves to the appropriate tables (as HDF task queue would)
    """
    import numpy as np
    from io import BytesIO
    from six.moves import cPickle
    from PYME.IO import MetaDataHandler
    from PYME.IO import h5rFile

    # strip the aggregation prefix from the request path, then split into the
    # on-disk filename and the in-file table path
    filename, tablename = self.path.lstrip('/')[len('__aggregate_h5r'):].split('.h5r')
    filename = self.translate_path(filename + '.h5r')

    data = self._get_data()

    # make sure the target directory exists before opening for append
    dirname = os.path.dirname(filename)
    makedirs_safe(dirname)

    with h5rFile.openH5R(filename, 'a') as h5f:
        if tablename == '/MetaData':
            mdh_in = MetaDataHandler.CachingMDHandler(json.loads(data))
            h5f.updateMetadata(mdh_in)
        elif tablename == '':
            # legacy fitResults structure
            # NOTE(review): unpickling is unsafe on untrusted input - this
            # assumes the cluster runs on a trusted network
            fitResults = cPickle.loads(data)
            h5f.fileFitResult(fitResults)
        else:
            try:
                # pickle is much faster than numpy array format (despite the
                # array format being simpler) - reluctantly use pickles.
                # NOTE: np.loads was a deprecated alias of pickle.loads and
                # has been removed from recent numpy - call cPickle directly.
                data = cPickle.loads(data)
            except cPickle.UnpicklingError:
                try:
                    # try to read data as if it was numpy binary formatted
                    data = np.load(BytesIO(data))
                except (IOError, ValueError):
                    # not numpy formatted - try json. numpy >= 1.16.3 raises
                    # ValueError (not IOError) for non-.npy input, so catch both.
                    import pandas as pd
                    # FIXME!! - this will work, but will likely be really slow!
                    data = pd.read_json(data).to_records(False)

            h5f.appendToTable(tablename.lstrip('/'), data)

    if USE_DIR_CACHE:
        cl.dir_cache.update_cache(filename, int(len(data)))

    self.send_response(200)
    self.send_header("Content-Length", "0")
    self.end_headers()
    return
def get_tabular_part(self, path):
    """
    Parameters
    ----------
    path: str
        OS-translated path to an hdf or h5r file on the dataserver computer.
        Append the part of the file to read after the file extension, e.g.
        .h5r/Events. Return format (for arrays) can additionally be specified,
        as can slices using the following syntax:
        test.h5r/FitResults.json?from=0&to=100. Supported array formats
        include json and npy.

    Returns
    -------
    f: BytesIO
        Requested part of the file encoded as bytes
    """
    from PYME.IO import h5rFile, clusterResults

    # parse path
    ext = '.h5r' if '.h5r' in path else '.hdf'
    # TODO - should we just use the the untranslated path?
    filename, details = path.split(ext + os.sep)
    filename = filename + ext  # path to file on dataserver disk
    query = urlparse.urlparse(details).query

    # drop the query string from the part specifier.
    # NOTE: the previous details.strip('?' + query) stripped a *character set*
    # rather than a suffix, and could mangle part names which begin or end
    # with characters also present in the query string.
    details = details.split('?', 1)[0]

    if '.' in details:
        # rsplit so that part names which themselves contain a '.' don't
        # cause an unpacking error - only the final suffix is the return type
        part, return_type = details.rsplit('.', 1)
    else:
        part, return_type = details, ''

    try:
        with h5rFile.openH5R(filename) as h5f:
            if part == 'Metadata':
                wire_data, output_format = clusterResults.format_results(h5f.mdh, return_type)
            else:
                # figure out if we have any slicing to do
                query = urlparse.parse_qs(query)
                start = int(query.get('from', [0])[0])
                end = None if 'to' not in query.keys() else int(query['to'][0])
                wire_data, output_format = clusterResults.format_results(
                    h5f.getTableData(part, slice(start, end)), '.' + return_type)

        f, length = self._string_to_file(wire_data)
        self.send_response(200)
        self.send_header("Content-Type",
                         output_format if output_format else 'application/octet-stream')
        self.send_header("Content-Length", length)
        #self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        self.end_headers()
        return f
    except IOError:
        self.send_error(404, "File not found - %s, [%s]" % (self.path, path))
def getNumQueueEvents(self):
    """Return the number of acquisition events currently stored in the
    results file."""
    with h5rFile.openH5R(self.resultsFilename, 'a') as results_file:
        n_events = len(results_file.events)

    return n_events
def addQueueEvents(self, events):
    """Append acquisition *events* to the results file."""
    with h5rFile.openH5R(self.resultsFilename, 'a') as results_file:
        results_file.addEvents(events)
def setQueueMetaDataEntries(self, mdh):
    """Merge the entries of *mdh* into the on-disk results file, then mirror
    them into the in-memory metadata."""
    with h5rFile.openH5R(self.resultsFilename, 'a') as results_file:
        results_file.updateMetadata(mdh)

        # keep the in-memory copy in sync with what we just persisted
        self.metaData.update(mdh)
def flushMetaData(self):
    """Write any cached metadata entries to the results file.

    A no-op when the cache is empty, so this is cheap to call repeatedly.
    """
    if not self.MDHCache:
        return

    # snapshot and clear the cache before writing
    pending, self.MDHCache = dict(self.MDHCache), []

    with h5rFile.openH5R(self.resultsFilename, 'a') as results_file:
        results_file.updateMetadata(pending)