def _stop(self):
    """
    Stop the scan, finish building the tile pyramid and save its metadata.

    The base scanner is stopped without emitting its own stop signal; the
    ``on_stop`` signal is sent from here once the pyramid has been completed
    and its metadata written (to the cluster for the 'cluster' backend,
    otherwise to ``metadata.json`` inside ``self._tiledir``).
    """
    pointScanner.PointScanner._stop(self, send_stop=False)
    self.progress.send(self)

    started = time.time()
    on_cluster = self._backend == 'cluster'

    logger.info('Finished tile acquisition')
    if on_cluster:
        logger.info('Waiting for spoolers to empty and for base levels to be built')

    self.P.finish_base_tiles()

    if on_cluster:
        logger.info('Base tiles built')

    logger.info('Completing pyramid (dt = %3.2f)' % (time.time() - started))
    self.P.update_pyramid()

    if not on_cluster:
        # local backend - metadata lives next to the tiles on disk
        metadata_path = os.path.join(self._tiledir, 'metadata.json')
        with open(metadata_path, 'w') as f:
            f.write(self.P.mdh.to_JSON())
    else:
        from PYME.IO import clusterIO

        clusterIO.put_file(self.P.base_dir + '/metadata.json',
                           self.P.mdh.to_JSON().encode())

    logger.info('Pyramid complete (dt = %3.2f)' % (time.time() - started))

    # deferred stop notification (suppressed in the base-class call above)
    self.on_stop.send(self)
    self.progress.send(self)
def prepare(self):
    """
    Do any setup work - e.g. uploading metadata required before the rule is
    triggered.

    Returns
    -------
    post_args : dict
        a dictionary with arguments to pass to RulePusher._post_rule() -
        specifically timeout, max_tasks, release_start, release_end. Only
        ``max_tasks`` is set here, and only when the data is already complete.
    """
    # set up results file
    logging.debug('resultsURI: ' + self.worker_resultsURI)
    metadata_uri = self.worker_resultsURI + '/MetaData'
    clusterResults.fileResults(metadata_uri, self.mdh)

    # NB - events are not filed here; copying them is deferred until after
    # series completion.

    # set up metadata file which is used for deciding how to launch the analysis
    metadata_json = self.mdh.to_JSON().encode()
    clusterIO.put_file(self.resultsMDFilename, metadata_json,
                       serverfilter=self.serverfilter)

    # NB - the wait for clusterIO caches to clear (to avoid replicating the
    # results file) was moved inside the polling thread so launches run quicker.

    self._next_release_start = self.start_at
    self.frames_outstanding = self.total_frames - self._next_release_start

    if not self.data_complete:
        return {}

    return {'max_tasks': self.total_frames}
def test_dircache_purge():
    """
    Put a file into, and then list, each of 1050 distinct directories.

    Presumably the count is chosen to exceed the directory cache size so that
    a purge is triggered - TODO confirm against the clusterIO cache limit.
    The test passes if no call raises.
    """
    payload = b'foo bar\n'
    n_folders = 1050

    for idx in range(n_folders):
        folder = '_testing/lots_of_folders/test_%d/' % idx
        clusterIO.put_file(folder + 'test.txt', payload, 'TEST')
        clusterIO.listdir(folder, 'TEST')
def StartSpool(self):
    """
    Start spooling and upload the series metadata as ``metadata.json``.

    When ``self._aggregate_h5`` is set, the metadata goes to the
    ``__aggregate_h5/`` endpoint with an extended timeout; otherwise it is
    written directly under the series name with the default timeout.
    """
    sp.Spooler.StartSpool(self)
    logger.debug('Starting spooling: %s' % self.seriesName)

    if not self._aggregate_h5:
        clusterIO.put_file(self.seriesName + '/metadata.json',
                           self.md.to_JSON().encode(),
                           serverfilter=self.clusterFilter)
        return

    # NOTE: allow a longer timeout than normal here as __aggregate with
    # metadata waits for a lock on the server side before actually adding (and
    # is therefore susceptible to longer latencies than most operations).
    # FIXME - remove server side lock.
    clusterIO.put_file('__aggregate_h5/' + self.seriesName + '/metadata.json',
                       self.md.to_JSON().encode(),
                       serverfilter=self.clusterFilter,
                       timeout=3)
def test_double_put():
    """Trying to put the same file twice should cause an error"""
    payload = b'foo bar\n'
    target = '_testing/test_d.txt'

    clusterIO.put_file(target, payload, 'TES1')

    try:
        clusterIO.put_file(target, payload, 'TES1')
    except RuntimeError:
        # this is the error we want the second put to generate
        return
    else:
        raise AssertionError('Second put attempt did not raise an error')
def StopSpool(self):
    """
    Stop spooling and upload the final metadata and acquisition events.

    Both files are written as JSON, either under the ``__aggregate_h5/``
    endpoint (when ``self._aggregate_h5`` is set) or directly under the
    series name.
    """
    self._dPoll = False
    sp.Spooler.StopSpool(self)
    logger.debug('Stopping spooling %s' % self.seriesName)

    if self._aggregate_h5:
        series_root = '__aggregate_h5/' + self.seriesName
    else:
        series_root = self.seriesName

    clusterIO.put_file(series_root + '/final_metadata.json',
                       self.md.to_JSON().encode(),
                       serverfilter=self.clusterFilter)

    # save the acquisition events as json - TODO - consider a binary format
    # as the events can be quite numerous
    clusterIO.put_file(series_root + '/events.json',
                       self.evtLogger.to_JSON().encode(),
                       serverfilter=self.clusterFilter)
def StartSpool(self):
    """
    Start spooling and upload the series metadata as ``metadata.json``.

    The file is written under the ``__aggregate_h5/`` endpoint when
    ``self._aggregate_h5`` is set, otherwise directly under the series name.
    """
    sp.Spooler.StartSpool(self)
    logger.debug('Starting spooling: %s' % self.seriesName)

    if self._aggregate_h5:
        target = '__aggregate_h5/' + self.seriesName + '/metadata.json'
    else:
        target = self.seriesName + '/metadata.json'

    metadata_json = self.md.to_JSON().encode()
    clusterIO.put_file(target, metadata_json, serverfilter=self.clusterFilter)
def mkdir(request, basedir):
    """
    Create a new directory entry on the cluster below ``basedir``.

    The directory name is taken from the ``newDirectory`` POST parameter,
    falling back to the GET parameter of the same name. The entry is created
    by putting an empty payload at the '/'-terminated path.

    Parameters
    ----------
    request : HttpRequest
        the incoming request (must expose ``POST`` and ``GET`` mappings)
    basedir : str
        path prefix under which the new directory is created

    Returns
    -------
    HttpResponse
        the new directory path on success, or ``HttpResponseForbidden`` if no
        name was given or something already exists at that path (with or
        without the trailing slash).
    """
    from PYME.IO import clusterIO

    newDirectory = request.POST.get('newDirectory',
                                    request.GET.get('newDirectory', None))

    if not newDirectory:
        # covers both a missing parameter and an empty string
        return HttpResponseForbidden('No directory name specified')

    # normalise to exactly one trailing slash
    newDirectory = (basedir + newDirectory).rstrip('/') + '/'

    if clusterIO.exists(newDirectory) or clusterIO.exists(newDirectory[:-1]):
        return HttpResponseForbidden('Directory already exists')

    # payloads are passed to put_file as bytes elsewhere in this file; use an
    # empty bytes object rather than an empty str
    clusterIO.put_file(newDirectory, b'')

    return HttpResponse(newDirectory)
def upload_files(request, directory):
    """
    Upload every file in the request's ``file`` field into ``directory``.

    All target names are checked first; if any already exists on the cluster
    nothing is uploaded and an ``HttpResponseForbidden`` naming the first
    clash is returned. Otherwise each file is put on the cluster and the
    client is redirected to the referring page.
    """
    from PYME.IO import clusterIO

    uploads = request.FILES.getlist('file')

    # pass 1 - refuse the whole batch if any target already exists
    for upload in uploads:
        destination = directory + upload.name
        if clusterIO.exists(destination):
            return HttpResponseForbidden(
                'Upload failed [no files uploaded]. %s already exists on cluster' % destination)

    # pass 2 - nothing clashes, so store everything
    for upload in uploads:
        clusterIO.put_file(directory + upload.name, upload.read())

    return HttpResponseRedirect(request.META['HTTP_REFERER'])
def test_single_put():
    """
    Time two small puts and one get, and check the data round-trips.

    The payload is bytes, matching the other put/get tests in this file.
    """
    testdata = b'foo bar\n'

    t = time.time()
    clusterIO.put_file('_testing/test.txt', testdata, 'TEST')
    print('putting a small file took %3.5f s' % (time.time() - t))

    t = time.time()
    clusterIO.put_file('_testing/test1.txt', testdata, 'TEST')
    print('putting a second small file took %3.5f s' % (time.time() - t))

    t = time.time()
    retrieved = clusterIO.get_file('_testing/test.txt', 'TEST')
    print('retrieving a small file took %3.5f s' % (time.time() - t))

    # previously the retrieved data was fetched but never checked
    assert retrieved == testdata
def recvMember(self, rfile, name, size, req):
    """
    Receive (save) a member file.

    The request body is read from ``rfile`` into memory and then put on the
    cluster under ``self.fsname`` joined with the unquoted ``name``.

    ``size == 0`` only records the name in ``_dummy_files`` (nothing is put);
    ``size == -2`` is read as a sequence of hex-length-prefixed chunks;
    ``size > 0`` reads exactly up to ``size`` bytes. Any other value results
    in an empty payload being put.
    """
    # NOTE(review): urllib.unquote is the Python 2 location of this function
    fname = os.path.join(self.fsname, urllib.unquote(name))

    if size == 0:
        _dummy_files.append(fname)
        return

    # real data is arriving for this name, so it is no longer a dummy
    try:
        _dummy_files.remove(fname)
    except ValueError:
        pass

    f = BytesIO()

    if size == -2:
        # chunked mode (e.g. as used by the OSX finder when putting data);
        # each chunk is preceded by a line holding its length in hex.
        # NOTE(review): the original comment said size=-1 - confirm sentinel.
        chunk_len = int(rfile.readline(), 16)
        while chunk_len > 0:
            f.write(rfile.read(chunk_len))
            rfile.readline()  # skip the line following the chunk data
            chunk_len = int(rfile.readline(), 16)
    elif size > 0:
        received = 0
        block_size = 65536
        while True:
            # never request more than is still outstanding
            block_size = min(block_size, size - received)
            buf = rfile.read(block_size)
            if len(buf) == 0:
                break

            f.write(buf)
            received += len(buf)
            if received >= size:
                break

    logger.debug('ClusterIO put: %s' % fname)
    clusterIO.put_file(fname, f.getvalue())
    f.close()
def finalise(self):
    """
    Finish spooling: optionally join the polling threads, then upload the
    final metadata and the acquisition events as JSON.
    """
    # wait until our input queue is empty rather than immediately stopping saving.
    self._stopping = True
    logger.debug('Stopping spooling %s' % self.seriesName)

    # Joining the polling threads can be switched off in a config option for
    # maximum performance on High Throughput systems. Joining is the
    # recommended and safest behaviour, but forces spooling of the current
    # series to complete before the next series starts, so could have negative
    # performance implications. The alternative - letting spooling continue
    # during the acquisition of the next series - has the potential to result
    # in runaway memory and thread usage when things go pear shaped (i.e.
    # spooling is not fast enough).
    # TODO - is there actually a performance impact that justifies this config
    # option, or is it purely theoretical
    join_threads = config.get('httpspooler-jointhreads', True)
    if join_threads:
        for poll_thread in self._pollThreads:
            poll_thread.join()

    # remove our reference to the threads which hold back-references
    # preventing garbage collection
    del self._pollThreads

    if self._aggregate_h5:
        series_root = '__aggregate_h5/' + self.seriesName
    else:
        series_root = self.seriesName

    # save events and final metadata
    # TODO - use a binary format for saving events - they can be quite
    # numerous, and can trip the standard 1 s clusterIO.put_file timeout.
    # Use long timeouts as a temporary hack because failing these can ruin
    # a dataset
    clusterIO.put_file(series_root + '/final_metadata.json',
                       self.md.to_JSON().encode(),
                       self.clusterFilter)
    clusterIO.put_file(series_root + '/events.json',
                       self.evtLogger.to_JSON().encode(),
                       self.clusterFilter,
                       timeout=10)
def __init__(self, dataSourceID, metadata, resultsFilename, queueName = None, startAt = 10, dataSourceModule=None, serverfilter=''):
    """
    Create a pusher and push tasks for each frame in a series. For use with
    the new cluster distribution architecture.

    Sets up the results file and its metadata sidecar on the cluster, then
    starts a polling thread running ``self._updatePoll``.

    Parameters
    ----------
    dataSourceID : str
        The URI of the data source - e.g. PYME-CLUSTER://serverfilter/path/to/data
    metadata : PYME.IO.MetaDataHandler object
        The acquisition and analysis metadata
    resultsFilename : str
        The cluster relative path to the results file. e.g.
        "<username>/analysis/<date>/seriesname.h5r"
    queueName : str
        a name to give the queue. The results filename is used if no name is
        given.
    startAt : int
        which frame to start at. TODO - read from metadata instead of taking
        as a parameter.
    dataSourceModule : str [optional]
        The name of the module to use for reading the raw data. If not given,
        it will be inferred from the dataSourceID
    serverfilter : str
        A cluster filter, for use when multiple PYME clusters are visible on
        the same network segment.

    Raises
    ------
    RuntimeError
        if the data source ID, queue name or results filename contains a '~'.
    """
    if queueName is None:
        queueName = resultsFilename

    self.queueID = queueName
    self.dataSourceID = dataSourceID

    if '~' in self.dataSourceID or '~' in self.queueID or '~' in resultsFilename:
        # the check is for tildes; the message used to say "dashes"
        raise RuntimeError("File, queue or results name must NOT contain tildes ('~')")

    self.resultsURI = 'PYME-CLUSTER://%s/__aggregate_h5r/%s' % (serverfilter, resultsFilename)

    resultsMDFilename = resultsFilename + '.json'
    self.results_md_uri = 'PYME-CLUSTER://%s/%s' % (serverfilter, resultsMDFilename)

    self.taskQueueURI = _getTaskQueueURI()

    self.mdh = metadata

    # load data source
    if dataSourceModule is None:
        DataSource = DataSources.getDataSourceForFilename(dataSourceID)
    else:
        # import our data source
        DataSource = __import__('PYME.IO.DataSources.' + dataSourceModule,
                                fromlist=['PYME', 'io', 'DataSources']).DataSource

    self.ds = DataSource(self.dataSourceID)

    # set up results file:
    logging.debug('resultsURI: ' + self.resultsURI)
    clusterResults.fileResults(self.resultsURI + '/MetaData', metadata)
    clusterResults.fileResults(self.resultsURI + '/Events', self.ds.getEvents())

    # set up metadata file which is used for deciding how to launch the
    # analysis. Encode to bytes, as every other put_file call on metadata
    # JSON in this file does.
    clusterIO.put_file(resultsMDFilename, self.mdh.to_JSON().encode(), serverfilter=serverfilter)

    # NB - the wait for clusterIO caches to clear (to avoid replicating the
    # results file) was moved inside the polling thread so launches run quicker.

    self.currentFrameNum = startAt

    self._task_template = None

    self.doPoll = True

    self.pollT = threading.Thread(target=self._updatePoll)
    self.pollT.start()
def test_put():
    """A file put on the cluster should be retrievable with identical contents."""
    payload = b'foo bar\n'
    target = '_testing/test.txt'

    clusterIO.put_file(target, payload, 'TES1')
    round_tripped = clusterIO.get_file(target, 'TES1')

    assert payload == round_tripped
def _save(self, filename, data):
    """
    Save ``data`` to the cluster at ``filename``.

    The data is cast to float32, serialised with ``PZFFormat.dumps`` and
    written with ``clusterIO.put_file``.
    """
    from PYME.IO import clusterIO, PZFFormat

    as_float32 = data.astype('float32')
    serialised = PZFFormat.dumps(as_float32)
    clusterIO.put_file(filename, serialised)
def distributed_pyramid(out_folder, ds, xm, ym, mdh, split=False, skipMoveFrames=False,
                        shiftfield=None, mixmatrix=[[1., 0.], [0., 1.]],
                        correlate=False, dark=None, flat=None, pyramid_tile_size=256):
    """Create a distributed pyramid through PYMECluster.

    Parameters
    ----------
    out_folder : str
        directory to save pyramid tiles(/directories). The same folder will be
        created on the cluster servers.
    ds : PYME.IO.DataSources.BaseDataSource, np.ndarray
        array-like image
    xm : np.ndarray or PYME.Analysis.piecewiseMapping.piecewiseMap
        x positions of frames in ds. Raw stage positions in [um]. ImagePyramid
        origin will be at minimum x, and offset to camera chip origin will be
        handled in SupertileDatasource tile_coords_um method. Not modified.
    ym : np.ndarray or PYME.Analysis.piecewiseMapping.piecewiseMap
        y positions of frames in ds. Raw stage positions in [um]. ImagePyramid
        origin will be at minimum y, and offset to camera chip origin will be
        handled in SupertileDatasource tile_coords_um method. Not modified.
    mdh : PYME.IO.MetaDataHandler.MDataHandlerBase
        metadata for ds
    split : bool, optional
        whether this is a splitter datasource and should be treated like one,
        by default False
    skipMoveFrames : bool, optional
        flag to drop frames which are the first frame acquired at a given
        position, by default False
    shiftfield : [type], optional
        required for splitter data, see PYME.Acquire.Hardware.splitter, by
        default None
    mixmatrix : list, optional
        for splitter data, see PYME.Acquire.Hardware.splitter, by default
        [[1., 0.], [0., 1.]]. Only passed through (never mutated) here.
    correlate : bool, optional
        whether to add a 300 pixel padding to the edges, by default False
    dark : ndarray, float, optional
        (appropriately-cropped or scalar) dark frame (analog-digital offset)
        calibration to subtract when adding frames to the pyramid, by default
        None, in which case Camera.ADOffset from metadata will be used, if
        available
    flat : ndarray, optional
        (appropriately-cropped or scalar) flatfield calibration to apply to
        frames when adding them to the pyramid, by default None
    pyramid_tile_size : int, optional
        base tile size, by default 256 pixels

    Returns
    -------
    DistributedImagePyramid
        coalesced/averaged/etc multilevel DistributedImagePyramid instance

    Notes
    -----
    Code is currently somewhat alpha in that the splitter functionality is
    more or less untested, and we only get tile orientations right for primary
    cameras (i.e. when the stage is registered with multipliers to match the
    camera, rather than camera registered with orientation metadata to match
    it to the stage).

    TODO - this largely duplicates the corresponding function in tile_pyramid
    => refactor
    """
    numFrames = ds.shape[2]

    if split:
        from PYME.Acquire.Hardware import splitter
        unmux = splitter.Unmixer(shiftfield, mdh.voxelsize_nm.x)

    # x & y positions of each frame
    xps = xm(np.arange(numFrames)) if not isinstance(xm, np.ndarray) else xm
    yps = ym(np.arange(numFrames)) if not isinstance(ym, np.ndarray) else ym

    # give some room at the edges
    bufSize = 300 if correlate else 0

    # to avoid building extra, empty tiles, the pyramid origin is the minimum
    # x and y position present in the tiles
    x0_pyramid, y0_pyramid = xps.min(), yps.min()
    # subtract out-of-place: xps/yps may be the caller's own xm/ym arrays, and
    # an in-place `-=` would silently shift the caller's data
    xps = xps - x0_pyramid
    yps = yps - y0_pyramid

    # calculate origin independent of the camera ROI setting to store in
    # metadata for use in e.g. SupertileDatasource.DataSource.tile_coords_um
    x0_cam, y0_cam = get_camera_physical_roi_origin(mdh)
    x0 = x0_pyramid + mdh.voxelsize_nm.x / 1e3 * x0_cam
    y0 = y0_pyramid + mdh.voxelsize_nm.y / 1e3 * y0_cam

    # convert to pixels
    xdp = (bufSize + (xps / (mdh.getEntry('voxelsize.x'))).round()).astype('i')
    ydp = (bufSize + (yps / (mdh.getEntry('voxelsize.y'))).round()).astype('i')

    # get splitter ROI coordinates in units of pixels
    ROIX1 = x0_cam + 1  # TODO - is splitter 1-indexed?
    ROIY1 = y0_cam + 1
    ROIX2 = ROIX1 + mdh.getEntry('Camera.ROIWidth')
    ROIY2 = ROIY1 + mdh.getEntry('Camera.ROIHeight')

    if dark is None:
        dark = float(mdh.getOrDefault('Camera.ADOffset', 0))

    P = DistributedImagePyramid(
        out_folder, pyramid_tile_size, x0=x0, y0=y0,
        pixel_size=mdh.getEntry('voxelsize.x'),
    )

    logger.debug('Updating base tiles ...')

    t1 = time.time()
    for i in range(int(mdh.getEntry('Protocol.DataStartsAt')), numFrames):
        # a frame whose x position differs from the previous frame's is
        # treated as a "move" frame and dropped when skipMoveFrames is set
        if xdp[i - 1] == xdp[i] or not skipMoveFrames:
            x_i = xdp[i]
            y_i = ydp[i]
            d = ds[:, :, i].astype('f') - dark
            if flat is not None:
                d = d * flat

            if split:
                d = np.concatenate(
                    unmux.Unmix(d, mixmatrix, dark, [ROIX1, ROIY1, ROIX2, ROIY2]), 2)

            # TODO - account for orientation so this works for non-primary cams
            P.update_base_tiles_from_frame(x_i, y_i, d)

    P.finish_base_tiles()

    t2 = time.time()
    logger.debug('Updated base tiles in %fs' % (t2 - t1))
    logger.debug(time.time() - t2)
    logger.debug('Updating pyramid ...')
    P.update_pyramid()  # TODO: make cluster-aware
    logger.debug(time.time() - t2)
    logger.debug('Done')

    # put_file payloads are bytes elsewhere in this file, so encode the JSON
    clusterIO.put_file('/'.join([P.base_dir, 'metadata.json']), P.mdh.to_JSON().encode())

    return P