def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
    LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)

    if EMDataProvider.data_parser is None:
        # data_dir needs to be the full path / file name of the EMDataParser config file
        assert os.path.isfile(data_dir)

        # if the convnet is writing features and an em feature path is provided, then also have the parser write outputs
        EMDataProvider.write_features = dp_params['convnet'].op.get_value('write_features')
        # modes exposed in init by the EMDataParser class
        write_outputs = False
        append_features = False
        if EMDataProvider.write_features:
            if dp_params['em_feature_path']:
                # this command line flag along with write_features enables writing output probabilities
                EMDataProvider.write_features_type = 'prob'
                # if em_feature_path is an hdf file name, then this is a single whole-dataset hdf5 file
                fn, ext = os.path.splitext(dp_params['em_feature_path'])
                ext = ext.lower()
                append_features = (ext == '.h5' or ext == '.hdf5')
                # if not appending features, then just do normal write outputs
                write_outputs = not append_features
            else:
                # if em_feature_path is not specified, then this mode is for initializing data pre-processing
                EMDataProvider.write_features_type = 'data'
                assert dp_params['convnet'].op.get_value('numpy_dump')

        # instantiate the parser, override some attributes and then initialize
        EMDataProvider.data_parser = EMDataParser(data_dir, write_outputs, dp_params['init_load_path'],
            dp_params['save_name'], append_features, dp_params['convnet'].op.get_value('chunk_skip_list'),
            dp_params['convnet'].op.get_value('dim_ordering'))

        # if writing any features, override the outpath and force no label lookup
        if EMDataProvider.write_features:
            EMDataProvider.data_parser.outpath = dp_params['em_feature_path']
            EMDataProvider.data_parser.no_label_lookup = True

        EMDataProvider.data_parser.initBatches()

    self.batch_meta = EMDataProvider.data_parser.batch_meta
    self.batches_generated = 0
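# A minimal usage sketch (not part of the original source): in cuda-convnet2 the framework assembles dp_params
# and passes a ConvNet object whose option parser exposes op.get_value(); the _Opts/_Convnet stand-ins below are
# hypothetical and only illustrate the interface this __init__ expects, not the real cuda-convnet2 classes.
class _Opts(object):
    def __init__(self, values):
        self.values = values
    def get_value(self, name):
        return self.values[name]

class _Convnet(object):
    def __init__(self, values):
        self.op = _Opts(values)

dp_params = {'convnet': _Convnet({'write_features': '', 'numpy_dump': False,
                                  'chunk_skip_list': [], 'dim_ordering': ''}),
             'em_feature_path': '', 'init_load_path': '', 'save_name': None}
# data_dir must be the full path to the EMDataParser .ini config file (placeholder path below)
#dp = EMDataProvider('/path/to/EMdata.ini', batch_range=[1, 10], dp_params=dp_params, test=False)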
def __init__(self, cfg_file, write_output=None, chunk_skip_list=[], dim_ordering='', batch_range=[1, 10],
             name='emdata', isTest=False, concatenate_batches=False, NBUF=2, image_in_size=None):
    Thread.__init__(self)
    self.name = name

    # mostly intended for double buffering (NBUF==2) so that data can be pushed to the card simultaneously with training.
    # single buffer (NBUF==1) fetches the next EM batch in parallel but waits until __iter__ to push to the backend buffer.
    # more buffers should work (NBUF > 2) but take more gpu memory and likely give no speed improvement
    assert NBUF > 0
    self.NBUF = NBUF

    # batches are numbered starting at 1 and inclusive of the end of the range.
    # this needs to be done first so that the nmacrobatches property works.
    self.batch_range = batch_range
    self.batchnum = batch_range[0]

    # previously the parser was agnostic to test or train, but needed it for allowing a single ini in chunk_list_all mode
    self.isTest = isTest

    # if the output is an hdf file name, then this is a single whole-dataset hdf5 file.
    # xxx - initializations for writing output features could be cleaned up.
    write_outputs = (write_output is not None)
    append_features = False
    if write_outputs:
        fn, ext = os.path.splitext(write_output)
        ext = ext.lower()
        # .conf indicates to write knossos-style outputs
        append_features = (ext == '.h5' or ext == '.hdf5' or ext == '.conf')
        write_outputs = not append_features

    # instantiate the actual em data parser, code shared with the cuda-convnets2 em data parser
    self.parser = EMDataParser(cfg_file, write_outputs=write_outputs, append_features=append_features,
        chunk_skip_list=chunk_skip_list, dim_ordering=dim_ordering, isTest=self.isTest,
        image_in_size=image_in_size)
    if write_outputs or append_features:
        # force some properties if in a mode for writing outputs.
        # xxx - this is not clean, needs some rethinking on how write_outputs modes are initialized
        self.parser.outpath = write_output
        self.parser.no_label_lookup = True
        self.parser.append_features_knossos = append_features and (ext == '.conf')
        if self.parser.append_features_knossos:
            self.parser.outpath, fn = os.path.split(fn)
            self.parser.strnetid = re.findall(r'\d+', fn)[0]
    # the parser relies on having initBatches called right away, xxx - could revisit this?
    self.parser.initBatches()

    # no need for special code to concatenate if there is only one macrobatch anyway
    self.concatenate_batches = concatenate_batches and (self.nmacrobatches > 1)
    self.nexamples = self.parser.num_cases_per_batch
    if self.concatenate_batches:
        self.nexamples *= self.nmacrobatches

    # locks and events for synchronizing the data loading thread.
    self.init_event = threading.Event()
    if self.NBUF > 1:
        self.lbuf_lock = threading.Lock()
        self.cbuf_lock = threading.Lock()
        self.lbuf_event = threading.Event()
        self.cbuf_event = threading.Event()
    else:
        self.push_event = threading.Event()
        self.push_done_event = threading.Event()

    # set the pycuda driver for the gpu backend
    # xxx - this is a bit hacky, is there a better way to do this?
    if type(self.be) == NervanaGPU:
        import pycuda.driver as drv
        self.drv = drv
        #self.stream = self.drv.Stream()    # xxx - for other synchronize method??? see below
    else:
        self.drv = None

    # start the thread and wait for initialization to complete.
    # initialization of backend memory has to occur within the thread.
    self.daemon = True    # so that a stop event is not necessary to terminate threads when the process completes.
    self.start()
    self.init_event.wait()
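# A minimal usage sketch (not part of the original source): the class name EMDataIterator and the fit()/eval()
# hand-off are assumptions based on how neon data iterators are typically consumed; cfg_file must point to a
# real EMDataParser .ini file, so the construction lines are left commented as placeholders.
#train = EMDataIterator('/path/to/EMdata.ini', batch_range=[1, 20], name='train', NBUF=2)
#test  = EMDataIterator('/path/to/EMdata.ini', batch_range=[21, 24], name='test', isTest=True, NBUF=1)
# each iterator loads macrobatches on its background thread; iterating yields device tensors that can be
# consumed by a neon training loop, e.g. model.fit(train, ...) and model.eval(test, ...)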