Example #1
0
File: convEMdata.py  Project: erjel/emdrp
    def __init__(self,
                 data_dir,
                 batch_range=None,
                 init_epoch=1,
                 init_batchnum=None,
                 dp_params=None,
                 test=False):
        """EM data provider built on cuda-convnet's LabeledDataProvider.

        Lazily constructs a single class-level ``EMDataParser`` that is
        shared by every provider instance (train and test providers reuse
        the same parser).  On the first instantiation ``data_dir`` must be
        the full path / file name of the EMDataParser config file.

        Args:
            data_dir: path to the EMDataParser .ini config file.
            batch_range: inclusive range of batch numbers to serve.
            init_epoch: epoch to resume from.
            init_batchnum: batch number to resume from.
            dp_params: dict of data-provider params; must contain the
                'convnet' model object plus 'em_feature_path',
                'init_load_path' and 'save_name' entries.
            test: True when this provider serves the test set.
        """
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch,
                                     init_batchnum, dp_params, test)

        # The parser is shared across all provider instances; build it only
        # once.  Use identity comparison with None (PEP 8) rather than ==.
        if EMDataProvider.data_parser is None:
            # data_dir needs to be the full path / file name of the
            # EMDataParser config file.
            assert os.path.isfile(data_dir)

            # if the convnet is writing features and an em feature path is
            # provided then also have the parser write outputs
            EMDataProvider.write_features = dp_params['convnet'].op.get_value(
                'write_features')
            write_outputs = False
            append_features = False  # modes exposed in init by the EMDataParser class
            if EMDataProvider.write_features:
                if dp_params['em_feature_path']:
                    # this command line flag along with write_features enables
                    # writing output probabilities
                    EMDataProvider.write_features_type = 'prob'
                    # if the em_feature_path is an hdf file name, then this is
                    # a single whole-dataset hdf5 file
                    fn, ext = os.path.splitext(dp_params['em_feature_path'])
                    ext = ext.lower()
                    append_features = (ext == '.h5' or ext == '.hdf5')
                    # if not appending features, then just do normal write outputs
                    write_outputs = not append_features
                else:
                    # if em_feature_path is not specified, then this mode is
                    # for initializing data pre-processing
                    EMDataProvider.write_features_type = 'data'
                    assert (dp_params['convnet'].op.get_value('numpy_dump'))

            # instantiate the parser, override some attributes and then initialize
            EMDataProvider.data_parser = EMDataParser(
                data_dir, write_outputs, dp_params['init_load_path'],
                dp_params['save_name'], append_features,
                dp_params['convnet'].op.get_value('chunk_skip_list'),
                dp_params['convnet'].op.get_value('dim_ordering'))
            # if writing any features, override the outpath and force no label lookup
            if EMDataProvider.write_features:
                EMDataProvider.data_parser.outpath = dp_params[
                    'em_feature_path']
                EMDataProvider.data_parser.no_label_lookup = True
            EMDataProvider.data_parser.initBatches()
        # per-instance views of the shared parser state
        self.batch_meta = EMDataProvider.data_parser.batch_meta
        self.batches_generated = 0
Example #2
0
    def __init__(self,
                 cfg_file,
                 write_output=None,
                 chunk_skip_list=None,
                 dim_ordering='',
                 batch_range=None,
                 name='emdata',
                 isTest=False,
                 concatenate_batches=False,
                 NBUF=2,
                 image_in_size=None):
        """Threaded EM data iterator: loads EM batches in a background thread.

        Args:
            cfg_file: path to the EMDataParser config file.
            write_output: output path / file name; when it ends in
                .h5/.hdf5/.conf features are appended to a whole-dataset
                file (or knossos-style .conf output) instead of normal
                write-outputs mode.  None disables output writing.
            chunk_skip_list: chunks to skip, forwarded to EMDataParser
                (defaults to an empty list).
            dim_ordering: dimension ordering string for EMDataParser.
            batch_range: inclusive [first, last] batch numbers, 1-based
                (defaults to [1, 10]).
            name: thread name.
            isTest: True when serving the test set.
            concatenate_batches: serve all macrobatches as one batch.
            NBUF: number of backend buffers (must be > 0).
            image_in_size: network input image size override.
        """
        Thread.__init__(self)
        self.name = name

        # NOTE: mutable defaults ([] and [1, 10]) are replaced with None
        # sentinels so the defaults are not shared across instances.
        if chunk_skip_list is None: chunk_skip_list = []
        if batch_range is None: batch_range = [1, 10]

        # mostly intended for double buffering (NBUF==2) so that data can be
        # pushed to card simultaneous with training.
        # single buffer (NBUF==1) fetches next EM batch in parallel but waits
        # until __iter__ to push to backend buffer.
        # more buffers should work (NBUF > 2) but takes more gpu memory and
        # likely no speed improvement
        assert (NBUF > 0)
        self.NBUF = NBUF

        # batches are numbered starting at 1 and inclusive of end of range.
        # this needs to be done first so that nmacrobatches property works.
        self.batch_range = batch_range
        self.batchnum = batch_range[0]

        # previously parser was agnostic to test or train, but needed it for
        # allowing single ini in chunk_list_all mode
        self.isTest = isTest

        # if the output is an hdf file name, then this is a single
        # whole-dataset hdf5 file.
        # xxx - initializations for writing output features could be cleaned up.
        write_outputs = (write_output is not None)
        append_features = False
        if write_outputs:
            fn, ext = os.path.splitext(write_output)
            ext = ext.lower()
            # .conf indicates to write knossos-style outputs
            append_features = (ext == '.h5' or ext == '.hdf5'
                               or ext == '.conf')
            write_outputs = not append_features
        # instantiate the actual em data parser, code shared with
        # cuda-convnets2 em data parser
        self.parser = EMDataParser(cfg_file,
                                   write_outputs=write_outputs,
                                   append_features=append_features,
                                   chunk_skip_list=chunk_skip_list,
                                   dim_ordering=dim_ordering,
                                   isTest=self.isTest,
                                   image_in_size=image_in_size)
        if write_outputs or append_features:
            # force some properties if in mode for writing outputs.
            # xxx - this is not clean, needs some rethinking on how
            #   write_outputs modes are initialized
            self.parser.outpath = write_output
            self.parser.no_label_lookup = True
            self.parser.append_features_knossos = append_features and (
                ext == '.conf')
            if self.parser.append_features_knossos:
                # knossos mode: output dir is the file's directory and the
                # network id is the first run of digits in the file name
                self.parser.outpath, fn = os.path.split(fn)
                self.parser.strnetid = re.findall(r'\d+', fn)[0]
        # parser relies on having initBatches called right away,
        # xxx - could revisit this?
        self.parser.initBatches()

        # no need for special code to concatenate if there is only one
        # macrobatch anyways
        self.concatenate_batches = concatenate_batches and (self.nmacrobatches
                                                            > 1)

        self.nexamples = self.parser.num_cases_per_batch
        if self.concatenate_batches: self.nexamples *= self.nmacrobatches

        # locks and events for synchronizing data loading thread.
        self.init_event = threading.Event()
        if self.NBUF > 1:
            self.lbuf_lock = threading.Lock()
            self.cbuf_lock = threading.Lock()
            self.lbuf_event = threading.Event()
            self.cbuf_event = threading.Event()
        else:
            self.push_event = threading.Event()
            self.push_done_event = threading.Event()

        # set pycuda driver for gpu backend
        # (self.be is presumably the neon backend set by the framework before
        #  __init__ runs -- TODO confirm against caller)
        # xxx - this is a bit hacky, is there a better way to do this?
        # identity check keeps the original exact-type semantics
        # (subclasses of NervanaGPU intentionally do not match)
        if type(self.be) is NervanaGPU:
            import pycuda.driver as drv
            self.drv = drv
            #self.stream = self.drv.Stream() # xxx - for other synchronize method??? see below
        else:
            self.drv = None

        # start the thread and wait for initialization to complete.
        # initialization of backend memory has to occur within the thread.
        self.daemon = True  # so that stop event is not necessary to terminate threads when process completes.
        self.start()
        self.init_event.wait()