def __init__(self, node_chain_spec, parameter_setting, rel_dataset_dir,
             run, split, storage_format, result_dataset_directory,
             store_node_chain=False, hide_parameters=None):
    """Set up a process that runs one parameterized node chain.

    Parameters:
        node_chain_spec: node chain specification; the values of
            ``parameter_setting`` are substituted into it via
            ``replace_parameters2`` before the chain is instantiated.
        parameter_setting: mapping of parameter names to values. A pseudo
            entry ``"__PREPARE_OPERATION__"`` is removed from the stored
            copy (the caller's mapping is left untouched).
        rel_dataset_dir: stored unchanged on the instance.
        run: run identifier; used to name the persistency directory.
        split: assigned to ``current_split`` of every node in the chain.
        storage_format: stored unchanged on the instance.
        result_dataset_directory: directory below which
            ``persistency_run<run>`` is created.
        store_node_chain: stored unchanged on the instance.
        hide_parameters: stored on the instance; ``None`` (the default)
            yields a fresh empty list per instance.

    Side effects: creates the persistency directory and writes
    ``pySPACE.configuration.min_log_level``.
    """
    super(NodeChainProcess, self).__init__()

    self.node_chain_spec = node_chain_spec
    self.parameter_setting = parameter_setting
    self.rel_dataset_dir = rel_dataset_dir
    self.storage = pySPACE.configuration.storage
    self.run = run
    self.storage_format = storage_format
    self.result_dataset_directory = result_dataset_directory
    self.persistency_dir = os.sep.join(
        [result_dataset_directory, "persistency_run%s" % run])
    create_directory(self.persistency_dir)
    self.store_node_chain = store_node_chain
    # A list literal as default would be shared by every instance that
    # relies on the default, so the empty list is created per call.
    self.hide_parameters = [] if hide_parameters is None else hide_parameters

    # Reduce the log level for process creation.
    # NOTE(review): the configured levels are strings that are passed to
    # eval() (presumably something like "logging.INFO" - confirm). eval on
    # configuration values executes arbitrary code; a name lookup on the
    # logging module would be safer.
    try:
        if hasattr(pySPACE.configuration, "console_log_level"):
            console_log_level = eval(pySPACE.configuration.console_log_level)
        else:
            console_log_level = logging.WARNING
    except (AttributeError, NameError):
        console_log_level = logging.WARNING
    try:
        if hasattr(pySPACE.configuration, "file_log_level"):
            file_log_level = eval(pySPACE.configuration.file_log_level)
        else:
            file_log_level = logging.INFO
    except (AttributeError, NameError):
        file_log_level = logging.INFO

    self.min_log_level = min(console_log_level, file_log_level)
    pySPACE.configuration.min_log_level = self.min_log_level

    # Replace parameters in the spec
    self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                               self.parameter_setting)

    # Create the node chain and tell every node which split it works on
    self.node_chain = NodeChainFactory.flow_from_yaml(
        Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)
    for node in self.node_chain:
        node.current_split = split

    # Remove the pseudo parameter "__PREPARE_OPERATION__"; work on a deep
    # copy so the mapping handed in by the caller is not modified.
    if "__PREPARE_OPERATION__" in self.parameter_setting:
        self.parameter_setting = copy.deepcopy(self.parameter_setting)
        self.parameter_setting.pop("__PREPARE_OPERATION__")
def test_dataflow_from_yaml(self):
    """Build a four-node chain from a YAML string and verify its content.

    Checks the type and length of the created chain, the class of every
    node, and that the parameters given in the YAML arrived at the nodes.
    """
    simpleYAMLInput = """
- node : Time_Series_Source
- node : Detrending
  parameters :
      detrend_method : "eval(__import__('pylab').detrend_mean)"
- node : Subsampling
  parameters :
      target_frequency : 100.0
- node : CSP
  parameters :
      retained_channels : 4
"""
    flow = NodeChainFactory.flow_from_yaml(NodeChain, simpleYAMLInput)

    # One assertion per condition (instead of the deprecated ``assert_``
    # alias on a compound expression) so a failure names what broke.
    self.assertTrue(isinstance(flow, NodeChain))
    self.assertEqual(len(flow), 4)

    expected_classes = (TimeSeriesSourceNode, DetrendingNode,
                        SubsamplingNode, CSPNode)
    for position, node_class in enumerate(expected_classes):
        self.assertTrue(
            isinstance(flow[position], node_class),
            "node %d is not a %s" % (position, node_class.__name__))

    self.assertEqual(flow[1].detrend_method, pylab.detrend_mean)
    self.assertEqual(flow[2].target_frequency, 100.0)
    self.assertEqual(flow[3].retained_channels, 4)
def test_dataflow_from_yaml(self):
    """Create a node chain from YAML and check nodes and parameters.

    The YAML describes four nodes; the test verifies the resulting chain
    type, its length, each node's class and the parameter values set on
    the nodes.
    """
    simpleYAMLInput = """
- node : Time_Series_Source
- node : Detrending
  parameters :
      detrend_method : "eval(__import__('pylab').detrend_mean)"
- node : Subsampling
  parameters :
      target_frequency : 100.0
- node : CSP
  parameters :
      retained_channels : 4
"""
    flow = NodeChainFactory.flow_from_yaml(NodeChain, simpleYAMLInput)

    # ``assert_`` is a deprecated alias of ``assertTrue``; the checks are
    # also split up so that a failure reports the exact condition.
    self.assertTrue(isinstance(flow, NodeChain))
    self.assertEqual(len(flow), 4)

    self.assertTrue(isinstance(flow[0], TimeSeriesSourceNode))
    self.assertTrue(isinstance(flow[1], DetrendingNode))
    self.assertTrue(isinstance(flow[2], SubsamplingNode))
    self.assertTrue(isinstance(flow[3], CSPNode))

    self.assertEqual(flow[1].detrend_method, pylab.detrend_mean)
    self.assertEqual(flow[2].target_frequency, 100.0)
    self.assertEqual(flow[3].retained_channels, 4)
def __init__(self, node_chain_spec, parameter_setting, rel_dataset_dir,
             run, split, storage_format, result_dataset_directory,
             store_node_chain=False, hide_parameters=None):
    """Initialize the process and instantiate its node chain.

    Parameters:
        node_chain_spec: specification of the node chain; parameters from
            ``parameter_setting`` are inserted with ``replace_parameters2``.
        parameter_setting: mapping of parameter names to values; the
            pseudo parameter ``"__PREPARE_OPERATION__"`` is dropped from
            the copy kept on the instance.
        rel_dataset_dir: kept on the instance as given.
        run: run identifier, part of the persistency directory name.
        split: written to ``current_split`` of each node of the chain.
        storage_format: kept on the instance as given.
        result_dataset_directory: parent directory of the
            ``persistency_run<run>`` directory that is created here.
        store_node_chain: kept on the instance as given.
        hide_parameters: kept on the instance; when omitted (``None``)
            a new empty list is used.

    Side effects: the persistency directory is created and
    ``pySPACE.configuration.min_log_level`` is overwritten.
    """
    super(NodeChainProcess, self).__init__()

    self.node_chain_spec = node_chain_spec
    self.parameter_setting = parameter_setting
    self.rel_dataset_dir = rel_dataset_dir
    self.storage = pySPACE.configuration.storage
    self.run = run
    self.storage_format = storage_format
    self.result_dataset_directory = result_dataset_directory
    self.persistency_dir = os.sep.join(
        [result_dataset_directory, "persistency_run%s" % run])
    create_directory(self.persistency_dir)
    self.store_node_chain = store_node_chain
    # Avoid a mutable default argument: each instance gets its own list.
    if hide_parameters is None:
        hide_parameters = []
    self.hide_parameters = hide_parameters

    # reduce_log_level for process creation
    # NOTE(review): eval() runs whatever string the configuration holds;
    # acceptable only while the configuration is trusted. Presumably the
    # values look like "logging.WARNING" - confirm before replacing.
    configuration = pySPACE.configuration
    try:
        console_log_level = eval(configuration.console_log_level) \
            if hasattr(configuration, "console_log_level") \
            else logging.WARNING
    except (AttributeError, NameError):
        console_log_level = logging.WARNING
    try:
        file_log_level = eval(configuration.file_log_level) \
            if hasattr(configuration, "file_log_level") \
            else logging.INFO
    except (AttributeError, NameError):
        file_log_level = logging.INFO

    self.min_log_level = min(console_log_level, file_log_level)
    pySPACE.configuration.min_log_level = self.min_log_level

    # Replace parameters in spec file
    self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                               self.parameter_setting)

    # Create node chain
    self.node_chain = NodeChainFactory.flow_from_yaml(
        Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)
    for node in self.node_chain:
        node.current_split = split

    # Remove pseudo parameter "__PREPARE_OPERATION__" from a deep copy so
    # that the caller's mapping stays as it was.
    if "__PREPARE_OPERATION__" in self.parameter_setting:
        self.parameter_setting = copy.deepcopy(self.parameter_setting)
        self.parameter_setting.pop("__PREPARE_OPERATION__")
def prepare_adaptation(self, adaptation_files, datasets):
    """Prepare the threshold adaptation.

    For every entry of ``self.datasets`` that defines a
    ``"threshold_adaptation_flow"`` the flow path is resolved against the
    entry's ``spec_dir``, an activity flag and a queue are created, and
    the node chain is built with a generator that reads windows from the
    queue until a ``None`` item arrives.

    Parameters:
        adaptation_files: a single data set or an iterable of data sets;
            stored as ``self.adaptation_data`` (always as a sequence).
        datasets: NOTE(review): not used in this method, which works on
            ``self.datasets`` instead - confirm this is intended.

    Returns:
        0 after the preparation finished.
    """
    online_logger.info("Preparing Adaptation")
    online_logger.info("adaptation files:" + str(adaptation_files))

    for key in self.datasets.keys():
        dataset = self.datasets[key]
        if "threshold_adaptation_flow" in dataset:
            spec_base = dataset["configuration"].spec_dir
            dataset["threshold_adaptation_flow"] = os.path.join(
                spec_base, dataset["threshold_adaptation_flow"])
            online_logger.info(
                "windower_spec_path:"
                + dataset["windower_spec_threshold_adaptation"])
            online_logger.info(
                "dataflow_spec_" + key + ":"
                + dataset["threshold_adaptation_flow"])
            self.adaptation_active_potential[key] = multiprocessing.Value(
                'b', False)

    # check if multiple datasets are given for adaptation
    if hasattr(adaptation_files, '__iter__'):
        self.adaptation_data = adaptation_files
        online_logger.debug("Using multiple data sets:"
                            + str(self.adaptation_data))
    else:
        self.adaptation_data = [adaptation_files]

    # Adaptation is done in separate threads, we send the time series
    # windows to these threads via queues (one per dataset key)
    online_logger.info("Initializing Queues")
    for key in self.datasets.keys():
        self.queue[key] = multiprocessing.Queue()

    online_logger.info("Creating flows")

    def flow_generator(key):
        """Yield the windows queued for ``key`` until a None item is read."""
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            # Identity check: "== None" would invoke the window's own
            # __eq__, which need not return a plain boolean.
            if window is None:
                break
            yield window

    # Create the actual data flows for S1 vs P3 discrimination
    # and S1 vs LRP discrimination
    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            # The context manager closes the spec file again; the former
            # file(...) handle was never closed.
            with open(self.datasets[key]["threshold_adaptation_flow"]) \
                    as spec_file:
                self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=spec_file)
            self.aBRI_flow[key][0].set_generator(flow_generator(key))

    online_logger.info("threshold adaptation preparations finished")
    return 0
def prepare_adaptation(self, adaptation_files, datasets,
                       nullmarker_stride_ms=None):
    """Prepare the threshold adaptation.

    Resolves the ``"threshold_adaptation_flow"`` path of every entry in
    ``self.datasets`` that defines one, creates an activity flag and a
    queue per entry, and builds the node chains. Each chain's first node
    is fed by a generator that reads windows from the matching queue
    until a ``None`` item is received.

    Parameters:
        adaptation_files: a single data set or an iterable of data sets;
            stored as ``self.adaptation_data`` (always as a sequence).
        datasets: NOTE(review): not used in this method, which works on
            ``self.datasets`` instead - confirm this is intended.
        nullmarker_stride_ms: stored as ``self.nullmarker_stride_ms``;
            a warning is logged when it is ``None``.

    Returns:
        0 after the preparation finished.
    """
    online_logger.info("Preparing Adaptation")
    online_logger.info("adaptation files:" + str(adaptation_files))

    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in your parameter file.'
            % self.nullmarker_stride_ms)
    else:
        online_logger.info(
            'Nullmarker stride interval is set to %s ms'
            % self.nullmarker_stride_ms)

    for key in self.datasets.keys():
        dataset = self.datasets[key]
        if "threshold_adaptation_flow" in dataset:
            spec_base = dataset["configuration"].spec_dir
            dataset["threshold_adaptation_flow"] = os.path.join(
                spec_base, dataset["threshold_adaptation_flow"])
            online_logger.info(
                "windower_spec_path:"
                + dataset["windower_spec_threshold_adaptation"])
            online_logger.info(
                "dataflow_spec_" + key + ":"
                + dataset["threshold_adaptation_flow"])
            self.adaptation_active_potential[key] = multiprocessing.Value(
                'b', False)

    # check if multiple datasets are given for adaptation
    if hasattr(adaptation_files, '__iter__'):
        self.adaptation_data = adaptation_files
        online_logger.debug("Using multiple data sets:"
                            + str(self.adaptation_data))
    else:
        self.adaptation_data = [adaptation_files]

    # Adaptation is done in separate threads, we send the time series
    # windows to these threads via queues (one per dataset key)
    online_logger.info("Initializing Queues")
    for key in self.datasets.keys():
        self.queue[key] = multiprocessing.Queue()

    online_logger.info("Creating flows")

    def flow_generator(key):
        """Yield the windows queued for ``key`` until a None item is read."""
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            # Identity check: "== None" would invoke the window's own
            # __eq__, which need not return a plain boolean.
            if window is None:
                break
            yield window

    # Create the actual data flows for S1 vs P3 discrimination
    # and S1 vs LRP discrimination
    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            # Close the spec file once the chain is built; the former
            # file(...) handle stayed open.
            with open(self.datasets[key]["threshold_adaptation_flow"]) \
                    as spec_file:
                self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=spec_file)
            self.aBRI_flow[key][0].set_generator(flow_generator(key))

    online_logger.info("threshold adaptation preparations finished")
    return 0
def prepare_training(self, training_files, potentials, operation,
                     nullmarker_stride_ms=None):
    """Prepare pyspace live for training.

    Creates the node chains from the dataflow specs of every potential
    and configures them according to ``operation``:

    * ``"train"``: build the chain from ``"node_chain"`` and feed it from
      the potential's queue; afterwards the flow storage is recreated.
    * ``"prewindowing"`` / ``"prewindowing_offline"``: build the chain
      from ``"prewindowing_flow"`` and feed it from the queue; for
      ``"prewindowing"`` a window stream is requested as well. Afterwards
      prewindowed data older than 12 hours and the concatenated training
      data are deleted.
    * ``"prewindowed_train"``: build the chain from
      ``"postprocess_flow"``, concatenate all prewindowed data sets of
      the potential into ``all_train_data`` and use that as input.

    Parameters:
        training_files: one training file or a list of them; stored as a
            list in ``self.training_data``.
        potentials: mapping from potential name to its settings; stored
            as ``self.potentials`` and updated in place with resolved
            spec paths.
        operation: one of the operation names listed above.
        nullmarker_stride_ms: stored on the instance and passed on when a
            window stream is requested; a warning is logged if ``None``.

    Returns:
        0 after the preparation finished.

    Raises:
        Exception: if no prewindowed data exists for a potential in
            ``"prewindowed_train"`` mode.
        ValueError: if ``operation`` is unknown and there is at least one
            potential to prepare.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in your parameter file.'
            % self.nullmarker_stride_ms)
    else:
        online_logger.info(
            'Nullmarker stride interval is set to %s ms '
            % self.nullmarker_stride_ms)

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        potential = self.potentials[key]
        spec_base = potential["configuration"].spec_dir
        if self.operation == "train":
            potential["node_chain"] = os.path.join(
                spec_base, potential["node_chain"])
            online_logger.info("node_chain_spec:" + potential["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            potential["prewindowing_flow"] = os.path.join(
                spec_base, potential["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: "
                               + potential["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            potential["postprocess_flow"] = os.path.join(
                spec_base, potential["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: "
                               + potential["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these processes via queues (one per potential)
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """Yield the windows queued for ``key`` until a None item is read."""
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            # Identity check: "== None" would invoke the window's own
            # __eq__, which need not return a plain boolean.
            if window is None:
                break
            yield window

    def load_node_chain(spec_path):
        """Build a NodeChain from the YAML file at ``spec_path``."""
        # The context manager closes the file; file(...) leaked it.
        with open(spec_path) as spec_file:
            return NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain, flow_spec=spec_file)

    # Create the actual data flows
    for key in self.potentials.keys():
        potential = self.potentials[key]
        # Look up the spec directory of *this* potential; previously the
        # value left over from the last iteration of the loop above was
        # reused for every key.
        spec_base = potential["configuration"].spec_dir

        if self.operation == "train":
            spec_path = potential["node_chain"]
            self.node_chains[key] = load_node_chain(spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            spec_path = potential["prewindowing_flow"]
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(spec_path))
            self.node_chains[key] = load_node_chain(spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation == "prewindowed_train":
            spec_path = potential["postprocess_flow"]
            self.node_chains[key] = load_node_chain(spec_path)
            # Always False here, so the marker replacement below is
            # currently inactive in this method.
            replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if not prewindowed_sets:
                message = "Couldn't find data, please do prewindowing first!"
                online_logger.error(message)
                raise Exception(message)

            online_logger.info("concatenating prewindowed data from "
                               + str(prewindowed_sets))
            last_set_index = len(prewindowed_sets) - 1
            for set_index, set_path in enumerate(prewindowed_sets):
                collection = BaseDataset.load(set_path)
                data = collection.get_data(0, 0, "train")
                for sample_index, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        # (iterate over a copy of the keys because entries
                        # are deleted inside the loop)
                        for marker in list(sample.marker_name.keys()):
                            # find '{S,s} 8' or '{S,s} 9'
                            # NOTE(review): the class [8,9] also accepts
                            # a literal comma.
                            m = re.match(r"^s\s{0,2}[8,9]{1}$", marker,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    "remove %s from %d %d"
                                    % (m.group(), set_index, sample_index))
                                del sample.marker_name[m.group()]

                        if set_index == last_set_index and \
                                sample_index == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info(
                                "added endmarker" + str(set_index)
                                + " " + str(sample_index))

                        if set_index == 0 and sample_index == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info(
                                "added startmarker" + str(set_index)
                                + " " + str(sample_index))

                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at "
                               + final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])
        else:
            # Previously this case surfaced as an unbound-local error
            # a few lines further down.
            raise ValueError("unknown operation: %r" % (self.operation,))

        # create window_stream for every potential
        # (equality test: 'in ("prewindowing")' was a substring check on a
        # plain string, not a tuple membership test)
        if self.operation == "prewindowing":
            window_spec_file = os.path.join(
                spec_base, "node_chains", "windower",
                potential["windower_spec_path_train"])
            self.window_stream[key] = \
                self.stream_manager.request_window_stream(
                    window_spec_file,
                    nullmarker_stride_ms=self.nullmarker_stride_ms)

        # NOTE(review): yaml.load without an explicit Loader may construct
        # arbitrary Python objects and newer PyYAML releases require the
        # Loader argument; only safe while the spec files are trusted.
        with open(spec_path) as flow:
            self.node_chain_definitions[key] = yaml.load(flow)

    # TODO: check if the prewindowing flow is still needed when using the
    # stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            # best effort; no longer swallows KeyboardInterrupt/SystemExit
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        "recursively deleting files in '%s'" % f)
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception as error:
                        # TODO: find a smart solution for this!
                        # dir was probably already deleted..
                        online_logger.debug(
                            "could not delete '%s': %s" % (f, error))

            concatenated_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            if os.path.exists(concatenated_path):
                shutil.rmtree(concatenated_path)
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
def prepare_training(self, training_files, potentials, operation):
    """Prepare pyspace live for training.

    Creates the node chains from the dataflow specs of every potential
    and configures them according to ``operation``:

    * ``"train"``: build the chain from ``"node_chain"`` and feed it from
      the potential's queue; afterwards the flow storage is recreated.
    * ``"prewindowing"`` / ``"prewindowing_offline"``: build the chain
      from the prewindowing flow and feed it from the queue; afterwards
      prewindowed data older than 12 hours and the concatenated training
      data are deleted.
    * ``"prewindowed_train"``: build the chain from the postprocess flow,
      concatenate all prewindowed data sets of the potential into
      ``all_train_data`` and use that as input.

    A potential whose ``"stream"`` entry equals ``True`` uses the
    ``"stream_prewindowing_flow"`` / ``"stream_postprocess_flow"`` specs,
    gets a windower spec file set on the first node of its chain in
    ``"prewindowed_train"`` mode, and has start and end markers replaced
    while the prewindowed data is concatenated.

    Parameters:
        training_files: one training file or a list of them; stored as a
            list in ``self.training_data``.
        potentials: mapping from potential name to its settings; stored
            as ``self.potentials`` and updated in place with resolved
            spec paths.
        operation: one of the operation names listed above.

    Returns:
        0 after the preparation finished.

    Raises:
        Exception: if no prewindowed data exists for a potential in
            ``"prewindowed_train"`` mode.
        ValueError: if ``operation`` is unknown and there is at least one
            potential to prepare.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation

    def uses_stream(potential):
        """Tell whether the potential's "stream" entry equals True."""
        # get() replaces the Python-2-only has_key(); "== True" is kept so
        # that the accepted values stay exactly the same.
        return potential.get("stream") == True

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        potential = self.potentials[key]
        spec_base = potential["configuration"].spec_dir
        if self.operation == "train":
            potential["node_chain"] = os.path.join(
                spec_base, potential["node_chain"])
            online_logger.info("node_chain_spec:" + potential["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            if uses_stream(potential):
                potential["prewindowing_flow"] = os.path.join(
                    spec_base, potential["stream_prewindowing_flow"])
            else:
                potential["prewindowing_flow"] = os.path.join(
                    spec_base, potential["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: "
                               + potential["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if uses_stream(potential):
                potential["postprocess_flow"] = os.path.join(
                    spec_base, potential["stream_postprocess_flow"])
            else:
                potential["postprocess_flow"] = os.path.join(
                    spec_base, potential["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: "
                               + potential["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these processes via queues (one per potential)
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """Yield the windows queued for ``key`` until a None item is read."""
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            # Identity check: "== None" would invoke the window's own
            # __eq__, which need not return a plain boolean.
            if window is None:
                break
            yield window

    def load_node_chain(spec_path):
        """Build a NodeChain from the YAML file at ``spec_path``."""
        # The context manager closes the file; file(...) leaked it.
        with open(spec_path) as spec_file:
            return NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain, flow_spec=spec_file)

    # Create the actual data flows
    for key in self.potentials.keys():
        potential = self.potentials[key]
        # Look up the spec directory of *this* potential; previously the
        # value left over from the last iteration of the loop above was
        # reused for every key.
        spec_base = potential["configuration"].spec_dir

        if self.operation == "train":
            spec_path = potential["node_chain"]
            self.node_chains[key] = load_node_chain(spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            spec_path = potential["prewindowing_flow"]
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(spec_path))
            self.node_chains[key] = load_node_chain(spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation == "prewindowed_train":
            spec_path = potential["postprocess_flow"]
            self.node_chains[key] = load_node_chain(spec_path)
            if uses_stream(potential):
                # create windower
                online_logger.info("Creating Windower")
                online_logger.info(potential["windower_spec_path_train"])
                self.node_chains[key][0].set_windower_spec_file(
                    os.path.join(spec_base, "node_chains", "windower",
                                 potential["windower_spec_path_train"]))
                replace_start_and_end_markers = True
            else:
                replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if not prewindowed_sets:
                message = "Couldn't find data, please do prewindowing first!"
                online_logger.error(message)
                raise Exception(message)

            online_logger.info("concatenating prewindowed data from "
                               + str(prewindowed_sets))
            last_set_index = len(prewindowed_sets) - 1
            for set_index, set_path in enumerate(prewindowed_sets):
                collection = BaseDataset.load(set_path)
                data = collection.get_data(0, 0, "train")
                for sample_index, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        # (iterate over a copy of the keys because entries
                        # are deleted inside the loop)
                        for marker in list(sample.marker_name.keys()):
                            # find '{S,s} 8' or '{S,s} 9'
                            # NOTE(review): the class [8,9] also accepts
                            # a literal comma.
                            m = re.match(r"^s\s{0,2}[8,9]{1}$", marker,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    "remove %s from %d %d"
                                    % (m.group(), set_index, sample_index))
                                del sample.marker_name[m.group()]

                        if set_index == last_set_index and \
                                sample_index == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info(
                                "added endmarker" + str(set_index)
                                + " " + str(sample_index))

                        if set_index == 0 and sample_index == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info(
                                "added startmarker" + str(set_index)
                                + " " + str(sample_index))

                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at "
                               + final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])
        else:
            # Previously this case surfaced as an unbound-local error
            # a few lines further down.
            raise ValueError("unknown operation: %r" % (self.operation,))

        # NOTE(review): yaml.load without an explicit Loader may construct
        # arbitrary Python objects and newer PyYAML releases require the
        # Loader argument; only safe while the spec files are trusted.
        with open(spec_path) as flow:
            self.node_chain_definitions[key] = yaml.load(flow)

    # TODO: check if the prewindowing flow is still needed
    # when using the stream mode!
    # (equality test: 'in ("train")' was a substring check on a plain
    # string, not a tuple membership test)
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            # best effort; no longer swallows KeyboardInterrupt/SystemExit
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        "recursively deleting files in '%s'" % f)
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception as error:
                        # TODO: find a smart solution for this!
                        # dir was probably already deleted..
                        online_logger.debug(
                            "could not delete '%s': %s" % (f, error))

            concatenated_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            if os.path.exists(concatenated_path):
                shutil.rmtree(concatenated_path)
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0