def node_from_yaml(layer_spec):
    """ Load the specs and initialize the layer nodes """
    assert("parameters" in layer_spec
           and "class_labels" in layer_spec["parameters"]
           and "node" in layer_spec["parameters"]),\
        "Node requires specification of a node and classification labels!"
    scheme = layer_spec["parameters"].pop("scheme", "1vs1")
    # Create all nodes that are packed together in this layer
    layer_nodes = []
    node_spec = layer_spec["parameters"]["node"][0]
    classes = layer_spec["parameters"]["class_labels"]
    if scheme == '1vR':
        for label in layer_spec["parameters"]["class_labels"]:
            node_obj = BaseNode.node_from_yaml(
                NodeChainFactory.instantiate(node_spec, {"LABEL": label}))
            layer_nodes.append(node_obj)
    else:
        n = len(classes)
        for i in range(n - 1):
            for j in range(i + 1, n):
                replace_dict = {"LABEL1": classes[i], "LABEL2": classes[j]}
                node_obj = BaseNode.node_from_yaml(
                    NodeChainFactory.instantiate(node_spec, replace_dict))
                layer_nodes.append(node_obj)
    layer_spec["parameters"].pop("node")
    layer_spec["parameters"].pop("class_labels")
    # Create the node object
    node_obj = MultiClassLayerNode(nodes=layer_nodes,
                                   **layer_spec["parameters"])
    return node_obj
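For reference, here is a minimal sketch of the kind of layer_spec dictionary this loader expects. Only the key names ("node", "parameters", "class_labels", optional "scheme") and the LABEL/LABEL1/LABEL2 placeholders are taken from the code above; the outer node name, the inner node name ExampleClassifier, and where the placeholders appear inside the template are illustrative assumptions, and in pySPACE such dictionaries would normally come from a parsed YAML spec.

# Hypothetical layer_spec illustrating the keys node_from_yaml reads.
# The inner node template uses the LABEL1/LABEL2 placeholders that the
# default "1vs1" scheme substitutes via NodeChainFactory.instantiate;
# a "1vR" scheme would use a single LABEL placeholder instead.
layer_spec = {
    "node": "MultiClassLayer",              # assumed outer node name
    "parameters": {
        "node": [                           # template for the packed nodes
            {"node": "ExampleClassifier",   # assumed inner node name
             "parameters": {"class_labels": ["LABEL1", "LABEL2"]}},
        ],
        "class_labels": ["rest", "left", "right"],
        "scheme": "1vs1",                   # or "1vR" for one-vs-rest
    },
}

# With three classes and the default 1vs1 scheme, the loader builds one
# node per unordered pair of labels:
classes = layer_spec["parameters"]["class_labels"]
pairs = [(classes[i], classes[j])
         for i in range(len(classes) - 1)
         for j in range(i + 1, len(classes))]
print(pairs)   # [('rest', 'left'), ('rest', 'right'), ('left', 'right')]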
def __init__(self, node_chain_spec, parameter_setting, rel_dataset_dir, run,
             split, storage_format, result_dataset_directory,
             store_node_chain=False, hide_parameters=[]):

    super(NodeChainProcess, self).__init__()

    self.node_chain_spec = node_chain_spec
    self.parameter_setting = parameter_setting
    self.rel_dataset_dir = rel_dataset_dir
    self.storage = pySPACE.configuration.storage
    self.run = run
    self.storage_format = storage_format
    self.result_dataset_directory = result_dataset_directory
    self.persistency_dir = os.sep.join([result_dataset_directory,
                                        "persistency_run%s" % run])
    create_directory(self.persistency_dir)
    self.store_node_chain = store_node_chain
    self.hide_parameters = hide_parameters

    # reduce_log_level for process creation
    try:
        console_log_level = (eval(pySPACE.configuration.console_log_level)
                             if hasattr(pySPACE.configuration,
                                        "console_log_level")
                             else logging.WARNING)
    except (AttributeError, NameError):
        console_log_level = logging.WARNING
    try:
        file_log_level = (eval(pySPACE.configuration.file_log_level)
                          if hasattr(pySPACE.configuration, "file_log_level")
                          else logging.INFO)
    except (AttributeError, NameError):
        file_log_level = logging.INFO

    self.min_log_level = min(console_log_level, file_log_level)
    pySPACE.configuration.min_log_level = self.min_log_level

    # Replace parameters in spec file
    # self.node_chain_spec = replace_parameters_and_convert(
    #     self.node_chain_spec, self.parameter_setting)
    self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                               self.parameter_setting)
    # Create node chain
    self.node_chain = NodeChainFactory.flow_from_yaml(
        Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)

    for node in self.node_chain:
        node.current_split = split

    # Remove pseudo parameter "__PREPARE_OPERATION__"
    if "__PREPARE_OPERATION__" in self.parameter_setting:
        self.parameter_setting = copy.deepcopy(self.parameter_setting)
        self.parameter_setting.pop("__PREPARE_OPERATION__")
def test_dataflow_from_yaml(self):
    simpleYAMLInput = """
    - node : Time_Series_Source
    - node : Detrending
      parameters :
          detrend_method : "eval(__import__('pylab').detrend_mean)"
    - node : Subsampling
      parameters :
          target_frequency : 100.0
    - node : CSP
      parameters :
          retained_channels : 4
    """
    flow = NodeChainFactory.flow_from_yaml(NodeChain, simpleYAMLInput)

    self.assert_(isinstance(flow, NodeChain) and len(flow) == 4)
    self.assert_(isinstance(flow[0], TimeSeriesSourceNode)
                 and isinstance(flow[1], DetrendingNode)
                 and isinstance(flow[2], SubsamplingNode)
                 and isinstance(flow[3], CSPNode))
    self.assert_(flow[1].detrend_method == pylab.detrend_mean)
    self.assert_(flow[2].target_frequency == 100.0)
    self.assert_(flow[3].retained_channels == 4)
def prepare_adaptation(self, adaptation_files, datasets):
    """ Prepares the threshold adaptation. """
    online_logger.info("Preparing Adaptation")
    online_logger.info("adaptation files:" + str(adaptation_files))

    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            spec_base = self.datasets[key]["configuration"].spec_dir
            self.datasets[key]["threshold_adaptation_flow"] = os.path.join(
                spec_base, self.datasets[key]["threshold_adaptation_flow"])
            online_logger.info(
                "windower_spec_path:" +
                self.datasets[key]["windower_spec_threshold_adaptation"])
            online_logger.info(
                "dataflow_spec_" + key + ":" +
                self.datasets[key]["threshold_adaptation_flow"])
            self.adaptation_active_potential[key] = multiprocessing.Value(
                'b', False)

    # start the eeg server
    # check if multiple datasets are given for adaptation
    if hasattr(adaptation_files, '__iter__'):
        self.adaptation_data = adaptation_files
        online_logger.debug("Using multiple data sets:" +
                            str(self.adaptation_data))
    else:
        self.adaptation_data = [adaptation_files]

    # Adaptation is done in separate threads, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.datasets.keys():
        self.queue[key] = multiprocessing.Queue()

    online_logger.info("Creating flows")

    def flow_generator(key):
        """create a generator to yield all the windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows for S1 vs P3 discrimination
    # and S1 vs LRP discrimination
    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(
                    self.datasets[key]["threshold_adaptation_flow"]))
            self.aBRI_flow[key][0].set_generator(flow_generator(key))

    online_logger.info("threshold adaptation preparations finished")
    return 0
def prepare_adaptation(self, adaptation_files, datasets,
                       nullmarker_stride_ms=None):
    """ Prepares the threshold adaptation. """
    online_logger.info("Preparing Adaptation")
    online_logger.info("adaptation files:" + str(adaptation_files))

    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in your '
            'parameter file.' % self.nullmarker_stride_ms)
    else:
        online_logger.info('Nullmarker stride interval is set to %s ms'
                           % self.nullmarker_stride_ms)

    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            spec_base = self.datasets[key]["configuration"].spec_dir
            self.datasets[key]["threshold_adaptation_flow"] = os.path.join(
                spec_base, self.datasets[key]["threshold_adaptation_flow"])
            online_logger.info(
                "windower_spec_path:" +
                self.datasets[key]["windower_spec_threshold_adaptation"])
            online_logger.info(
                "dataflow_spec_" + key + ":" +
                self.datasets[key]["threshold_adaptation_flow"])
            self.adaptation_active_potential[key] = multiprocessing.Value(
                'b', False)

    # start the eeg server
    # check if multiple datasets are given for adaptation
    if hasattr(adaptation_files, '__iter__'):
        self.adaptation_data = adaptation_files
        online_logger.debug("Using multiple data sets:" +
                            str(self.adaptation_data))
    else:
        self.adaptation_data = [adaptation_files]

    # Adaptation is done in separate threads, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.datasets.keys():
        self.queue[key] = multiprocessing.Queue()

    online_logger.info("Creating flows")

    def flow_generator(key):
        """create a generator to yield all the windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows for S1 vs P3 discrimination
    # and S1 vs LRP discrimination
    for key in self.datasets.keys():
        if "threshold_adaptation_flow" in self.datasets[key]:
            self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(
                    self.datasets[key]["threshold_adaptation_flow"]))
            self.aBRI_flow[key][0].set_generator(flow_generator(key))

    online_logger.info("threshold adaptation preparations finished")
    return 0
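The adaptation and training methods above all hand windows to their node chains through a multiprocessing.Queue that a flow_generator drains, with None acting as the end-of-stream sentinel. Below is a minimal, self-contained sketch of that producer/consumer hand-off; the string payloads and the inline producer loop are stand-ins for the time-series windows that the live streaming code would push.

import multiprocessing


def flow_generator(queue):
    """Yield items from the queue until the None sentinel arrives."""
    while True:
        window = queue.get(block=True, timeout=None)
        if window is None:
            break
        yield window


if __name__ == "__main__":
    queue = multiprocessing.Queue()

    # Producer side: whoever receives windows pushes them into the queue
    # and terminates the stream with None, as the code above expects.
    for i in range(3):
        queue.put("window-%d" % i)   # placeholder payloads
    queue.put(None)

    # Consumer side: the source node of the chain iterates the generator.
    for window in flow_generator(queue):
        print(window)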
def prepare_training(self, training_files, potentials, operation,
                     nullmarker_stride_ms=None):
    """ Prepares pyspace live for training.

    Prepares everything for training of pyspace live,
    i.e. creates flows based on the dataflow specs
    and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in your '
            'parameter file.' % self.nullmarker_stride_ms)
    else:
        online_logger.info('Nullmarker stride interval is set to %s ms'
                           % self.nullmarker_stride_ms)

    online_logger.info("Creating flows..")

    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(
                spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:" +
                               self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            self.potentials[key]["prewindowing_flow"] = os.path.join(
                spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " +
                               self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.potentials[key]["postprocess_flow"] = os.path.join(
                spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " +
                               self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info(
                "file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["postprocess_flow"]))
            replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error(
                    "Couldn't find data, please do prewindowing first!")
                raise Exception
            online_logger.info("concatenating prewindowed data from " +
                               str(prewindowed_sets))

            for s, dataset_dir in enumerate(prewindowed_sets):
                collection = BaseDataset.load(dataset_dir)
                data = collection.get_data(0, 0, "train")
                for d, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        for k in sample.marker_name.keys():
                            # find '{S,s} 8' or '{S,s} 9'
                            m = re.match(r"^s\s{0,2}[8,9]{1}$", k,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    str("remove %s from %d %d"
                                        % (m.group(), s, d)))
                                del(sample.marker_name[m.group()])
                        if s == len(prewindowed_sets) - 1 and \
                                d == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info("added endmarker" + str(s) +
                                               " " + str(d))
                        if s == 0 and d == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info("added startmarker" + str(s) +
                                               " " + str(d))
                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at " +
                               final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])

            flow = open(self.potentials[key]["postprocess_flow"])

        # create window_stream for every potential
        if self.operation == "prewindowing":
            window_spec_file = os.path.join(
                spec_base, "node_chains", "windower",
                self.potentials[key]["windower_spec_path_train"])
            self.window_stream[key] = \
                self.stream_manager.request_window_stream(
                    window_spec_file,
                    nullmarker_stride_ms=self.nullmarker_stride_ms)
        elif self.operation == "prewindowing_offline":
            pass
        elif self.operation == "train":
            pass

        self.node_chain_definitions[key] = yaml.load(flow)
        flow.close()

    # TODO: check if the prewindowing flow is still needed
    # when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        str("recursively deleting files in '%s'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception:
                        # TODO: find a smart solution for this!
                        pass  # dir was probably already deleted..

            if os.path.exists(os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")):
                shutil.rmtree(os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data"))
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
def _stop_training(self):
    """ Do the optimization step and define final parameter choice

    This is the main method of this node!

    .. todo:: Allow also parallelization over nominal_ranges!
    """
    self._log("Starting optimization Process.")
    self.runs = [10 * self.run_number + run for run in range(self.runs)]
    original_flow_template = copy.copy(self.flow_template)
    # Fill in validation parameters in the template
    self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
        original_flow_template, self.validation_parameter_settings)
    if self.nom_rng is None:
        self.prepare_optimization()
        self.best_parametrization, self.best_performance = \
            self.get_best_parametrization()
        self.performance_dict[self.p2key(self.best_parametrization)] = \
            (self.best_performance, self.best_parametrization)
    else:
        nom_grid = self.search_grid(self.nom_rng)
        iterations = 0
        search_history = []
        # copy flow_template since we have to instantiate for every nom_par
        flow_template = copy.copy(self.flow_template)
        for nom_par in nom_grid:
            # for getting the best parameterization,
            # the class attribute flow_template must be overwritten
            self.flow_template = \
                NodeChainFactory.replace_parameters_in_node_chain(
                    flow_template, nom_par)
            self.prepare_optimization()
            parametrization, performance = self.get_best_parametrization()
            self.performance_dict[self.p2key(nom_par)] = (performance,
                                                          parametrization)
            iterations += self.iterations
            search_history.append((nom_par, self.search_history))
            # reinitialize optimization parameters
            self.re_init()
        # reconstructing the overwritten flow for further usage
        self.flow_template = flow_template
        self.iterations = iterations
        self.search_history = sorted(
            search_history, key=lambda t: t[1][-1]["best_performance"])
    best_key = max(sorted(self.performance_dict.items()),
                   key=lambda t: t[1])[0]
    self.best_performance, self.best_parametrization = \
        self.performance_dict[best_key]
    self.best_parametrization.update(dict(best_key))
    # when best parameter dict is calculated, this has to be logged
    # or saved and the chosen parameter is used for training on the
    # whole data set, independent of the chosen algorithm
    self._log("Using parameterization %s with optimal performance %s for "
              "metric %s." % (self.best_parametrization,
                              self.best_performance, self.metric))
    # Fill in the final parameters in the flow template
    self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
        original_flow_template, self.final_training_parameter_settings)
    best_flow_template = self.flow_template
    best_flow_template[1] = {'node': 'All_Train_Splitter'}
    # delete last node
    best_flow_template.pop(-1)
    self.flow = self.generate_subflow(best_flow_template,
                                      self.best_parametrization, NodeChain)
    self.flow[-1].set_run_number(self.run_number)
    self.flow[0].set_generator(self.train_instances)
    self.flow.train()
    self._log("Training of optimal flow finished")
    # delete training instances that would be stored to disk if this node
    # is saved
    del self.train_instances
def _stop_training(self):
    """ Do the optimization step and define final parameter choice

    This is the main method of this node!

    .. todo:: Allow also parallelization over nominal_ranges!
    """
    self._log("Starting optimization Process.")
    self.runs = [10 * self.run_number + run for run in range(self.runs)]
    original_flow_template = copy.copy(self.flow_template)
    # Fill in validation parameters in the template
    if self.validation_parameter_settings != {}:
        self.flow_template = [
            NodeChainFactory.instantiate(
                template=node,
                parametrization=self.validation_parameter_settings)
            for node in original_flow_template]
    if self.nom_rng is None:
        self.prepare_optimization()
        self.best_parametrization, self.best_performance = \
            self.get_best_parametrization()
        self.performance_dict[self.p2key(self.best_parametrization)] = \
            (self.best_performance, self.best_parametrization)
    else:
        nom_grid = self.search_grid(self.nom_rng)
        iterations = 0
        search_history = []
        # copy flow_template since we have to instantiate for every nom_par
        flow_template = copy.copy(self.flow_template)
        for nom_par in nom_grid:
            # for getting the best parameterization,
            # the class attribute flow_template must be overwritten
            self.flow_template = [
                NodeChainFactory.instantiate(template=node,
                                             parametrization=nom_par)
                for node in flow_template]
            self.prepare_optimization()
            parametrization, performance = self.get_best_parametrization()
            self.performance_dict[self.p2key(nom_par)] = (performance,
                                                          parametrization)
            iterations += self.iterations
            search_history.append((nom_par, self.search_history))
            # reinitialize optimization parameters
            self.re_init()
        # reconstructing the overwritten flow for further usage
        self.flow_template = flow_template
        self.iterations = iterations
        self.search_history = sorted(
            search_history, key=lambda t: t[1][-1]["best_performance"])
    best_key = max(sorted(self.performance_dict.items()),
                   key=lambda t: t[1])[0]
    self.best_performance, self.best_parametrization = \
        self.performance_dict[best_key]
    self.best_parametrization.update(dict(best_key))
    # when best parameter dict is calculated, this has to be logged
    # or saved and the chosen parameter is used for training on the
    # whole data set, independent of the chosen algorithm
    self._log("Using parameterization %s with optimal performance %s for "
              "metric %s." % (self.best_parametrization,
                              self.best_performance, self.metric))
    # Fill in the final parameters in the flow template
    if self.final_training_parameter_settings != {}:
        self.flow_template = [
            NodeChainFactory.instantiate(
                template=node,
                parametrization=self.final_training_parameter_settings)
            for node in original_flow_template]
    else:
        self.flow_template = original_flow_template
    best_flow_template = self.flow_template
    best_flow_template[1] = {'node': 'All_Train_Splitter'}
    # delete last node
    best_flow_template.pop(-1)
    self.flow = self.generate_subflow(best_flow_template,
                                      self.best_parametrization, NodeChain)
    self.flow[-1].set_run_number(self.run_number)
    self.flow[0].set_generator(self.train_instances)
    self.flow.train()
    self._log("Training of optimal flow finished")
    # delete training instances that would be stored to disk if this node
    # is saved
    del self.train_instances
def prepare_training(self, training_files, potentials, operation):
    """ Prepares pyspace live for training.

    Prepares everything for training of pyspace live,
    i.e. creates flows based on the dataflow specs
    and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation

    online_logger.info("Creating flows..")

    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(
                spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:" +
                               self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            if "stream" in self.potentials[key] and \
                    self.potentials[key]["stream"] == True:
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base,
                    self.potentials[key]["stream_prewindowing_flow"])
            else:
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " +
                               self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if "stream" in self.potentials[key] and \
                    self.potentials[key]["stream"] == True:
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base,
                    self.potentials[key]["stream_postprocess_flow"])
            else:
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " +
                               self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info(
                "file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if "stream" in self.potentials[key] and \
                    self.potentials[key]["stream"] == True:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=open(self.potentials[key]["postprocess_flow"]))
                # create windower
                online_logger.info("Creating Windower")
                online_logger.info(
                    self.potentials[key]["windower_spec_path_train"])
                self.node_chains[key][0].set_windower_spec_file(
                    os.path.join(
                        spec_base, "node_chains", "windower",
                        self.potentials[key]["windower_spec_path_train"]))
                replace_start_and_end_markers = True
            else:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=open(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error(
                    "Couldn't find data, please do prewindowing first!")
                raise Exception
            online_logger.info("concatenating prewindowed data from " +
                               str(prewindowed_sets))

            for s, dataset_dir in enumerate(prewindowed_sets):
                collection = BaseDataset.load(dataset_dir)
                data = collection.get_data(0, 0, "train")
                for d, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        for k in sample.marker_name.keys():
                            # find '{S,s} 8' or '{S,s} 9'
                            m = re.match(r"^s\s{0,2}[8,9]{1}$", k,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    str("remove %s from %d %d"
                                        % (m.group(), s, d)))
                                del(sample.marker_name[m.group()])
                        if s == len(prewindowed_sets) - 1 and \
                                d == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info("added endmarker" + str(s) +
                                               " " + str(d))
                        if s == 0 and d == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info("added startmarker" + str(s) +
                                               " " + str(d))
                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at " +
                               final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])

            flow = open(self.potentials[key]["postprocess_flow"])

        self.node_chain_definitions[key] = yaml.load(flow)
        flow.close()

    # TODO: check if the prewindowing flow is still needed
    # when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        str("recursively deleting files in '%s'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception:
                        # TODO: find a smart solution for this!
                        pass  # dir was probably already deleted..

            if os.path.exists(os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")):
                shutil.rmtree(os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data"))
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0