class DataGenerationTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ Generate data of two classes for testing

    This node can generate data according to the specifications of two
    different DataGenerators. It generates objects of the type TimeSeries.

    **Parameters**

        :ir_generator:
            A generator of type DataGenerator for data items of the
            information relevant (ir) class. If it is specified in a node
            chain, it should be given as a string.

            (*optional, default: "Adder([Sine(),GaussianNoise()])"*)

        :nir_generator:
            A generator of type DataGenerator for data items of the not
            information relevant (nir) class. If it is specified in a node
            chain, it should be given as a string.

            (*optional, default: "GaussianNoise()"*)

        :ir_items:
            Number of items that should be generated for the ir class.

            (*optional, default: 100*)

        :nir_items:
            Number of items that should be generated for the nir class.

            (*optional, default: 100*)

        :channel_names:
            List of strings for the channel names. Also determines the
            number of generated channels.

            (*optional*)

        :num_channels:
            Number of channels. Unused if channel_names is set.

            (*optional, default: 16*)

        :ir_label:
            The label for the ir class.

            (*optional, default: 'Target'*)

        :nir_label:
            The label for the nir class.

            (*optional, default: 'Standard'*)

        :shuffle:
            Whether the data items of the two classes are shuffled.

            (*optional, default: True*)

        :time_points:
            Number of points per channel in a generated TimeSeries object.

            (*optional, default: 100*)

        :sampling_frequency:
            Sampling rate of the generated data. Important for sines etc.
            A generated time series object has a temporal length of
            time_points/sampling_frequency.

            (*optional, default: 1000*)

        :ir_drift_vector:
            Drift of the ir class data. Specify a vector (numpy array) of
            shape (time_points, num_channels) and a linear drift in this
            direction will be added to the generated data:
            [0 * ir_drift_vector] is added to the first sample,
            [1/(ir_items+nir_items) * ir_drift_vector] to the second
            sample, and so on, until [1 * ir_drift_vector] is added to the
            last sample. In a specification file the drift vector can,
            e.g., be given like this:

            ir_drift_vector : "eval(__import__('numpy').asarray([[1,1],[2,2]]))"

            (*optional, default: None*)

        :nir_drift_vector:
            Drift of the nir class data. See ir_drift_vector.

            (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : Data_Generation_Source
          parameters :
              ir_generator : "Adder([SineGenerator(),GaussianNoiseGenerator()])"
              nir_generator : "GaussianNoiseGenerator()"

    :Author: Hendrik Woehrle
    :Created: 201/07/27
    """
    def __init__(self,
                 ir_generator="Adder([Sine(),GaussianNoise()])",
                 nir_generator="GaussianNoise()",
                 ir_items=100,
                 nir_items=100,
                 ir_drift_vector=None,
                 nir_drift_vector=None,
                 channel_names=None,
                 num_channels=16,
                 ir_label='Target',
                 nir_label='Standard',
                 time_points=100,
                 sampling_frequency=1000,
                 shuffle=True,
                 **kwargs):
        super(DataGenerationTimeSeriesSourceNode, self).__init__(**kwargs)

        # Generators may be passed as strings (e.g. from a YAML node chain)
        if isinstance(ir_generator, str):
            ir_generator = eval(ir_generator)
        if isinstance(nir_generator, str):
            nir_generator = eval(nir_generator)

        ir_generator.sampling_frequency = sampling_frequency
        nir_generator.sampling_frequency = sampling_frequency

        run_number = 0
        dataset = None

        if channel_names is not None:
            num_channels = len(channel_names)
        else:
            channel_names = [str(i) for i in xrange(num_channels)]

        # Translate drift "None" to zero-vector
        if ir_drift_vector is None:
            ir_drift_vector = numpy.zeros((time_points, num_channels))
        if nir_drift_vector is None:
            nir_drift_vector = numpy.zeros((time_points, num_channels))

        self.set_permanent_attributes(dataset=dataset,
                                      ir_generator=ir_generator,
                                      nir_generator=nir_generator,
                                      ir_items=ir_items,
                                      nir_items=nir_items,
                                      channel_names=channel_names,
                                      num_channels=num_channels,
                                      ir_label=ir_label,
                                      nir_label=nir_label,
                                      time_points=time_points,
                                      sampling_frequency=sampling_frequency,
                                      shuffle=shuffle,
                                      run_number=run_number,
                                      data_for_testing=None,
                                      data_for_training=None,
                                      ir_drift_vector=ir_drift_vector,
                                      nir_drift_vector=nir_drift_vector)

        self.generate_data_set()

    def set_input_dataset(self, dataset):
        """ Instead of using the given dataset, generate a new one """
        self.generate_data_set()

    def generate_data_set(self):
        """ Generate a dataset using the given generators """
        self.dataset = TimeSeriesDataset()

        # generate a sequence of dummy labels that determines which class
        # is generated for each item
        label_sequence = numpy.hstack((numpy.ones(self.ir_items),
                                       numpy.zeros(self.nir_items)))
        if self.shuffle:
            random.shuffle(label_sequence)

        ts_generator = TestTimeSeriesGenerator()
        current_item = 0  # fraction of produced data objects, used for drift
        for label in label_sequence:
            if label == 1:
                # generate a data item using the ir_generator
                data_item = ts_generator.generate_test_data(
                    channels=len(self.channel_names),
                    time_points=self.time_points,
                    function=self.ir_generator,
                    sampling_frequency=self.sampling_frequency,
                    channel_order=True,
                    channel_names=self.channel_names,
                    dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.ir_drift_vector
                self.dataset.add_sample(data_item, self.ir_label, False)
            else:
                # generate a data item using the nir_generator
                data_item = ts_generator.generate_test_data(
                    channels=len(self.channel_names),
                    time_points=self.time_points,
                    function=self.nir_generator,
                    sampling_frequency=self.sampling_frequency,
                    channel_order=True,
                    channel_names=self.channel_names,
                    dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.nir_drift_vector
                self.dataset.add_sample(data_item, self.nir_label, False)
            current_item += 1. / (self.ir_items + self.nir_items)
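# Usage sketch (editor's illustration, not part of the original module): how
# the node might be instantiated directly in Python instead of via a YAML
# node chain. The import path for the data generators is an assumption;
# adjust it to wherever Adder, Sine and GaussianNoise live in your
# pySPACE installation.
def _demo_data_generation_source():
    from pySPACE.tests.utils.data.test_data_generation import (
        Adder, Sine, GaussianNoise)

    source = DataGenerationTimeSeriesSourceNode(
        ir_generator=Adder([Sine(), GaussianNoise()]),
        nir_generator=GaussianNoise(),
        ir_items=5,
        nir_items=5,
        channel_names=['C3', 'C4'],
        time_points=100,
        sampling_frequency=1000)
    # The constructor already calls generate_data_set(), so the dataset now
    # holds ir_items + nir_items labeled TimeSeries objects.
    return source.dataset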
def prepare_training(self, training_files, potentials, operation,
                     nullmarker_stride_ms=None):
    """ Prepare pySPACE live for training.

    Prepares everything for training of pySPACE live, i.e. creates
    flows based on the dataflow specs and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in '
            'your parameter file.' % self.nullmarker_stride_ms)
    else:
        online_logger.info('Nullmarker stride interval is set to %s ms'
                           % self.nullmarker_stride_ms)

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(
                spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:" +
                               self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            self.potentials[key]["prewindowing_flow"] = os.path.join(
                spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " +
                               self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.potentials[key]["postprocess_flow"] = os.path.join(
                spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " +
                               self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes; we send the time series
    # windows to these processes via queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """ Yield all windows until a None item is found in the queue """
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info(
                "file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["postprocess_flow"]))
            replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append their data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error(
                    "Couldn't find data, please do prewindowing first!")
                raise Exception
            online_logger.info("concatenating prewindowed data from " +
                               str(prewindowed_sets))

            for s, dirname in enumerate(prewindowed_sets):
                collection = BaseDataset.load(dirname)
                data = collection.get_data(0, 0, "train")
                for d, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        for k in sample.marker_name.keys():
                            # find '{S,s} 8' or '{S,s} 9'
                            m = re.match(r"^s\s{0,2}[89]$", k,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    str("remove %s from %d %d" %
                                        (m.group(), s, d)))
                                del sample.marker_name[m.group()]

                        if s == len(prewindowed_sets) - 1 and \
                                d == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info(
                                "added endmarker " + str(s) + " " + str(d))

                        if s == 0 and d == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info(
                                "added startmarker " + str(s) + " " + str(d))

                    final_collection.add_sample(sample, label, True)

            # save the final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info(
                "stored final collection at " + final_collection_path)

            # load the final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])

            flow = open(self.potentials[key]["postprocess_flow"])

        # create a window stream for every potential
        if self.operation == "prewindowing":
            window_spec_file = os.path.join(
                spec_base, "node_chains", "windower",
                self.potentials[key]["windower_spec_path_train"])
            self.window_stream[key] = \
                self.stream_manager.request_window_stream(
                    window_spec_file,
                    nullmarker_stride_ms=self.nullmarker_stride_ms)
        elif self.operation == "prewindowing_offline":
            pass
        elif self.operation == "train":
            pass

        self.node_chain_definitions[key] = yaml.load(flow)
        flow.close()

    # TODO: check if the prewindowing flow is still needed
    #       when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except OSError:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        str("recursively deleting files in '%s'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except OSError:
                        # TODO: find a smart solution for this!
                        pass  # dir was probably already deleted

            if os.path.exists(os.path.join(
                    self.prewindowed_data_directory, key,
                    "all_train_data")):
                shutil.rmtree(os.path.join(
                    self.prewindowed_data_directory, key,
                    "all_train_data"))
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
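# Illustration (editor's sketch, standard library only): the queue/sentinel
# pattern that flow_generator above relies on. A producer puts windows into
# a multiprocessing.Queue and marks the end of the stream with None; the
# consuming generator yields windows until it sees that sentinel.
def _demo_queue_sentinel_stream():
    import multiprocessing

    queue = multiprocessing.Queue()
    for window in ["w0", "w1", "w2"]:  # stand-ins for time series windows
        queue.put(window)
    queue.put(None)  # sentinel: no more windows will arrive

    def window_generator():
        while True:
            window = queue.get(block=True, timeout=None)
            if window is None:
                break
            yield window

    return list(window_generator())  # -> ["w0", "w1", "w2"]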
class TimeSeriesSinkNode(BaseNode):
    """ Collect all :mod:`time series objects <pySPACE.resources.data_types.time_series>` in a :mod:`collection <pySPACE.resources.dataset_defs.time_series>`

    **Parameters**

        :sort_string:
            A lambda function string that is passed to the
            TimeSeriesDataset and evaluated before the data is stored.

            (*optional, default: None*)

        :max_num_stored_objects:
            Maximal number of stored time series objects. Can be used if
            only a part of a dataset should be exported, e.g. to keep the
            size small for debugging. Applies to train and test set
            separately.

            (*optional, default: numpy.inf*)

        :merge:
            Can be set to True if the user wants to get one time series
            containing the entire input data.

            (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        - node: Time_Series_Sink

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/28
    :LastChange: 2011/04/13 Anett Seeland ([email protected])
    """
    input_types = ["TimeSeries"]

    def __init__(self, sort_string=None, merge=False, **kwargs):
        super(TimeSeriesSinkNode, self).__init__(**kwargs)

        self.set_permanent_attributes(sort_string=sort_string,
                                      merge=merge,
                                      # This will be created lazily
                                      time_series_collection=None,
                                      max_num_stored_objects=numpy.inf)

    def reset(self):
        """ Reset the state of the object to the clean state it had after its initialization """
        # We have to create a temporary reference since we remove
        # the self.permanent_state reference in the next step by
        # overwriting self.__dict__
        tmp = self.permanent_state
        # TODO: just a hack to get it working quickly...
        tmp["time_series_collection"] = self.time_series_collection
        self.__dict__ = copy.copy(tmp)
        self.permanent_state = tmp

    def is_trainable(self):
        """ Return whether this node is trainable """
        # Though this node is not really trainable, it returns True in
        # order to get trained. The reason is that during this training
        # phase, it stores all time windows along with their class label.
        return True

    def _get_train_set(self, use_test_data):
        """ Return the data that can be used for training """
        # We take the data that is provided by the input node for training.
        # NOTE: This might involve training of the preceding nodes
        train_set = self.input_node.request_data_for_training(use_test_data)
        # Add the data provided by the input node for testing to the
        # training set.
        # NOTE: This node is not really learning but creating a labeled
        #       set of time windows. Because of that it must take all data
        #       for training (even when use_test_data is False).
        train_set = itertools.chain(
            train_set, self.input_node.request_data_for_testing())
        return train_set

    def is_supervised(self):
        """ Return whether this node requires supervised training """
        return True

    def _execute(self, data):
        # We simply pass the given data on to the next node
        return data

    def _train(self, data, label):
        # We simply pass the given data on to the next node
        return (data, label)

    def process_current_split(self):
        """ Compute the results of this sink node for the current split
        of the data into train and test data
        """
        index = 0
        # Collect the time series used for training
        for time_series, label in \
                self.input_node.request_data_for_training(False):
            # Do lazy initialization of the collection
            if self.time_series_collection is None:
                self.time_series_collection = \
                    TimeSeriesDataset(sort_string=self.sort_string)
            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=True,
                    split=self.current_split,
                    run=self.run_number)
            index += 1

        # Collect the time series used for testing
        index = 0
        for time_series, label in self.input_node.request_data_for_testing():
            # Do lazy initialization of the collection
            # (maybe there were no training examples)
            if self.time_series_collection is None:
                self.time_series_collection = \
                    TimeSeriesDataset(sort_string=self.sort_string)
            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=False,
                    split=self.current_split,
                    run=self.run_number)
            index += 1

    def merge_time_series(self, input_collection):
        """ Merge all time series of the input_collection into one long time series """
        # Retrieve the time series from the input_collection
        input_timeseries = input_collection.get_data(0, 0, 'test')
        # Get the data from the first time series
        output_data = input_timeseries[0][0].get_data()
        # Change the end time of the first time series to the one of the
        # last time series inside the input_collection
        input_timeseries[0][0].end_time = input_timeseries[-1][0].end_time
        # For all the remaining time series
        for ts in input_timeseries[1:]:
            # Concatenate the data...
            output_data = numpy.vstack((output_data, ts[0].get_data()))
            # ... and add the markers to the first time series; marker
            # times are shifted into the time frame of the merged series,
            # i.e. relative to the start of the first time series
            if len(ts[0].marker_name) > 0:
                for k in ts[0].marker_name:
                    if k not in input_timeseries[0][0].marker_name:
                        input_timeseries[0][0].marker_name[k] = []
                    for time in ts[0].marker_name[k]:
                        input_timeseries[0][0].marker_name[k].append(
                            time + ts[0].start_time -
                            input_timeseries[0][0].start_time)
        # Use the meta information of the first time series, e.g. marker
        # names and start/end time, and create a new time series with the
        # concatenated data
        merged_time_series = TimeSeries.replace_data(input_timeseries[0][0],
                                                     output_data)
        # Change the name of the merged_time_series
        merged_time_series.name = "%s, length %d ms, %s" % (
            merged_time_series.name.split(',')[0],
            (len(merged_time_series) * 1000.0) /
            merged_time_series.sampling_frequency,
            merged_time_series.name.split(',')[-1])
        return merged_time_series

    def get_result_dataset(self):
        """ Return the result dataset """
        # Merge all time series inside the collection
        # if the merge flag is set to True
        if self.merge:
            merged_time_series = self.merge_time_series(
                self.time_series_collection)
            self.time_series_collection = \
                TimeSeriesDataset(sort_string=self.sort_string)
            self.time_series_collection.add_sample(merged_time_series,
                                                   label='Window',
                                                   train=False)
        return self.time_series_collection
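# Worked example (editor's sketch) of the marker bookkeeping in
# merge_time_series() above: a marker time is stored relative to its own
# window, so when windows are concatenated it is shifted into the time
# frame of the merged series (relative to the first window's start).
def _demo_marker_shift():
    first_window_start = 1000.0   # ms, start of the first window
    later_window_start = 1400.0   # ms, start of a later window
    marker_time_in_window = 50.0  # ms, relative to the later window
    # Position of the marker inside the merged series:
    merged_time = (marker_time_in_window + later_window_start
                   - first_window_start)
    assert merged_time == 450.0
    return merged_time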