def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    .. todo:: to document
    """
    # Compute the windowed test data lazily and memoize the result
    if self.data_for_testing is None:
        # set window definition for test phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file,
                                       self.local_window_conf)
        test_data = list(self.input_node.request_data_for_testing())
        # create stream of windows
        self.window_stream(test_data)
        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)
        self.data_for_testing = MemoizeGenerator(test_data_generator)
    # Return a fresh copy of the generator (single exit point instead of
    # the duplicated return in each branch of the original)
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes of the node chain

    A call to this method might involve training of the node chain up
    this node. If use_test_data is true, all available data is used for
    training, otherwise only the data that is explicitly for training.
    """
    assert (self.input_node is not None)
    self._log("Data for training is requested.", level=logging.DEBUG)
    # Compute the training data lazily and memoize the result
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Lazily forward every (data, label) pair through print_data and
        # memoize the stream so fresh() can replay the same sequence.
        # (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        train_data_generator = (
            self.print_data(data, label)
            for (data, label)
            in self.input_node.request_data_for_training(use_test_data))
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the time windows that can be used for training of subsequent nodes

    .. todo:: to document
    """
    if not use_test_data:
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        # (membership test directly on the dict, not on a .keys() list)
        if key in self.dataset.data:
            self._log("Accessing input dataset's training feature vector windows.")
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Provide an immediately exhausted iterator, since this node
            # has no data that is explicitly dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes of the node chain

    A call to this node might involve evaluating the whole node chain
    up to this node.
    """
    assert (self.input_node is not None)
    self._log("Data for testing is requested.", level=logging.DEBUG)
    # Compute the test data lazily and memoize the result
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0)
        self._log("Producing data for testing.", level=logging.DEBUG)
        # Lazily forward every (data, label) pair through print_data and
        # memoize the stream so fresh() can replay the same sequence.
        # (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        test_data_generator = (
            self.print_data(data, label)
            for (data, label) in self.input_node.request_data_for_testing())
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    The principle of obtaining the testing data are the same as the
    principles used in obtaining the training data set. The only
    difference here is that, in the case in which there is no testing
    data available, we allow for the training data to be used as
    testing data.
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Accessing input dataset's test feature vector windows.")
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")
        test_data_generator = self.dataset.get_data(*key).__iter__()
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Return a fresh generator over the windowed test data.

    .. todo:: to document
    """
    self._log("Requesting test data...")
    # Stream the windows only on the first call; later calls replay the
    # memoized copy.
    if self.data_for_testing is None:
        self._log("Start streaming.")
        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)
        # A single-run dataset serves every run (order randomized later);
        # otherwise pick the data recorded for this particular run.
        run_index = self.run_number if self.dataset.meta_data["runs"] > 1 else 0
        key = (run_index, self.current_split, "test")
        # Lazily emit the (sample, label) windows
        window_stream = ((sample, label)
                         for sample, label in self.dataset.get_data(*key))
        self.data_for_testing = MemoizeGenerator(window_stream,
                                                 caching=self.caching)
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document
    """
    assert (self.input_node is not None)
    self._log("Data for training is requested.", level=logging.DEBUG)
    # Compute the training data lazily and memoize the result
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Lazily map every (data, label) pair of the external training
        # set through execute and memoize the stream for replay via
        # fresh(). (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        train_data_generator = ((self.execute(data), label)
                                for (data, label) in self.external_training_set)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the time windows that can be used for training of subsequent nodes """
    # TODO: Is all this really necessary?
    if not use_test_data:
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        # (membership test directly on the dict, not on a .keys() list)
        if key in self.dataset.data:
            self._log("Accessing input dataset's training prediction vectors.")
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Provide an immediately exhausted iterator, since this node
            # has no data that is explicitly dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Accessing input dataset's test feature vector windows.")
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")
        test_data_generator = self.dataset.get_data(*key).__iter__()
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def process(self):
    """ Processes all data that is provided by the input node

    Returns a generator that yields the data after being processed by this node.
    """
    assert(self.input_node != None), "No input node specified!"
    # Assert that this node has already been trained
    assert(not self.is_trainable() or
           self.get_remaining_train_phase() == 0), "Node not trained!"
    # Lazily map every (data, label) pair of the input stream through
    # execute (Python 2 itertools.imap + tuple-unpacking lambda)
    data_generator = \
        itertools.imap(lambda (data, label): (self.execute(data), label),
                       self.input_node.process())
    # Feed the mapped stream into a time-series client and window it
    self.client = TimeSeriesClient(ts_stream = data_generator)
    self.client.connect()
    self.marker_windower = MarkerWindower(data_client=self.client,
                                          windowdefs=self.window_definition,
                                          stridems=self.nullmarker_stride_ms)
    # NOTE(review): marker_windower was assigned just above, so this branch
    # looks unreachable unless MarkerWindower can evaluate to None — confirm
    if self.marker_windower == None:
        self.window_stream()
    # Create a generator that emits the windows
    test_data_generator = ((sample, label) \
                           for (sample, label) in self.marker_windower)
    self.data_for_testing = MemoizeGenerator(test_data_generator)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    The test split is created lazily on first use.
    """
    # Create split lazily when required
    if self.split_indices_test is None:
        self._create_split()
    # Create test data generator
    # NOTE(review): unlike sibling nodes, the generator is rebuilt on every
    # call instead of being memoized behind a None check — confirm intended
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices_test[self.current_split])
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    .. todo:: to document
    """
    # Create split lazily when required
    if self.train_data is None:
        self._create_split()
    # Create training data generator
    # NOTE(review): the generator is rebuilt on every call instead of
    # being memoized behind a None check — confirm intended
    self.data_for_training = \
        MemoizeGenerator(instance for instance in self.train_data)
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    .. todo:: to document
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()
    # Only that data can be used for testing which is explicitly
    # specified for this purpose by the current cv-split
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices[self.current_split])
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Return a fresh generator over the windowed test data.

    .. todo:: to document
    """
    self._log("Requesting test data...")
    # Only stream once; afterwards the memoized copy is replayed
    if self.data_for_testing is None:
        self._log("Start streaming.")
        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)
        # A single-run dataset serves all runs (order randomized later)
        if self.dataset.meta_data["runs"] > 1:
            run_index = self.run_number
        else:
            run_index = 0
        key = (run_index, self.current_split, "test")
        # Lazily emit the (sample, label) windows
        window_stream = ((sample, label)
                         for sample, label in self.dataset.get_data(*key))
        self.data_for_testing = MemoizeGenerator(window_stream,
                                                 caching=self.caching)
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Return a fresh generator over the test time series windows.

    .. todo:: to document
    """
    # Read the data only on the first request; replay afterwards
    if self.data_for_testing is None:
        self._log("Accessing input dataset's test time series windows.")
        # A single-run dataset serves every run (order is randomized
        # later); otherwise pick the data recorded for this run.
        if self.dataset.meta_data["runs"] > 1:
            run_index = self.run_number
        else:
            run_index = 0
        key = (run_index, self.current_split, "test")
        self.data_for_testing = MemoizeGenerator(
            iter(self.dataset.get_data(*key)),
            caching=self.caching)
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document
    """
    assert (self.input_node is not None)
    self._log("Data for training is requested.", level=logging.DEBUG)
    # Compute the training data lazily and memoize the result
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Lazily map every (data, label) pair of the external training
        # set through execute and memoize the stream for replay via
        # fresh(). (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        train_data_generator = ((self.execute(data), label)
                                for (data, label) in self.external_training_set)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes of the node chain

    A call to this node might involve evaluating the whole node chain
    up to this node.
    """
    assert (self.input_node is not None)
    self._log("Data for testing is requested.", level=logging.DEBUG)
    # Compute the test data lazily and memoize the result
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0)
        self._log("Producing data for testing.", level=logging.DEBUG)
        # Lazily forward every (data, label) pair through print_data and
        # memoize the stream so fresh() can replay the same sequence.
        # (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        test_data_generator = (
            self.print_data(data, label)
            for (data, label) in self.input_node.request_data_for_testing())
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes of the node chain

    A call to this method might involve training of the node chain up
    this node. If use_test_data is true, all available data is used for
    training, otherwise only the data that is explicitly for training.
    """
    assert (self.input_node is not None)
    self._log("Data for training is requested.", level=logging.DEBUG)
    # Compute the training data lazily and memoize the result
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Lazily forward every (data, label) pair through print_data and
        # memoize the stream so fresh() can replay the same sequence.
        # (Generator expression replaces the Python-2-only
        # itertools.imap + tuple-unpacking lambda.)
        train_data_generator = (
            self.print_data(data, label)
            for (data, label)
            in self.input_node.request_data_for_training(use_test_data))
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    .. todo:: to document
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()
    # All data can be used for training which is not explicitly
    # specified for testing by the current cv-split
    self.data_for_training = MemoizeGenerator(
        self.data[i] for i in range(len(self.data))
        if i not in self.split_indices[self.current_split])
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document

    .. note:: This method works differently in InstanceSelectionNode than
             in other nodes: Only *percentage_selected* of the available
             data are returned.
    """
    assert(self.input_node != None)
    # Clamp the requested percentage to at most 100
    if self.train_percentage_selected>100:
        self._log("Train percentage of %f reduced to 100."%self.train_percentage_selected,
                  level=logging.ERROR)
        self.train_percentage_selected=100
    self._log("Data for training is requested.", level = logging.DEBUG)
    # If we haven't computed the data for training yet
    if self.data_for_training == None:
        self._log("Producing data for training.", level = logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_training(use_test_data):
            all_instances[label].append(instance)
        self._log("Keeping only %s percent of training data" % self.train_percentage_selected,
                  level = logging.DEBUG)
        # Seed with the run number so the random selection is
        # reproducible per run
        r = random.Random(self.run_number)
        # Retain only *percentage_selected* percent of the data
        retained_instances = []
        for label, instances in all_instances.iteritems():
            r.shuffle(instances)
            if not self.reduce_class or self.train_percentage_selected==100:
                # No class-specific reduction: shrink every class alike
                end_index = int(round(len(instances) * self.train_percentage_selected / 100))
            elif not (self.reduce_class==label):
                # Only *reduce_class* is reduced; other classes stay complete
                end_index = len(instances)
            else: #self.reduce_class==label--> reduction needed
                end_index = int(round(len(instances) * self.train_percentage_selected / 100))
            # NOTE(review): with an integer percentage this relies on
            # Python 2 integer division before round() — confirm float
            # percentages are the intended input
            retained_instances.extend(zip(instances[0:end_index],
                                          [label for i in range(end_index)]))
        # Compute a generator the yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence
        train_data_generator = \
            ((self.execute(data), label) for (data, label) in retained_instances)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level = logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document
    """
    assert (self.input_node is not None)
    # Fixed: this log line previously said "testing" in the training method
    self._log("Data for training is requested.", level=logging.DEBUG)
    # Compute the training data lazily and memoize the result
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)
        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_training(use_test_data):
            all_instances[label].append(instance)
        # Re-balance the per-label instance lists
        retained_instances = self.balance_instances(all_instances)
        # Lazily map the retained instances through execute and memoize
        # the stream so fresh() can replay the same sequence
        train_data_generator = ((self.execute(data), label)
                                for (data, label) in retained_instances)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    This method streams training data and sends it to the subsequent nodes.
    If one looks at the tutorial related to building new nodes (available
    in the tutorial section), one can see exactly where the
    ``request_data`` methods are put to use. The following example is one
    that was extracted from the
    :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
    which should(in theory at least) be implementable for all types of data.
    """
    if not use_test_data:
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        # (membership test directly on the dict, not on a .keys() list)
        if key in self.dataset.data:
            self._log(
                "Accessing input dataset's training feature vector windows."
            )
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Provide an immediately exhausted iterator, since this node
            # has no data that is explicitly dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    This method streams training data and sends it to the subsequent nodes.
    If one looks at the tutorial related to building new nodes (available
    in the tutorial section), one can see exactly where the
    ``request_data`` methods are put to use. The following example is one
    that was extracted from the
    :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
    which should(in theory at least) be implementable for all types of data.
    """
    if not use_test_data:
        # If the input dataset consists only of one single run, we use
        # this as input for all runs to be conducted (i.e. we rely on
        # later randomization of the order). Otherwise we use the data
        # for this run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        # (membership test directly on the dict, not on a .keys() list)
        if key in self.dataset.data:
            self._log("Accessing input dataset's training feature vector windows.")
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Provide an immediately exhausted iterator, since this node
            # has no data that is explicitly dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        generated_data = self.generate_random_data()
        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in generated_data)
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=True)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Return a fresh generator over 23 synthetic test windows.

    .. todo:: to document
    """
    # Build the synthetic windows only once; later calls replay them
    if self.data_for_testing is None:
        # 23 constant-valued 2x2 windows, each with a random binary label
        samples = []
        for step in range(23):
            series = TimeSeries(input_array=numpy.ones((2, 2)) * step,
                                channel_names=["X", "Y"],
                                sampling_frequency=2)
            samples.append((series, random.choice(["A", "B"])))
        self.time_series = samples
        # Lazily emit the (sample, label) windows and memoize them
        window_generator = (pair for pair in self.time_series)
        self.data_for_testing = MemoizeGenerator(window_generator,
                                                 caching=True)
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data): """ Returns the data that can be used for training of subsequent nodes .. todo:: to document """ # set window definition for train phase windower file self.window_definition = \ Windower._load_window_spec(self.windower_spec_file_train, self.local_window_conf) self._log("Requesting train data...") if self.data_for_training is None: if not use_test_data: # Get training and test data (with labels) train_data = \ list(self.input_node.request_data_for_training(use_test_data=use_test_data)) # If training or test data is an empty list if train_data == []: self.data_for_training = MemoizeGenerator( (x for x in [].__iter__()), caching=True) return self.data_for_training.fresh() # create stream of self.window_stream(train_data) # Create a generator that emits the windows train_data_generator = ((sample, label) for (sample, label) in self.marker_windower) self.data_for_training = MemoizeGenerator(train_data_generator, caching=True) return self.data_for_training.fresh() else: # Return the test data as there is no additional data that # was dedicated for training self.data_for_training = self.request_data_for_testing() return self.data_for_training else: return self.data_for_training.fresh()
def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    .. todo:: to document
    """
    # set window definition for train phase windower file
    self.window_definition = \
        Windower._load_window_spec(self.windower_spec_file_train,
                                   self.local_window_conf)
    self._log("Requesting train data...")
    if self.data_for_training is None:
        if not use_test_data:
            # Get training data (with labels) from the preceding node
            train_data = \
                list(self.input_node.request_data_for_training(use_test_data=use_test_data))
            # If training data is an empty list, hand on an
            # already-exhausted generator
            if train_data == []:
                self.data_for_training = MemoizeGenerator(
                    (x for x in [].__iter__()), caching=True)
                return self.data_for_training.fresh()
            # create stream of windows from the training data
            self.window_stream(train_data)
            # Create a generator that emits the windows
            train_data_generator = ((sample, label)
                                    for (sample, label) in self.marker_windower)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=True)
            return self.data_for_training.fresh()
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training.
            # Bug fix: request_data_for_testing() already returns a fresh
            # generator (the result of MemoizeGenerator.fresh()); calling
            # .fresh() on that generator again raised AttributeError, so
            # the value is now returned directly.
            self.data_for_training = self.request_data_for_testing()
            return self.data_for_training
    else:
        return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Return a fresh generator over the class-balanced test data.

    .. todo:: to document
    """
    assert (self.input_node is not None)
    self._log("Data for testing is requested.", level=logging.DEBUG)
    # Compute the balanced test stream only once; replay it afterwards
    if self.data_for_testing is None:
        # The node must be fully trained before test data may flow through
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0)
        # Group the incoming instances by their label
        by_label = defaultdict(list)
        for instance, label in self.input_node.request_data_for_testing():
            by_label[label].append(instance)
        balanced = self.balance_instances(by_label)
        self._log("Producing data for testing.", level=logging.DEBUG)
        # Lazily run execute on each retained instance and memoize the
        # stream so fresh() yields the same sequence again
        test_stream = ((self.execute(data), label)
                       for (data, label) in balanced)
        self.data_for_testing = MemoizeGenerator(test_stream,
                                                 caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
    return self.data_for_testing.fresh()
class SimpleTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ A simple test class for unit tests

    Generates the same data for test and training.
    """

    def __init__(self, *args, **kwargs):
        super(SimpleTimeSeriesSourceNode, self).__init__(*args, **kwargs)

        # Source nodes normally receive a dataset from outside; for unit
        # tests we fake a minimal single-run, empty dataset instead.
        class DummyObject(object):
            pass

        dummy_dataset = DummyObject()
        dummy_dataset.meta_data = {'runs': 1}
        dummy_dataset.data = {}

        self.set_permanent_attributes(dataset=dummy_dataset, run_number=0)

    def request_data_for_testing(self):
        """ Return a fresh generator over 23 synthetic test windows.

        .. todo:: to document
        """
        # Build the synthetic windows only once; later calls replay them
        if self.data_for_testing is None:
            # 23 constant-valued 2x2 windows with random binary labels
            self.time_series = [
                (TimeSeries(input_array=numpy.ones((2, 2)) * i,
                            channel_names=["X", "Y"],
                            sampling_frequency=2),
                 random.choice(["A", "B"]))
                for i in range(23)]
            window_generator = (pair for pair in self.time_series)
            self.data_for_testing = MemoizeGenerator(window_generator,
                                                     caching=True)
        return self.data_for_testing.fresh()
class SimpleTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ A simple test class for unit tests

    Generates the same data for test and training.
    """

    def __init__(self, *args, **kwargs):
        super(SimpleTimeSeriesSourceNode, self).__init__(*args, **kwargs)
        run_number = 0

        # Fake a minimal dataset object so the node can run without input
        class DummyObject(object):
            pass

        fake_dataset = DummyObject()
        fake_dataset.meta_data = {'runs': 1}
        fake_dataset.data = {}
        self.set_permanent_attributes(dataset=fake_dataset,
                                      run_number=run_number)

    def request_data_for_testing(self):
        """ Return a fresh generator over 23 synthetic test windows.

        .. todo:: to document
        """
        # Build the synthetic windows only once; later calls replay them
        if self.data_for_testing is None:
            windows = []
            for step in range(23):
                # Constant-valued 2x2 window with a random binary label
                series = TimeSeries(input_array=numpy.ones((2, 2)) * step,
                                    channel_names=["X", "Y"],
                                    sampling_frequency=2)
                windows.append((series, random.choice(["A", "B"])))
            self.time_series = windows
            stream = ((sample, label)
                      for (sample, label) in self.time_series)
            self.data_for_testing = MemoizeGenerator(stream, caching=True)
        return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        generated_data = self.generate_random_data()
        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in generated_data)
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=True)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    Builds 23 constant 2x2 windows with randomly drawn labels on first
    use, memoizes them, and replays the same sequence on later calls.

    .. todo:: to document
    """
    if self.data_for_testing is None:
        # Assemble the fixed window sequence with random labels
        self.time_series = []
        for idx in range(23):
            window = TimeSeries(input_array=numpy.ones((2, 2)) * idx,
                                channel_names=["X", "Y"],
                                sampling_frequency=2)
            self.time_series.append((window, random.choice(["A", "B"])))

        # Memoize so every caller observes an identical stream
        self.data_for_testing = MemoizeGenerator(iter(self.time_series),
                                                 caching=True)

    # Hand out a fresh replay of the memoized generator
    return self.data_for_testing.fresh()
class TimeSeriesSourceNode(BaseNode):
    """ Source for windowed :class:`~pySPACE.resources.data_types.time_series.TimeSeries` saved in pickle format via :class:`~pySPACE.missions.nodes.sink.time_series_sink.TimeSeriesSinkNode`

    **Parameters**

    **Exemplary Call**

    .. code-block:: yaml

        - node : TimeSeriesSource

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/25
    """
    # Data type this source emits into the node chain
    input_types = ["TimeSeries"]

    def __init__(self, **kwargs):
        super(TimeSeriesSourceNode, self).__init__(**kwargs)

        # The actual dataset is attached later via set_input_dataset
        self.set_permanent_attributes(dataset=None)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data """
        self.set_permanent_attributes(dataset=dataset)

    def register_input_node(self, node):
        """ Register the given node as input

        Source nodes start a node chain, so registering an input always fails.
        """
        raise Exception("No nodes can be registered as inputs for source nodes")

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        # if the input dataset has more than one split/run we will compute
        # the splits in parallel, i.e. we don't return any further splits
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Source nodes cannot be trained
        """
        raise Exception("Source nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the time windows that can be used for training of subsequent nodes

        If *use_test_data* is True the test data is returned instead, since
        a source node holds no extra training-only data.

        .. todo:: to document
        """
        if not use_test_data:
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "train")
                self._log("Run %s." % self.run_number)
            else:
                key = (0, self.current_split, "train")
                self._log("Run %s. Using input data of run 0."
                          % self.run_number)

            # Check if there is training data for the current split and run
            if key in self.dataset.data.keys():
                self._log("Accessing input dataset's training time series windows.")
                self.data_for_training = \
                    MemoizeGenerator(self.dataset.get_data(*key).__iter__(),
                                     caching=self.caching)
            else:
                # Returns an iterator that iterates over an empty sequence
                # (i.e. an iterator that is immediately exhausted), since
                # this node does not provide any data that is explicitly
                # dedicated for training
                self._log("No training data available.")
                self.data_for_training = MemoizeGenerator((x for x in [].__iter__()),
                                                          caching=self.caching)
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        .. todo:: to document
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test time series windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = self.dataset.get_data(*key).__iter__()

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def get_metadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node. """
        return self.dataset.meta_data.get(key)

    def __del__(self):
        # Explicitly drop the (potentially large) dataset reference;
        # rebinding to None keeps the attribute itself valid afterwards.
        del self.dataset
        self.dataset = None
class StreamWindowingNode(BaseNode):
    """Get a stream of time series objects and window them inside a flow.

    Node that interprets a stream of incoming time series objects as
    a raw data stream. The markers stored in marker_name attribute
    are used as the markers for a
    :class:`~pySPACE.missions.support.windower.MarkerWindower`.

    This should done *before* any splitter, since all incoming windows
    are regarded as parts of a consecutive data stream.

    **Parameters**

        :windower_spec_file:
            The window specification file for the
            :class:`~pySPACE.missions.support.windower.MarkerWindower`.
            Used for testing and training, if windower_spec_file_train
            is not specified.

        :windower_spec_file_train:
            A separate window file for training only. If not specified,
            windower_spec_file is used for training and testing.

    **Exemplary Call**

    .. code-block:: yaml

        - node : Stream_Windowing
          parameters :
                windower_spec_file : "example_lrp_window_spec.yaml"

    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/07/09
    """
    def __init__(self, windower_spec_file, windower_spec_file_train=None,
                 local_window_conf=False, nullmarker_stride_ms=1000,
                 *args, **kwargs):
        super(StreamWindowingNode, self).__init__(*args, **kwargs)
        # Fall back to the test window spec for training if none is given
        if windower_spec_file_train is None:
            windower_spec_file_train = windower_spec_file
        self.set_permanent_attributes(
            client=None,
            marker_windower=None,
            window_definition=None,
            local_window_conf=local_window_conf,
            windower_spec_file=windower_spec_file,
            windower_spec_file_train=windower_spec_file_train,
            nullmarker_stride_ms=nullmarker_stride_ms)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        .. todo:: to document
        """
        # set window definition for train phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file_train,
                                       self.local_window_conf)
        self._log("Requesting train data...")
        if self.data_for_training is None:
            if not use_test_data:
                # Get training and test data (with labels)
                train_data = \
                    list(self.input_node.request_data_for_training(
                        use_test_data=use_test_data))
                # If training or test data is an empty list, memoize an
                # immediately-exhausted generator instead of windowing
                if train_data == []:
                    self.data_for_training = MemoizeGenerator(
                        (x for x in [].__iter__()), caching=True)
                    return self.data_for_training.fresh()
                # create stream of windows from the training data
                self.window_stream(train_data)
                # Create a generator that emits the windows
                train_data_generator = ((sample, label)
                                        for (sample, label) in self.marker_windower)
                self.data_for_training = MemoizeGenerator(train_data_generator,
                                                          caching=True)
                return self.data_for_training.fresh()
            else:
                # Return the test data as there is no additional data that
                # was dedicated for training.
                # NOTE(review): request_data_for_testing() already returns a
                # *fresh generator*, so data_for_training holds a generator
                # here, not a MemoizeGenerator; a later call would hit the
                # outer else-branch and call .fresh() on it — verify.
                self.data_for_training = self.request_data_for_testing()
                return self.data_for_training
        else:
            return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        .. todo:: to document
        """
        if self.data_for_testing is None:
            # set window definition for test phase windower file
            self.window_definition = \
                Windower._load_window_spec(self.windower_spec_file,
                                           self.local_window_conf)
            test_data = list(self.input_node.request_data_for_testing())
            # create stream of windows
            self.window_stream(test_data)
            # Create a generator that emits the windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in self.marker_windower)
            # NOTE(review): no caching flag is passed here, unlike the
            # training path (caching=True) — confirm MemoizeGenerator's
            # default allows repeated fresh() replays.
            self.data_for_testing = MemoizeGenerator(test_data_generator)
            # Return a fresh copy of the generator
            return self.data_for_testing.fresh()
        else:
            return self.data_for_testing.fresh()

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this node.
        """
        assert (self.input_node != None), "No input node specified!"
        # Assert that this node has already been trained
        assert (not self.is_trainable() or
                self.get_remaining_train_phase() == 0), "Node not trained!"
        # Lazily load the window definitions: during training the
        # train-specific spec is preferred, otherwise the test spec
        if self.window_definition is None:
            if self.is_training() and self.windower_spec_file_train is not None:
                self.window_definition = \
                    Windower._load_window_spec(self.windower_spec_file_train,
                                               self.local_window_conf)
            else:
                self.window_definition = \
                    Windower._load_window_spec(self.windower_spec_file,
                                               self.local_window_conf)
        # Pipe the processed input stream into the windowing client
        data_generator = \
            itertools.imap(lambda (data, label): (self.execute(data), label),
                           self.input_node.process())
        self.client = TimeSeriesClient(ts_stream=data_generator)
        self.client.set_window_defs(self.window_definition)
        self.client.connect()
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms)
        # NOTE(review): marker_windower was just assigned above, so this
        # branch looks unreachable; window_stream() also requires a data
        # argument — confirm whether this is dead code.
        if self.marker_windower == None:
            self.window_stream()
        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)
        self.data_for_testing = MemoizeGenerator(test_data_generator)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def window_stream(self, data):
        # Creates a windower that splits the given data into windows
        # based on the window definitions provided
        # and assigns correct labels to these windows
        self.client = TimeSeriesClient(ts_stream=iter(data))
        self.client.connect()
        self.client.set_window_defs(self.window_definition)
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms)

    def __getstate__(self):
        """ Return a pickable state for this object """
        # The loaded window definition is dropped before pickling and
        # reloaded lazily after unpickling (see process())
        self.window_definition = None
        return super(StreamWindowingNode, self).__getstate__()

    def get_output_type(self, input_type, as_string=True):
        # Local import keeps the dependency out of module load time
        from pySPACE.resources.data_types.time_series import TimeSeries
        if as_string:
            return "TimeSeries"
        else:
            return TimeSeries
class TrainTestSplitterNode(BaseNode):
    """ Split data into one training and one test data set with a fixed ratio

    The relative size of the two sets is controlled via the parameter
    train_ratio.

    .. warning:: the class ratio is not retained

    .. todo:: introduce stratified parameter as in CV_Splitter

    **Parameters**

        :train_ratio:
            The ratio of the overall available data that is assigned to the
            training set. The remaining data (1-train_ratio) is used for
            testing.

            (*optional, default: 0.5*)

        :num_train_instances:
            Instead of specifying a train_ratio, this option allows to
            specify the absolute number of training instances of class
            *class_label* that should be in the training set. All instances
            that occur until *num_train_instances* are found are used for
            training. The remaining data are used for testing.

            (*optional, default: None*)

        :class_label:
            If *num_train_instances*-option is used, this string determines
            the class of which training examples are count.

        :random:
            If *False*, the order of the data is retained. I.e. the
            train_ratio instances are used for training and the remaining as
            test data. If *True*, the two sets are sampled randomly from the
            data without taking into consideration the data's order.

            (*optional, default: True*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : TrainTestSplitter
          parameters :
              train_ratio : 0.7
              random : False

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/08 (Documentation, old node)
    :LastChange: 2011/11/14 (Documentation) Anett Seeland
    """
    def __init__(self, train_ratio=0.5, random=True,
                 num_train_instances=None, class_label='Target',
                 reverse=False, **kwargs):
        super(TrainTestSplitterNode, self).__init__(**kwargs)
        assert (not (random and reverse)), \
            "Reverse ordering makes no sense when randomization is active!"
        self.set_permanent_attributes(train_ratio=train_ratio,
                                      random=random,
                                      num_train_instances=num_train_instances,
                                      class_label=class_label,
                                      reverse=reverse,
                                      train_data=None,
                                      test_data=None)

    def is_split_node(self):
        """ Returns whether this is a split node. """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        # This splitter produces exactly one fixed split
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        .. todo:: to document
        """
        # Create split lazily when required
        # (use `is None`, not `== None`, per PEP 8)
        if self.train_data is None:
            self._create_split()

        # Create training data generator
        self.data_for_training = \
            MemoizeGenerator(instance for instance in self.train_data)
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        .. todo:: to document
        """
        # Create split lazily when required
        if self.test_data is None:
            self._create_split()

        # Create test data generator
        self.data_for_testing = \
            MemoizeGenerator(instance for instance in self.test_data)
        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data. """
        self._log("Splitting data into train and test data")
        train_data = \
            list(self.input_node.request_data_for_training(use_test_data=False))

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain.
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is already "
                            "split)")

        # Create generator instead of loading all data
        if self.num_train_instances and not self.random:
            self.train_data = []
            # BUG FIX: the bound method must be *called* to obtain the data
            # generator; previously the method object itself was assigned and
            # `.next()` was invoked on it, raising AttributeError.
            input_generator = self.input_node.request_data_for_testing()
            for _ in range(self.num_train_instances):
                self.train_data.append(input_generator.next())
            # The remaining, unconsumed part of the generator is the test set
            self.test_data = input_generator
            return

        # Gather all test data
        test_data = list(self.input_node.request_data_for_testing())

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        data = train_data + test_data
        data_size = len(data)

        # Randomize order if randomization is not switched off
        if self.random:
            r = random.Random(self.run_number)
            r.shuffle(data)

        if self.num_train_instances is not None:
            if self.reverse:
                data = data[::-1]
            # If the dataset contains exactly num_train_instances samples of
            # the target class, all data goes into the training set
            if len([i for i in range(len(data))
                    if data[i][1] == self.class_label]) == self.num_train_instances:
                train_end = data_size
            else:
                # Otherwise scan forward until enough target-class samples
                # have been collected
                counter = 0
                for (index, (window, label)) in enumerate(data):
                    if label == self.class_label:
                        counter += 1
                    if counter == self.num_train_instances:
                        train_end = index + 1
                        break
                assert (self.num_train_instances == counter), \
                    "Too many instances to select."
        else:
            # Split data into train and test data according train_ratio
            train_end = int(round(data_size * self.train_ratio))

        self.train_data = data[0:train_end]
        self.test_data = data[train_end:]
class RandomTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ Generate random data and act as a source for windowed TimeSeries

    This node acts as a source for windowed TimeSeries. The TimeSeries
    are generated randomly according to the given parameters and
    forwarded. The time series are generated according to the given
    generating function, and the class label by a uniform distribution
    according with a given threshold.

    Only two classes are supported by now.

    **Parameters**

        :num_instances:
            The number of instances to be generated.

            (*optional, default: 20*)

        :generating_function_class_0:
            A function to generate data for class 0. Receives an index,
            which states the number of already generated samples.

            (*optional, default: lambda i: numpy.ones((2,2))*i*)

        :generating_function_class_1:
            A function to generate data for class 1. Receives an index,
            which states the number of already generated samples.

            (*optional, default: lambda i: numpy.ones((2,2))*i*)

        :channel_names:
            Channel names of the time series objects.

        :class_labels:
            The class labels of the generated time series.

        :choice_threshold:
            The threshold class assignment. The classes are generated
            randomly by generating a random number r between 0 and 1.
            If r < threshold, the class label is class_labels[0], and
            class_labels[1] otherwise.

        :sampling_frequency:
            Sampling frequency of the generated time series.

        :random:
            If true, the order of the data is randomly shuffled.

            (*optional, default: True*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : RandomTimeSeriesSource

    :Author: Hendrik Woehrle ([email protected])
    :Created: 2010/09/22
    """
    # NOTE: the list/lambda defaults below are never mutated, so the shared
    # mutable-default pitfall does not bite here.
    def __init__(self, num_instances=20,
                 generating_function_class_0=lambda i: numpy.ones((2, 2)) * i,
                 generating_function_class_1=lambda i: numpy.ones((2, 2)) * i,
                 channel_names=["X", "Y"],
                 class_labels=['A', 'B'],
                 class_choice_function=random.random,
                 choice_threshold=0.33,
                 sampling_frequency=2,
                 **kwargs):
        super(RandomTimeSeriesSourceNode, self).__init__(**kwargs)

        # We have to create a dummy collection
        class DummyObject(object):
            pass
        collection = DummyObject()
        collection.meta_data = {'runs': 1}
        collection.data = {}

        # only binary classification supported by now
        assert (len(class_labels) == 2)

        self.set_permanent_attributes(
            collection=collection,
            num_instances=num_instances,
            generating_function_class_0=generating_function_class_0,
            generating_function_class_1=generating_function_class_1,
            channel_names=channel_names,
            class_labels=class_labels,
            class_choice_function=class_choice_function,
            choice_threshold=choice_threshold,
            sampling_frequency=sampling_frequency)

    def generate_random_data(self):
        """ Method that is invoked by train and test data generation functions

        Draws a class for each instance via class_choice_function/threshold
        and builds the corresponding TimeSeries with the class's generating
        function.
        """
        generated_data = []
        for i in range(self.num_instances):
            choice = self.class_choice_function()
            if choice < self.choice_threshold:
                input_array = self.generating_function_class_0(i)
                label = self.class_labels[0]
            else:
                input_array = self.generating_function_class_1(i)
                label = self.class_labels[1]
            generated_data.append(
                (TimeSeries(input_array=input_array,
                            channel_names=self.channel_names,
                            sampling_frequency=self.sampling_frequency),
                 label))
        return generated_data

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        .. todo:: to document
        """
        # If we haven't read the data for testing yet.
        # Fixed: compare against None with `is`, not `==` (PEP 8).
        if self.data_for_testing is None:
            generated_data = self.generate_random_data()

            # Create a generator that emits the windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in generated_data)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        .. todo:: to document
        """
        if use_test_data:
            return self.request_data_for_testing()

        # If we haven't generated the data for training yet.
        # Fixed: compare against None with `is`, not `==` (PEP 8).
        if self.data_for_training is None:
            generated_data = self.generate_random_data()

            # Create a generator that emits the windows
            train_data_generator = ((sample, label)
                                    for (sample, label) in generated_data)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=True)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def get_metadata(self, key):
        """ This source node does not contain collection meta data. """
        return None
class StreamWindowingNode(BaseNode):
    """Get a stream of time series objects and window them inside a flow.

    Node that interprets a stream of incoming time series objects as
    a raw data stream. The markers stored in marker_name attribute
    are used as the markers for a
    :class:`~pySPACE.missions.support.windower.MarkerWindower`.

    This should done *before* any splitter, since all incoming windows
    are regarded as parts of a consecutive data stream.

    .. note:: This appears to be an older duplicate of the
       StreamWindowingNode defined earlier in this file; see the review
       notes below for the behavioural differences.

    **Parameters**

        :windower_spec_file:
            The window specification file for the
            :class:`~pySPACE.missions.support.windower.MarkerWindower`.
            Used for testing and training, if windower_spec_file_train
            is not specified.

        :windower_spec_file_train:
            A separate window file for training only. If not specified,
            windower_spec_file is used for training and testing.

    **Exemplary Call**

    .. code-block:: yaml

        - node : Stream_Windowing
          parameters :
                windower_spec_file : "example_lrp_window_spec.yaml"

    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/07/09
    """
    # NOTE(review): default nullmarker_stride_ms is None here, whereas the
    # sibling definition uses 1000 — confirm which is intended.
    def __init__(self, windower_spec_file, windower_spec_file_train = None,
                 local_window_conf=False, nullmarker_stride_ms=None,
                 *args, **kwargs):
        super(StreamWindowingNode, self).__init__(*args, **kwargs)
        # Fall back to the test window spec for training if none is given
        if windower_spec_file_train is None:
            windower_spec_file_train = windower_spec_file
        self.set_permanent_attributes(client = None,
                                      marker_windower = None,
                                      window_definition = None,
                                      local_window_conf = local_window_conf,
                                      windower_spec_file = windower_spec_file,
                                      windower_spec_file_train = windower_spec_file_train,
                                      nullmarker_stride_ms=nullmarker_stride_ms)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        .. todo:: to document
        """
        # set window definition for train phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file_train,
                                       self.local_window_conf)
        self._log("Requesting train data...")
        if self.data_for_training is None:
            if not use_test_data:
                # Get training and test data (with labels)
                train_data = \
                    list(self.input_node.request_data_for_training(
                        use_test_data=use_test_data))
                # If training or test data is an empty list, memoize an
                # immediately-exhausted generator instead of windowing
                if train_data == []:
                    self.data_for_training = MemoizeGenerator(
                        (x for x in [].__iter__()), caching=True)
                    return self.data_for_training.fresh()
                # create stream of windows from the training data
                self.window_stream(train_data)
                # Create a generator that emits the windows
                train_data_generator = ((sample, label)
                                        for (sample, label) in self.marker_windower)
                self.data_for_training = MemoizeGenerator(train_data_generator,
                                                          caching=True)
                return self.data_for_training.fresh()
            else:
                # Return the test data as there is no additional data that
                # was dedicated for training.
                # NOTE(review): request_data_for_testing() returns a *fresh
                # generator*, not a MemoizeGenerator, so the .fresh() call
                # on the next line looks like it would raise AttributeError
                # — confirm.
                self.data_for_training = self.request_data_for_testing()
                return self.data_for_training.fresh()
        else:
            return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        .. todo:: to document
        """
        if self.data_for_testing is None:
            # set window definition for test phase windower file
            self.window_definition = \
                Windower._load_window_spec(self.windower_spec_file,
                                           self.local_window_conf)
            test_data = list(self.input_node.request_data_for_testing())
            # create stream of windows
            self.window_stream(test_data)
            # Create a generator that emits the windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in self.marker_windower)
            # NOTE(review): no caching flag is passed, unlike the training
            # path — confirm MemoizeGenerator's default supports replay.
            self.data_for_testing = MemoizeGenerator(test_data_generator)
            # Return a fresh copy of the generator
            return self.data_for_testing.fresh()
        else:
            return self.data_for_testing.fresh()

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this node.
        """
        assert(self.input_node != None), "No input node specified!"
        # Assert that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0), "Node not trained!"
        # NOTE(review): unlike the sibling definition, window_definition is
        # not (re)loaded here and set_window_defs is never called on the
        # client; the MarkerWindower kwarg is `stridems` instead of
        # `nullmarker_stride_ms` — verify against MarkerWindower's API.
        data_generator = \
            itertools.imap(lambda (data, label): (self.execute(data), label),
                           self.input_node.process())
        self.client = TimeSeriesClient(ts_stream = data_generator)
        self.client.connect()
        self.marker_windower = MarkerWindower(data_client=self.client,
                                              windowdefs=self.window_definition,
                                              stridems=self.nullmarker_stride_ms)
        # NOTE(review): marker_windower was just assigned, so this branch
        # looks unreachable; window_stream() also requires a data argument.
        if self.marker_windower == None:
            self.window_stream()
        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)
        self.data_for_testing = MemoizeGenerator(test_data_generator)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def window_stream(self, data):
        # Creates a windower that splits the given data into windows
        # based on the window definitions provided
        # and assigns correct labels to these windows
        self.client = TimeSeriesClient(ts_stream = iter(data))
        self.client.connect()
        # NOTE(review): set_window_defs is not called here, in contrast to
        # the sibling definition — confirm the windower still receives the
        # definitions via the `windowdefs` kwarg alone.
        self.marker_windower = MarkerWindower(data_client=self.client,
                                              windowdefs=self.window_definition,
                                              stridems=self.nullmarker_stride_ms)
class InstanceSelectionNode(BaseNode):
    """Retain only a certain percentage of the instances

    The node InstanceSelectionNode forwards only
    *train_percentage_selected* percent of the training instances passed to
    him to the successor node and only *test_percentage_selected* percent of
    the test instances. The forwarded instances are selected randomly but
    so that the class ratio is kept.

    If *reduce_class* is used, only the chosen class is reduced, without
    keeping the class ratio. So the total amount of reduced data does not
    match the percentage values.

    **Parameters**

        :train_percentage_selected:
            The percentage of training instances which is forwarded to
            successor node.

            (*optional, default: 100*)

        :test_percentage_selected:
            The percentage of test instances which is forwarded to
            successor node.

            (*optional, default: 100*)

        :reduce_class:
            If you want only to reduce one class, choose this parameter
            otherwise, both classes are reduced in a balanced fashion.

            (*optional, default: False*)

    **Exemplary call**

    .. code-block:: yaml

        - node : InstanceSelection
          parameters :
              train_percentage_selected : 80
              test_percentage_selected : 100
              reduce_class : Standard

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/31
    """
    def __init__(self, train_percentage_selected=100,
                 test_percentage_selected=100, reduce_class=False, **kwargs):
        super(InstanceSelectionNode, self).__init__(**kwargs)
        self.set_permanent_attributes(
            train_percentage_selected=train_percentage_selected,
            test_percentage_selected=test_percentage_selected,
            reduce_class=reduce_class)

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        .. todo:: to document

        .. note::
              This method works differently in InstanceSelectionNode
              than in other nodes: Only *percentage_selected* of the
              available data are returned.
        """
        assert(self.input_node is not None)

        # Percentages above 100 are clamped (with an error log entry)
        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100."
                      % self.train_percentage_selected, level=logging.ERROR)
            self.train_percentage_selected = 100

        self._log("Data for training is requested.", level=logging.DEBUG)

        # Nothing to filter: delegate entirely to the base implementation
        if self.train_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_training(
                use_test_data)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of training data"
                      % self.train_percentage_selected, level=logging.DEBUG)
            # Seeded with run_number so the selection is reproducible per run
            r = random.Random(self.run_number)

            # Retain only *percentage_selected* percent of the data
            retained_instances = []
            for label, instances in all_instances.iteritems():
                # enable random choice of samples
                r.shuffle(instances)
                if not self.reduce_class or \
                        self.train_percentage_selected == 100:
                    # balanced reduction: same percentage for every class
                    end_index = int(round(len(instances) *
                                          self.train_percentage_selected / 100))
                elif not (self.reduce_class == label):
                    # reduce_class is set and this is a different class:
                    # keep everything
                    end_index = len(instances)
                else:  # self.reduce_class==label--> reduction needed
                    end_index = int(round(len(instances) *
                                          self.train_percentage_selected / 100))
                retained_instances.extend(zip(instances[0:end_index],
                                              [label]*end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)

            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that will
            # yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)
            self._log("Data for training finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        .. todo:: to document
        """
        assert(self.input_node is not None)

        # Percentages above 100 are clamped (with an error log entry)
        if self.test_percentage_selected > 100:
            self._log("Test percentage of %f reduced to 100."
                      % self.test_percentage_selected, level=logging.ERROR)
            self.test_percentage_selected = 100

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # Nothing to filter: delegate entirely to the base implementation
        if self.test_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_testing()

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert(not self.is_trainable() or
                   self.get_remaining_train_phase() == 0)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of test data"
                      % self.test_percentage_selected, level=logging.DEBUG)
            # Seeded with run_number so the selection is reproducible per run
            r = random.Random(self.run_number)

            # Retain only *percentage_selected* percent of the data
            retained_instances = []
            for label, instances in all_instances.iteritems():
                # enable random choice of samples
                r.shuffle(instances)
                if not self.reduce_class or \
                        self.test_percentage_selected == 100:
                    # balanced reduction: same percentage for every class
                    end_index = int(round(len(instances) *
                                          self.test_percentage_selected / 100))
                elif not (self.reduce_class == label):
                    # reduce_class is set and this is a different class:
                    # keep everything
                    end_index = len(instances)
                else:  # self.reduce_class==label--> reduction needed
                    end_index = int(round(len(instances) *
                                          self.test_percentage_selected / 100))
                retained_instances.extend(zip(instances[0:end_index],
                                              [label]*end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)

            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
            self._log("Data for testing finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def _execute(self, time_series):
        return time_series  # We don't do anything with the kept instances
class ConsumeTrainingDataNode(BaseNode):
    """ Split training data for internal usage and usage of successor nodes

    This node allows to handle situations where some model needs to be
    trained and later on evaluated on the given training data (using test
    data may not be allowed for certain reasons). Simply training and
    evaluating the model on the same data is not an option, since the
    evaluation would have a strong optimistic bias (model is well adapted
    to the data it was trained on).

    One example of such a situation is when a node chain is trained on the
    data that should be combined later on with an ensemble of node chains
    trained on historic data. The ensemble training should not happen on
    the same data as training.

    This node therefore splits the training data into two parts: one for
    internal use (training the model) and one for usage of successor nodes
    (model evaluation). The ratio of training data that should be used
    internally can be controlled with the argument *consumption_rate*
    (a value between 0.0 and 1.0).

    .. note:: When defining this node in the pySPACE YAML syntax,
              "wrapped_node" can be the definition of a node in YAML syntax
              (see below). The node object is then created automatically
              based on this definition.

    **Parameters**

      :wrapped_node:
          The node that is trained with the internally used training data.

      :consumption_rate:
          The rate of training data that is used internally for training
          *wrapped_node*. The remaining data is supplied for the successor
          nodes.

      :random_seed:
          The seed of the random generator. Defaults to 0.

    **Exemplary Call**

    .. code-block:: yaml

        - node: ConsumeTrainingData
          parameters :
              consumption_rate : 0.8
              wrapped_node :
                  node : Flow_Node
                  parameters :
                       input_dim : 64
                       output_dim : 1
                       nodes : ......

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/08/06
    """
    def __init__(self, wrapped_node, consumption_rate, random_seed=0,
                 *args, **kwargs):
        # Assign before calling super: necessary to determine whether this
        # node is trainable (is_trainable delegates to the wrapped node).
        self.wrapped_node = wrapped_node
        super(ConsumeTrainingDataNode, self).__init__(*args, **kwargs)

        self.set_permanent_attributes(wrapped_node=wrapped_node,
                                      consumption_rate=consumption_rate,
                                      internal_training_set=[],
                                      external_training_set=[],
                                      r=random.Random(random_seed))

    @staticmethod
    def node_from_yaml(node_spec):
        """ Creates a node based on the node_spec to overwrite default """
        # This node requires one parameter, namely the wrapped node
        assert("parameters" in node_spec
               and "wrapped_node" in node_spec["parameters"]),\
            "ConsumeTrainingDataNode requires specification of a wrapped node!"
        # Create the node that is wrapped inside this node
        wrapped_node = BaseNode.node_from_yaml(
            node_spec["parameters"]["wrapped_node"])
        node_spec["parameters"].pop("wrapped_node")
        # Create the node object
        node_obj = ConsumeTrainingDataNode(wrapped_node=wrapped_node,
                                           **node_spec["parameters"])
        return node_obj

    def is_trainable(self):
        """ Returns whether this node is trainable. """
        return self.wrapped_node.is_trainable()

    def is_supervised(self):
        """ Returns whether this node requires supervised training """
        return self.wrapped_node.is_supervised()

    def _get_train_set(self, use_test_data=False):
        """ Returns the data that can be used for training

        Splits the incoming training data per label into an internal part
        (fraction *consumption_rate*, used to train the wrapped node) and an
        external part (the rest, later handed to successor nodes). Returns
        the internal part.
        """
        # We take data that is provided by the input node for training
        # NOTE: This might involve training of the preceding nodes
        train_set = list(
            self.input_node.request_data_for_training(use_test_data))

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in train_set:
            all_instances[label].append(instance)

        # Split into training data used internally and training data that is
        # available for successor nodes
        self.internal_training_set = []
        self.external_training_set = []
        for label, instances in all_instances.items():
            # Shuffle with the node's seeded RNG for reproducible splits
            self.r.shuffle(instances)
            split_index = int(round(len(instances) * self.consumption_rate))
            self.internal_training_set.extend(
                zip(instances[:split_index], repeat(label)))
            self.external_training_set.extend(
                zip(instances[split_index:], repeat(label)))

        return self.internal_training_set

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        Only the external part of the training data (the part NOT consumed
        for training the wrapped node) is passed on to successor nodes.
        """
        assert (self.input_node is not None)

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence.
            # NOTE: a generator expression replaces the former
            # itertools.imap with a tuple-unpacking lambda, which is
            # Python-2-only syntax; behavior (lazy mapping) is unchanged.
            train_data_generator = ((self.execute(data), label)
                                    for data, label
                                    in self.external_training_set)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

            self._log("Data for training finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def _train(self, data, label):
        """ Trains the wrapped nodes on the given data vector *data* """
        self.wrapped_node.train(data, label)

    def _stop_training(self):
        """ Finish the training of the node."""
        self.wrapped_node.stop_training()

    def _execute(self, data):
        """ Executes the node on the given data vector *data* """
        return self.wrapped_node.execute(data)

    def store_state(self, result_dir, index=None):
        """ Stores this node in the given directory *result_dir*

        Delegates to the wrapped node.
        """
        # BUGFIX: forward the caller's *index* instead of hard-coding
        # index=None, which silently discarded the argument.
        self.wrapped_node.store_state(result_dir, index=index)

    def get_output_type(self, input_type, as_string=True):
        """ Return the output type

        The method calls the corresponding method in the wrapped node
        """
        return self.wrapped_node.get_output_type(input_type, as_string)
class CrossValidationSplitterNode(BaseNode):
    """ Perform (stratified) cross-validation

    During benchmarking, n pairs of training and test data are generated,
    where n is configurable via the parameter splits. The n test datasets
    are pairwise disjunct.

    Internally, the available data is partitioned into n pairwise disjunct
    sets s_1, ..., s_n of equal size (the "splits"). The i-th pair of
    training and test data is generated by using s_i as test data and the
    union of the remaining datasets as training data.

    The partitioning is stratified per default, i.e. the splits have the
    same class ratio as the overall dataset. Per default, the partitioning
    is based on shuffling the data randomly. In this case, the partitioning
    of the data into s_1, ..., s_n is determined solely based on the run
    number (used as random seed), yielding the same split for the same
    run_number and different ones for two different run_numbers.

    **Parameters**

      :splits:
          The number of splits created internally. If n data points exist
          and m splits are created, each of these splits consists of
          approx. m/n data points.

          (*optional, default: 10*)

      :stratified:
          If true, the cross-validation is stratified, i.e. the overall
          class-ratio is retained in each split (as good as possible).

          (*optional, default: True*)

      :random:
          If true, the order of the data is randomly shuffled.

          (*optional, default: True*)

      :time_dependent:
          If True splitting is done separately for different
          (= not overlapping) time windows to ensure that instances
          corresponding to the same marker will be in the same split.

          .. note:: Stratification is only allowed here if there is only
                    one class label for one marker.

          (*optional, default: False*)

      :stratified_class:
          If *time_dependent* is True and *stratified_class* is specified
          stratification is only done for the specified class label
          (String). The other class is filling the split preserving the
          time order of the data. This also means that *random* has no
          effect here.

          (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : CV_Splitter
          parameters :
              splits : 10
              stratified : True

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/12/16
    """
    def __init__(self, splits=10, stratified=True, random=True,
                 time_dependent=False, stratified_class=None,
                 *args, **kwargs):
        super(CrossValidationSplitterNode, self).__init__(*args, **kwargs)

        self.set_permanent_attributes(
            splits=int(splits),        # how many splits
            current_split=0,           # index of the split currently used for testing
            split_indices=None,        # list of test-index lists, built lazily
            run_number=-1,             # also serves as the random seed
            random=random,
            stratified=stratified,
            stratified_class=stratified_class,
            time_dependent=time_dependent)

    def is_split_node(self):
        """ Return whether this is a split node """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        if self.current_split + 1 < self.splits:
            self.current_split = self.current_split + 1
            self._log("Benchmarking with split %s/%s"
                      % (self.current_split + 1, self.splits))
            return True
        else:
            return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        .. todo:: to document
        """
        # Create cv-splits lazily when required
        if self.split_indices == None:
            self._create_splits()

        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in range(len(self.data))
            if not i in self.split_indices[self.current_split])

        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        .. todo:: to document
        """
        # Create cv-splits lazily when required
        if self.split_indices == None:
            self._create_splits()

        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices[self.current_split])

        return self.data_for_testing.fresh()

    def _create_splits(self):
        """ Create the split of the data for n-fold cross-validation

        Fills self.split_indices with one list of data indices per split;
        the indices in split j are the ones reserved for *testing* in
        cross-validation run j. Four strategies are implemented, selected
        by the flags time_dependent / stratified / stratified_class.
        """
        self._log("Creating %s splits for cross validation" % self.splits)

        # Get training and test data (with labels)
        train_data = \
            list(self.input_node.request_data_for_training(use_test_data=False))
        test_data = list(self.input_node.request_data_for_testing())

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is "
                            "already split)")

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data

        # initialize result structure: Determine which data points are
        # reserved for testing in which cross validation run
        split_indices = []
        if self.time_dependent:
            # sort the data according to start_time
            # (each element of self.data is a (window, label) pair; the
            # window carries start_time/end_time attributes)
            self.data.sort(key=lambda swindow: swindow[0].start_time)
            # divide the data with respect to the time_point:
            # consecutive windows that overlap in time are grouped under
            # one "marker" so they always land in the same split
            data_time = dict()
            last_window_end_time = 0.0
            marker = -1
            label_marker = dict()
            for (index, (window, label)) in enumerate(self.data):
                if window.start_time > last_window_end_time:
                    # gap to the previous window -> new marker group
                    marker += 1
                    data_time[marker] = [index]
                    if self.stratified or self.stratified_class:
                        # remember which markers belong to which label
                        if label not in label_marker:
                            label_marker[label] = [marker]
                        else:
                            label_marker[label].append(marker)
                else:
                    # overlapping window -> same marker group as predecessor
                    data_time[marker].append(index)
                    # check label consistency for later stratification
                    if (self.stratified or self.stratified_class) and \
                            self.data[data_time[marker][0]][1] != label:
                        import warnings
                        warnings.warn(
                            "Since there are several class labels"
                            " for one marker stratification is set to False.",
                            UserWarning)
                        self.stratified = False
                        self.stratified_class = None
                last_window_end_time = window.end_time
            #print "data_time: \n", data_time
            if self.stratified:
                # each marker has only one label
                # not more splits then markers of every class!
                assert (min(
                    [len(markers) for markers in label_marker.values()])
                    >= self.splits)
                # extend result structure since we need it in the next block
                split_indices = [[] for i in range(self.splits)]
                # determine the splits of the data: distribute the markers
                # of each label evenly over the splits
                for label, markers in label_marker.iteritems():
                    data_size = len(markers)
                    # Set random seed and randomize the order of the data
                    if self.random:
                        r = random.Random(self.run_number)
                        r.shuffle(markers)
                    for j in range(self.splits):
                        split_start = int(
                            round(float(j) * data_size / self.splits))
                        split_end = int(
                            round(float(j + 1) * data_size / self.splits))
                        # means half-open interval [split_start, split_end)
                        for i in range(split_start, split_end):
                            split_indices[j].extend(data_time[markers[i]])
                # avoid sorted labels by sorting time dependent
                split_indices = [
                    sorted(split_list) for split_list in split_indices
                ]
                #print "run_number:", self.run_number
                #print "time_dependent && stratified:\n", split_indices
            elif self.stratified_class:
                # extend result structure since we need it in the next block
                split_indices = [[] for i in range(self.splits)]
                # determine the splits of the data: only markers of
                # *stratified_class* are distributed evenly here
                data_size = len(label_marker[self.stratified_class])
                for j in range(self.splits):
                    split_start = int(round(
                        float(j) * data_size / self.splits))
                    split_end = int(
                        round(float(j + 1) * data_size / self.splits))
                    # means half-open interval [split_start, split_end)
                    for i in range(split_start, split_end):
                        split_indices[j].extend(
                            data_time[label_marker[self.stratified_class][i]])
                #print "time_dependent && stratified_class:\n before filling up\n", split_indices
                # fill up with other classes, preserving time order:
                # every non-stratified-class index up to the split's maximum
                # index is appended to that split
                last_max_index = 0
                for split_list in split_indices:
                    max_index = max(split_list)
                    for i in range(last_max_index, max_index):
                        if self.data[i][1] != self.stratified_class:
                            split_list.append(i)
                    last_max_index = max_index + 1
                # remaining trailing indices go into the last split
                for i in range(last_max_index, len(self.data)):
                    if self.data[i][1] != self.stratified_class:
                        split_indices[-1].append(i)
                # avoid sorted labels by sorting time dependent
                split_indices = [
                    sorted(split_list) for split_list in split_indices
                ]
                print "time_dependent && stratified_class:\n", split_indices
            else:
                # we should not have more splits then (marker)time points
                data_size = len(data_time.keys())
                assert (data_size >= self.splits)
                # Set random seed and randomize the order of the data
                indices = data_time.keys()
                if self.random:
                    r = random.Random(self.run_number)
                    r.shuffle(indices)
                # determine the splits of the data (marker-wise, so windows
                # sharing a marker stay together)
                for i in range(self.splits):
                    split_indices.append([])
                    split_start = int(round(
                        float(i) * data_size / self.splits))
                    split_end = int(
                        round(float(i + 1) * data_size / self.splits))
                    # means half-open interval [split_start, split_end)
                    for j in range(split_start, split_end):
                        split_indices[i].extend(data_time[indices[j]])
                # avoid sorted labels by sorting time dependent
                split_indices = [
                    sorted(split_list) for split_list in split_indices
                ]
                #for index, splitlist in enumerate(split_indices):
                #    print index, "first: ", self.data[splitlist[0]][0].start_time, ", last: ", self.data[splitlist[-1]][0].start_time, ", Laenge: ", len(data_time.keys())
                #print "time_dependent:\n", split_indices
        elif self.stratified:
            # Stratified cross-validation
            # divide the data with respect to the class_label
            data_labeled = dict()
            for (index, (window, label)) in enumerate(self.data):
                if not data_labeled.has_key(label):
                    data_labeled[label] = [index]
                else:
                    data_labeled[label].append(index)
            # we should not have more splits then instances of every class!
            min_nr_per_class = min(
                [len(data) for data in data_labeled.values()])
            if self.splits > min_nr_per_class:
                self.splits = min_nr_per_class
                self._log("Reducing number of splits to %s since no more "
                          "instances of one of the classes are available."
                          % self.splits, level=logging.CRITICAL)
            # extend result structure since we need it in the next block
            split_indices = [[] for i in range(self.splits)]
            # determine the splits of the data: distribute the instances of
            # each label evenly over the splits to retain the class ratio
            for label, indices in data_labeled.iteritems():
                data_size = len(indices)
                # Set random seed and randomize the order of the data
                if self.random:
                    r = random.Random(self.run_number)
                    r.shuffle(indices)
                for j in range(self.splits):
                    split_start = int(round(
                        float(j) * data_size / self.splits))
                    split_end = int(
                        round(float(j + 1) * data_size / self.splits))
                    # means half-open interval [split_start, split_end)
                    split_indices[j].extend(indices[split_start:split_end])
            # avoid sorted labels
            for j in range(self.splits):
                r = random.Random(self.run_number)
                r.shuffle(split_indices[j])
            # print "stratified:\n", split_indices

            # old trunk version
            # =================
            # data_size = len(self.data)
            # # Determine ratio of class1
            # instance_labels = map(lambda x: x[1], self.data)
            # classes = list(set(instance_labels))
            # assert (len(classes) == 2),\
            #     "Stratified cross-validation works currently only for "\
            #     "binary classification tasks."
            # class1_instances = instance_labels.count(classes[0])
            # class2_instances = instance_labels.count(classes[1])
            # if self.splits > min(class1_instances, class2_instances):
            #     self.set_permanent_attributes(splits = min(class1_instances,
            #                                                class2_instances))
            #     self._log("Reducing number of splits to %s since no more " \
            #               "instances of one of the classes are available."
            #               % self.splits)
            # class1_ratio = float(class1_instances) / data_size
            # # Determine which instances belong to which class
            # class1_indices = []
            # class2_indices = []
            # for index, instance_label in enumerate(instance_labels):
            #     if instance_label == classes[0]:
            #         class1_indices.append(index)
            #     else:
            #         class2_indices.append(index)
            #
            # # Randomize order
            # if self.random:
            #     r = random.Random(self.run_number)
            #     r.shuffle(class1_indices)
            #     r.shuffle(class2_indices)
            #
            # # Merge the two classes (such that they alternate in the appropriate
            # # frequency)
            # indices = []
            # n = 0 # class1 counter
            # for i in range(data_size):
            #     if i == round((n + 0.5) / class1_ratio):
            #         indices.append(class1_indices.pop())
            #         n += 1
            #     else:
            #         indices.append(class2_indices.pop())
        else:
            # Non-stratified cross-validation
            data_size = len(self.data)

            # We cannot have more splits than data points
            assert (data_size >= self.splits)

            # Set random seed and randomize the order of the data
            indices = range(data_size)
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(indices)

            # Determine the splits of the data
            for i in range(self.splits):
                split_start = int(round(float(i) * data_size / self.splits))
                split_end = int(round(float(i + 1) * data_size / self.splits))
                # means half-open interval [split_start, split_end)
                split_indices.append(indices[split_start:split_end])

        self.split_indices = split_indices

        self._log("Benchmarking with split %s/%s"
                  % (self.current_split + 1, self.splits))
class SimpleSourceTemplateNode(BaseNode):
    """ A simple template that illustrates the basic principles of a source node

    In `pySPACE`, source nodes are used at the beginning of the node chain.
    The source nodes are responsible for the input of data, be it from a
    static source or from a live stream. It is very important to note that
    these nodes just serve the purpose of providing the node chain with an
    input dataset and do not perform any changes on the data itself.

    That being said, these nodes **do not** have an **input node** and are
    **not trainable**!

    In the following we will discuss the general strategy for building a new
    source node for a static input data set which has been saved to disk.
    In the case of more complicated inputs, please consult the documentation
    of
    :mod:`~pySPACE.missions.nodes.source.external_generator_source.ExternalGeneratorSourceNode`
    and
    :mod:`~pySPACE.missions.nodes.source.time_series_source.Stream2TimeSeriesSourceNode`
    """

    def __init__(self, **kwargs):
        """ Initialize some values to 0 or `None`

        The initialization routine of the source node is basically completely
        empty. Should you feel the need to do something in this part of the
        code, you can initialize the ``input_dataset`` to ``None``. This
        attribute will then later be changed when the ``set_input_dataset``
        method is called.

        If the user wants to generate the dataset inside the SourceNode,
        this should be done in the ``__init__`` method though. A good example
        of this practice can be found in the
        :mod:`~pySPACE.missions.nodes.source.random_time_series_source.RandomTimeSeriesSourceNode`
        """
        super(SimpleSourceTemplateNode, self).__init__(**kwargs)
        self.set_permanent_attributes(dataset=None)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data

        This method is the beginning of the node. Put simply, this method
        starts the feeding process of your node chain by telling the node
        chain where to get the data from.
        """
        self.set_permanent_attributes(dataset=dataset)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        This method streams training data and sends it to the subsequent
        nodes. If one looks at the tutorial related to building new nodes
        (available in the tutorial section), one can see exactly where the
        ``request_data`` methods are put to use.

        The following example is one that was extracted from the
        :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
        which should (in theory at least) be implementable for all types of
        data.
        """
        if not use_test_data:
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "train")
            else:
                key = (0, self.current_split, "train")
            # Check if there is training data for the current split and run
            # (membership test directly on the dict instead of .keys())
            if key in self.dataset.data:
                self._log("Accessing input dataset's training feature vector windows.")
                self.data_for_training = MemoizeGenerator(
                    self.dataset.get_data(*key).__iter__(),
                    caching=self.caching)
            else:
                # Returns an iterator that iterates over an empty sequence
                # (i.e. an iterator that is immediately exhausted), since
                # this node does not provide any data that is explicitly
                # dedicated for training
                self._log("No training data available.")
                self.data_for_training = MemoizeGenerator(iter([]),
                                                          caching=self.caching)
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The principle of obtaining the testing data are the same as the
        principles used in obtaining the training data set. The only
        difference here is that, in the case in which there is no testing
        data available, we allow for the training data to be used as testing
        data.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = self.dataset.get_data(*key).__iter__()

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def getMetadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node

        At some point in time, you might need to know the metadata of some
        specific input in your input and this is when you would use this
        method.
        """
        # NOTE(review): camelCase name kept for backward compatibility with
        # existing callers; PEP 8 would prefer get_metadata.
        return self.dataset.meta_data.get(key)

    def use_next_split(self):
        """ Return False

        The method will always return `False` since the SourceNode should
        (in the case of more than 1 split) execute the splits in parallel
        and not in series.
        """
        return False
def request_data_for_training(self, use_test_data): """ Returns data for training of subsequent nodes .. todo:: to document .. note:: This method works differently in InstanceSelectionNode than in other nodes: Only *percentage_selected* of the available data are returned. """ assert(self.input_node is not None) if self.train_percentage_selected > 100: self._log("Train percentage of %f reduced to 100." % self.train_percentage_selected, level=logging.ERROR) self.train_percentage_selected = 100 self._log("Data for training is requested.", level=logging.DEBUG) if self.train_percentage_selected == 100: return super(InstanceSelectionNode, self).request_data_for_training( use_test_data) # If we haven't computed the data for training yet if self.data_for_training is None: self._log("Producing data for training.", level=logging.DEBUG) # Train this node self.train_sweep(use_test_data) # Divide available instances according to label all_instances = defaultdict(list) for instance, label in self.input_node.request_data_for_training( use_test_data): all_instances[label].append(instance) self._log("Keeping only %s percent of training data" % self.train_percentage_selected, level=logging.DEBUG) r = random.Random(self.run_number) # Retain only *percentage_selected* percent of the data retained_instances = [] for label, instances in all_instances.iteritems(): # enable random choice of samples r.shuffle(instances) if not self.reduce_class or \ self.train_percentage_selected == 100: end_index = int(round(len(instances) * self.train_percentage_selected / 100)) elif not (self.reduce_class == label): end_index = len(instances) else: # self.reduce_class==label--> reduction needed end_index = int(round(len(instances) * self.train_percentage_selected / 100)) retained_instances.extend(zip(instances[0:end_index], [label]*end_index)) # mix up samples between the different labels r.shuffle(retained_instances) # Compute a generator the yields the train data and # encapsulate it in an object that 
memoizes its outputs and # provides a "fresh" method that returns a new generator that will # yield the same sequence train_data_generator = ((self.execute(data), label) for (data, label) in retained_instances) self.data_for_training = MemoizeGenerator(train_data_generator, caching=self.caching) self._log("Data for training finished", level=logging.DEBUG) # Return a fresh copy of the generator return self.data_for_training.fresh()
class CrossValidationSplitterNode(BaseNode):
    """ Perform (stratified) cross-validation

    During benchmarking, n pairs of training and test data are generated,
    where n is configurable via the parameter splits. The n test datasets
    are pairwise disjunct. Internally, the available data is partitioned
    into n pairwise disjunct sets s_1, ..., s_n of equal size (the
    "splits"). The i-th pair of training and test data is generated by
    using s_i as test data and the union of the remaining datasets as
    training data.

    The partitioning is stratified per default, i.e. the splits have the
    same class ratio as the overall dataset. Per default, the partitioning
    is based on shuffling the data randomly. In this case, the partitioning
    of the data into s_1, ..., s_n is determined solely based on the run
    number (used as random seed), yielding the same split for the same
    run_number and different ones for two different run_numbers.

    **Parameters**

      :splits:
          The number of splits created internally. If n data points exist
          and m splits are created, each of these splits consists of
          approx. m/n data points.

          (*optional, default: 10*)

      :stratified:
          If true, the cross-validation is stratified, i.e. the overall
          class-ratio is retained in each split (as good as possible).

          (*optional, default: True*)

      :random:
          If true, the order of the data is randomly shuffled.

          (*optional, default: True*)

      :time_dependent:
          If True splitting is done separately for different (= not
          overlapping) time windows to ensure that instances corresponding
          to the same marker will be in the same split.

          .. note:: Stratification is only allowed here if there is only
                    one class label for one marker.

          (*optional, default: False*)

      :stratified_class:
          If *time_dependent* is True and *stratified_class* is specified
          stratification is only done for the specified class label
          (String). The other class is filling the split preserving the
          time order of the data. This also means that *random* has no
          effect here.

          (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : CV_Splitter
          parameters :
              splits : 10
              stratified : True

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/12/16
    """
    def __init__(self, splits=10, stratified=True, random=True,
                 time_dependent=False, stratified_class = None,
                 *args, **kwargs):
        super(CrossValidationSplitterNode, self).__init__(*args, **kwargs)
        self.set_permanent_attributes(splits = int(splits), # how many splits
                                      current_split = 0, # current split for testing
                                      split_indices = None, # computed lazily
                                      run_number = -1,
                                      random = random,
                                      stratified = stratified,
                                      stratified_class = stratified_class,
                                      time_dependent = time_dependent)

    def is_split_node(self):
        """ Return whether this is a split node """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        if self.current_split + 1 < self.splits:
            self.current_split = self.current_split + 1
            self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                         self.splits))
            return True
        else:
            return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        Everything NOT reserved for testing by the current cv-split is
        training data.
        """
        # Create cv-splits lazily when required
        # NOTE(review): `== None` should be `is None` per PEP 8.
        if self.split_indices == None:
            self._create_splits()
        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in range(len(self.data))
            if not i in self.split_indices[self.current_split])
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        Only the indices reserved by the current cv-split are test data.
        """
        # Create cv-splits lazily when required
        if self.split_indices == None:
            self._create_splits()
        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices[self.current_split])
        return self.data_for_testing.fresh()

    def _create_splits(self):
        """ Create the split of the data for n-fold cross-validation """
        self._log("Creating %s splits for cross validation" % self.splits)

        # Get training and test data (with labels)
        train_data = \
            list(self.input_node.request_data_for_training(use_test_data=False))
        test_data = list(self.input_node.request_data_for_testing())

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is "
                            "already split)")

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data

        # initialize result structure: Determine which data points are
        # reserved for testing in which cross validation run
        split_indices = []
        if self.time_dependent:
            # sort the data according to start_time
            self.data.sort(key=lambda swindow: swindow[0].start_time)
            # divide the data with respect to the time_point
            # data_time: marker -> list of data indices belonging to that
            # (non-overlapping) time window group
            data_time = dict()
            last_window_end_time = 0.0
            marker = -1
            label_marker = dict()  # label -> list of markers with that label
            for (index, (window, label)) in enumerate(self.data):
                if window.start_time > last_window_end_time:
                    # window starts after the previous one ended: new marker
                    marker += 1
                    data_time[marker] = [index]
                    if self.stratified or self.stratified_class:
                        if label not in label_marker:
                            label_marker[label] = [marker]
                        else:
                            label_marker[label].append(marker)
                else:
                    # overlapping window: belongs to the current marker
                    data_time[marker].append(index)
                    # check label consistency for later stratification
                    if (self.stratified or self.stratified_class) and \
                            self.data[data_time[marker][0]][1] != label:
                        import warnings
                        warnings.warn(
                            "Since there are several class labels"
                            " for one marker stratification is set to False.",
                            UserWarning)
                        self.stratified = False
                        self.stratified_class = None
                last_window_end_time = window.end_time
            #print "data_time: \n", data_time
            if self.stratified:
                # each marker has only one label
                # not more splits then markers of every class!
                assert(min([len(markers) for markers
                            in label_marker.values()]) >= self.splits)
                # extend result structure since we need it in the next block
                split_indices = [[] for i in range(self.splits)]
                # determine the splits of the data
                for label, markers in label_marker.iteritems():
                    data_size = len(markers)
                    # Set random seed and randomize the order of the data
                    if self.random:
                        r = random.Random(self.run_number)
                        r.shuffle(markers)
                    for j in range(self.splits):
                        split_start = int(round(float(j) * data_size/self.splits))
                        split_end = int(round(float(j+1) * data_size/self.splits))
                        # means half-open interval [split_start, split_end)
                        for i in range(split_start, split_end):
                            split_indices[j].extend(data_time[markers[i]])
                # avoid sorted labels by sorting time dependent
                split_indices = [sorted(split_list)
                                 for split_list in split_indices]
                #print "run_number:", self.run_number
                #print "time_dependent && stratified:\n", split_indices
            elif self.stratified_class:
                # extend result structure since we need it in the next block
                split_indices = [[] for i in range(self.splits)]
                # determine the splits of the data
                data_size = len(label_marker[self.stratified_class])
                for j in range(self.splits):
                    split_start = int(round(float(j) * data_size/self.splits))
                    split_end = int(round(float(j+1) * data_size/self.splits))
                    # means half-open interval [split_start, split_end)
                    for i in range(split_start, split_end):
                        split_indices[j].extend(
                            data_time[label_marker[self.stratified_class][i]])
                #print "time_dependent && stratified_class:\n before filling up\n", split_indices
                # fill up with other classes (preserving time order)
                last_max_index = 0
                for split_list in split_indices:
                    max_index = max(split_list)
                    for i in range(last_max_index, max_index):
                        if self.data[i][1] != self.stratified_class:
                            split_list.append(i)
                    last_max_index = max_index+1
                # remaining tail goes into the last split
                for i in range(last_max_index, len(self.data)):
                    if self.data[i][1] != self.stratified_class:
                        split_indices[-1].append(i)
                # avoid sorted labels by sorting time dependent
                split_indices = [sorted(split_list)
                                 for split_list in split_indices]
                # NOTE(review): leftover debug output to stdout (Python-2
                # print statement) — should probably be removed or routed
                # through self._log.
                print "time_dependent && stratified_class:\n", split_indices
            else:
                # we should not have more splits then (marker)time points
                data_size = len(data_time.keys())
                assert(data_size >= self.splits)
                # Set random seed and randomize the order of the data
                indices = data_time.keys()
                if self.random:
                    r = random.Random(self.run_number)
                    r.shuffle(indices)
                # determine the splits of the data
                for i in range(self.splits):
                    split_indices.append([])
                    split_start = int(round(float(i) * data_size / self.splits))
                    split_end = int(round(float(i + 1) * data_size / self.splits))
                    # means half-open interval [split_start, split_end)
                    for j in range(split_start,split_end):
                        split_indices[i].extend(data_time[indices[j]])
                # avoid sorted labels by sorting time dependent
                split_indices = [sorted(split_list)
                                 for split_list in split_indices]
                #for index, splitlist in enumerate(split_indices):
                #    print index, "first: ", self.data[splitlist[0]][0].start_time, ", last: ", self.data[splitlist[-1]][0].start_time, ", Laenge: ", len(data_time.keys())
                #print "time_dependent:\n", split_indices
        elif self.stratified:
            # Stratified cross-validation
            # divide the data with respect to the class_label
            data_labeled = dict()
            for (index, (window, label)) in enumerate(self.data):
                if not data_labeled.has_key(label):
                    data_labeled[label] = [index]
                else:
                    data_labeled[label].append(index)
            # we should not have more splits then instances of every class!
            min_nr_per_class = min([len(data)
                                    for data in data_labeled.values()])
            if self.splits > min_nr_per_class:
                self.splits = min_nr_per_class
                self._log("Reducing number of splits to %s since no more "
                          "instances of one of the classes are available."
                          % self.splits, level=logging.CRITICAL)
            # extend result structure since we need it in the next block
            split_indices = [[] for i in range(self.splits)]
            # determine the splits of the data
            for label, indices in data_labeled.iteritems():
                data_size = len(indices)
                # Set random seed and randomize the order of the data
                if self.random:
                    r = random.Random(self.run_number)
                    r.shuffle(indices)
                for j in range(self.splits):
                    split_start = int(round(float(j) * data_size/self.splits))
                    split_end = int(round(float(j+1) * data_size/self.splits))
                    # means half-open interval [split_start, split_end)
                    split_indices[j].extend(indices[split_start: split_end])
            # avoid sorted labels
            for j in range(self.splits):
                r = random.Random(self.run_number)
                r.shuffle(split_indices[j])
            # print "stratified:\n", split_indices
            # (A large commented-out "old trunk version" of binary-only
            # stratified splitting used to live here; it is superseded by the
            # multi-class implementation above — see version control.)
        else:
            # Non-stratified cross-validation
            data_size = len(self.data)
            # We cannot have more splits than data points
            assert(data_size >= self.splits)
            # Set random seed and randomize the order of the data
            indices = range(data_size)
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(indices)
            # Determine the splits of the data
            for i in range(self.splits):
                split_start = int(round(float(i) * data_size / self.splits))
                split_end = int(round(float(i + 1) * data_size / self.splits))
                # means half-open interval [split_start, split_end)
                split_indices.append(indices[split_start: split_end])

        self.split_indices = split_indices
        self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                     self.splits))
class ReduceOverrepresentedClassNode(BaseNode):
    """ Reject instances to balance categories for classification

    The node forwards only a reduced number of the training and test
    instances of the bigger class to get a balanced ratio of the classes.
    The forwarded instances are selected randomly. All data of the
    underrepresented class is forwarded.

    **Parameters**

    **Exemplary call**

    .. code-block:: yaml

        - node : Reduce_Overrepresented_Class

    :Author: Hendrik Woehrle ([email protected])
    :Created: 2010/09/22
    """
    def __init__(self, **kwargs):
        super(ReduceOverrepresentedClassNode, self).__init__(**kwargs)

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        Instances of the overrepresented class(es) are randomly rejected
        (see :meth:`balance_instances`) so that every class contributes
        equally many training instances.
        """
        assert(self.input_node is not None)

        # Fixed copy-paste bugs: the log messages below wrongly referred to
        # "testing" although this is the training path (a redundant
        # "Producing data for testing." message was removed as well).
        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)

            retained_instances = self.balance_instances(all_instances)

            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

            self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        The test data is balanced exactly like the training data.
        """
        assert(self.input_node is not None)

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert(not self.is_trainable() or
                   self.get_remaining_train_phase() == 0)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)

            retained_instances = self.balance_instances(all_instances)

            # Compute a generator that yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

            self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def _execute(self, time_series):
        # We don't do anything with the kept instances
        return time_series

    def balance_instances(self, all_instances):
        """Method that performs the rejections of the data in the oversized
        class

        :param all_instances: dict mapping class label to the list of
            instances of that class.
        :returns: a shuffled list of ``(instance, label)`` pairs containing
            the same number of instances for every class (the size of the
            smallest class).  Shuffling is seeded with ``self.run_number``
            for reproducibility.
        """
        # it is supposed to have a binary classifier, e.g. to have exactly
        # 2 classes
        #if not len(all_instances.keys())==2:
        #    raise ValueError("Too many classes: only binary classification supported")

        retained_instances = []

        # count the number of instances per class; starting from +inf makes
        # the min() below well-defined even before the first class is seen
        min_num_instances_per_class = float("+inf")
        for label, instances in all_instances.iteritems():
            min_num_instances_per_class = min(min_num_instances_per_class,
                                              len(instances))

        r = random.Random(self.run_number)

        # retain only the number of instances that corresponds
        # to the size of smaller class
        for label, instances in all_instances.iteritems():
            r.shuffle(instances)
            retained_instances.extend(
                zip(instances[0:min_num_instances_per_class],
                    [label]*min_num_instances_per_class))

        r.shuffle(retained_instances)
        return retained_instances
def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes

    Only *test_percentage_selected* percent of the available test data is
    forwarded; the rest is rejected.  Selection is randomized with
    ``self.run_number`` as seed.  If *reduce_class* is set, only that class
    is reduced and the other classes are forwarded completely.
    """
    assert(self.input_node is not None)

    # Clamp nonsensical percentages (>100) down to 100.
    if self.test_percentage_selected > 100:
        self._log("Test percentage of %f reduced to 100."
                  % self.test_percentage_selected, level=logging.ERROR)
        self.test_percentage_selected = 100

    self._log("Data for testing is requested.", level=logging.DEBUG)

    # Fast path: nothing to reject, delegate to the default implementation.
    if self.test_percentage_selected == 100:
        return super(InstanceSelectionNode, self).request_data_for_testing()

    # If we haven't computed the data for testing yet
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0)

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_testing():
            all_instances[label].append(instance)

        self._log("Keeping only %s percent of test data"
                  % self.test_percentage_selected, level=logging.DEBUG)
        # Deterministic RNG: seeded with the run number so the selection is
        # reproducible per run.
        r = random.Random(self.run_number)
        # Retain only *percentage_selected* percent of the data
        retained_instances = []
        for label, instances in all_instances.iteritems():
            # enable random choice of samples
            r.shuffle(instances)
            # NOTE(review): the `== 100` disjunct below is unreachable —
            # the method already returned early in that case.
            if not self.reduce_class or \
                    self.test_percentage_selected == 100:
                # NOTE(review): with integer percentages this is Python-2
                # floor division before round() — presumably intended.
                end_index = int(round(len(instances) *
                                      self.test_percentage_selected / 100))
            elif not (self.reduce_class == label):
                # Not the class to be reduced: keep everything.
                end_index = len(instances)
            else:  # self.reduce_class==label--> reduction needed
                end_index = int(round(len(instances) *
                                      self.test_percentage_selected / 100))

            retained_instances.extend(zip(instances[0:end_index],
                                          [label]*end_index))
        # mix up samples between the different labels
        r.shuffle(retained_instances)

        # Compute a generator the yields the test data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence
        self._log("Producing data for testing.", level=logging.DEBUG)
        test_data_generator = ((self.execute(data), label)
                               for (data, label) in retained_instances)

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

        self._log("Data for testing finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
class TransferSplitterNode(BaseNode):
    """ Allow to split data into training and test data sets according to
    different window definitions

    Splits the available data into disjunct training and test sets. The
    transfer of different training and test window definitions is supported.
    The node was implemented with several use cases in mind:

    - The training set contains instances of 'Standard' and 'Target' stimuli
      but the test set of 'Target' and 'MissedTarget' stimuli.

    - The training set contains instances of 'LRP' with different training
      times and 'NoLRPs', but the test set should contain sliding windows.
      Cross validation should be supported to use the node together with
      parameter optimization node.

    - The use of merged data sets should be possible.

    **Parameters**

      :wdefs_train:
          A list with window definition names (specified in the window spec
          file when the raw data was segmented). All windows that belong to
          one of the window definition are considered when the training
          set(s) is(/are) determined.

      :wdefs_test:
          A list with window definition names (specified in the window spec
          file when the raw data was segmented). All windows that belong to
          one of the window definition are considered when the testing
          set(s) is(/are) determined.

      :split_method:
          One of the following Strings: 'all_data', 'time', 'count',
          'set_flag'.

          - all_data :
                All possible data is used in every split. This results in
                splitting only window definitions that occur in both,
                *wdefs_train* AND *wdefs_test*. Window definitions that only
                occur in either *wdefs_train* or *wdefs_test* are retained
                in every split.
          - time :
                The data is sorted and split according to time. For that
                (*start_time* of last window - *start_time* of first
                window)/*nr_of_splits*) is determined. Since time in eeg
                data is relative for every set, ensure that each input
                collection consists only of one data set (is not a merge of
                several sets) or that the change_option has been used.
          - count :
                The data is split according to *num_split_instances*. By
                default only windows specified in both, *wdefs_train* and
                *wdefs_test*, are count. With the parameter *wdefs_split*
                window definition that are count can be specified. If
                *num_split_instances* is not specified, *splits* determines
                how many instances of *wdefs_split* are in one split.
          - set_flag :
                When the data has been merged with the concatenate operation
                before, a flag 'new_set' has been inserted to the time
                series specs. Splits are based on this flag, i.e. the splits
                behave like a inter-set cross validation. For example you
                merged 3 sets: 'A', 'B', 'C', then there are 3 splits
                generated: 'A'+'B' vs 'C', 'A'+'C' vs 'B' and 'B'+'C' vs
                'A'.

      :random:
          If True, the data is randomized before splitting.

          .. note:: It is not guaranteed that overlapping windows will be in
                    the same split for split methods 'time' and 'all_data'!

          (*optional, default: False*)

      :splits:
          The number of splits created internally and the number of
          train-test pairs.

          (*optional, default: 10*)

      :num_split_instances:
          If *split_method* is 'count', *num_split_instances* specifies how
          many instances will be in one split. After splitting one split is
          evaluated according to *wdefs_test* for the test data set and the
          remaining splits according to *wdefs_train*. The test split is
          iterated. If the total number of instances that are count is not
          divisible by *num_split_instances* the last split will contain
          the remaining instances. If in addition *splits* is set to 1,
          only one train-test pair is created with *num_split_instances*
          in the training set.

          (*optional, default: None*)

      :wdefs_split:
          A list with window definition names (specified in the window spec
          file when the raw data was segmented). All windows that belong to
          one of the window definition are counted when *split_method* was
          set to 'count'.

          (*optional, default: None*)

      :reverse:
          If this option is True, the data is split in reverse ordering.

          (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        - node : TransferSplitter
          parameters :
              wdefs_train : ['s2', 's1']
              wdefs_test : ['s5', 's2']
              split_method : "all_data"
              splits : 5

    :Author: Anett Seeland ([email protected])
    :Created: 2011/04/10
    :LastChange: 2011/11/14 (traintest functionality)
    """
    def __init__(self, wdefs_train, wdefs_test, split_method,
                 wdefs_train_test = None, splits=10, random=False,
                 num_split_instances=None, wdefs_split=None, reverse=False,
                 sort=False, *args, **kwargs):
        super(TransferSplitterNode, self).__init__(*args, **kwargs)
        if wdefs_train_test == None:
            # Bug fix: a trailing comma here used to turn wdefs_train_test
            # into a 1-tuple CONTAINING the list, so later membership tests
            # like `wdef_name in self.wdefs_train_test` always failed.
            wdefs_train_test = [wdef for wdef in
                                wdefs_train if wdef in wdefs_test]
        self.set_permanent_attributes(wdefs_train = wdefs_train,
                                      wdefs_test = wdefs_test,
                                      split_method = split_method,
                                      splits = splits,
                                      random = random,
                                      num_split_instances = num_split_instances,
                                      wdefs_split = wdefs_split,
                                      reverse = reverse,
                                      sort = sort,
                                      current_split = 0,
                                      wdefs_train_test = wdefs_train_test,
                                      split_indices_train = None,
                                      split_indices_test = None)

    def is_split_node(self):
        """ Returns whether this is a split node. """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        if self.current_split + 1 < self.splits:
            self.current_split = self.current_split + 1
            self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                         self.splits))
            return True
        else:
            return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the training data of the current split. """
        # Create split lazily when required
        if self.split_indices_train == None:
            self._create_split()
        # Create training data generator
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in self.split_indices_train[self.current_split])
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the test data of the current split. """
        # Create split lazily when required
        if self.split_indices_test == None:
            self._create_split()
        # Create test data generator
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices_test[self.current_split])
        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data. """
        self._log("Splitting data into train and test data")
        # Get training and test data
        # note: return the data in a list can double the memory requirements!
        train_data = list(self.input_node.request_data_for_training(
            use_test_data = False))
        test_data = list(self.input_node.request_data_for_testing())

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain.
        if len(train_data) > 0:
            if len(test_data)==0:
                # If there was an All_Train_Splitter before, filter according
                # to wdef_train and return all training data
                self.split_indices_train = \
                    [[ind for ind, (win, lab) in enumerate(train_data)
                      if win.specs['wdef_name'] in self.wdefs_train]]
                self.split_indices_test = [[]]
                self.splits = 1
                self.data = train_data
                self._log("Using all data for training.")
                return
            else:
                raise Exception("No iterated splitting of data sets allowed\n "
                                "(Calling a splitter on a data set that is already "
                                "splitted)")

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data
        del train_data, test_data

        if self.reverse:
            self.data = self.data[::-1]

        # sort the data according to the start time
        if self.sort or self.split_method == 'time':
            self.data.sort(key=lambda swindow: swindow[0].start_time)

        # randomize the data if needed
        if self.random:
            r = random.Random(self.run_number)
            if self.split_method == 'set_flag':
                self.random = False # TODO: log this
            elif self.split_method == 'count':
                if self.wdefs_split == None:
                    self.wdefs_split = self.wdefs_train_test
                # divide the data with respect to the time
                data_time = dict()
                marker = -1
                last_window_endtime = 0
                for ind, (win, lab) in enumerate(self.data):
                    if win.start_time < last_window_endtime:
                        # overlapping windows or start of a new set
                        if win.end_time < last_window_endtime: # new set
                            marker += 1
                            data_time[marker]=[(win,lab)]
                        else: # overlapping windows
                            data_time[marker].append((win,lab))
                    else:
                        marker += 1
                        data_time[marker]=[(win,lab)]
                    last_window_endtime = win.end_time
                # randomize order of events by simultaneously keep the order
                # of sliding windows in each event
                data_random = data_time.values()
                r.shuffle(data_random)
                self.data = []
                for l in data_random:
                    self.data.extend(l)
                del data_random, data_time, l
            else:
                r.shuffle(self.data)

        if self.split_method == 'all_data':
            # divide the data with respect to *wdef_train*, *wdef_test* and
            # *wdef_train_test*
            wdef_data = {'wdef_train_test':[],'wdef_train':[],'wdef_test':[]}
            class_labels = []
            for (index, (window, label)) in enumerate(self.data):
                if window.specs['wdef_name'] in self.wdefs_train_test:
                    wdef_data['wdef_train_test'].append(index)
                    if label not in class_labels:
                        class_labels.append(label)
                elif window.specs['wdef_name'] in self.wdefs_train:
                    wdef_data['wdef_train'].append(index)
                elif window.specs['wdef_name'] in self.wdefs_test:
                    wdef_data['wdef_test'].append(index)
                else:
                    import warnings
                    warnings.warn("Found window definition %s, which is "
                                  "neither in *wdefs_train* nor in "
                                  "*wdefs_test*. Window %s will be ignored!"
                                  % (window.specs['wdef_name'],window.tag))
            # check if splitting makes sense
            if wdef_data['wdef_train_test']==[] and self.splits>1:
                raise Exception('No instances to split, i.e train-test window'
                                ' definitions are disjunct!')
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            # calculate splits
            if wdef_data['wdef_train_test']!=[]:
                data_size = len(wdef_data['wdef_train_test'])
                # ensure stratified splits if there are several classes
                if len(class_labels)>1:
                    # divide the data with respect to the class_label
                    data_labeled = dict()
                    for index in wdef_data['wdef_train_test']:
                        if not data_labeled.has_key(self.data[index][1]):
                            data_labeled[self.data[index][1]] = [index]
                        else:
                            data_labeled[self.data[index][1]].append(index)
                    # have not more splits than instances of every class!
                    min_nr_per_class = min([len(data) for data in
                                            data_labeled.values()])
                    if self.splits > min_nr_per_class:
                        self.splits = min_nr_per_class
                        self._log("Reducing number of splits to %s since no "
                                  "more instances of one of the classes are "
                                  "available." % self.splits)
                    # determine the splits of the data
                    for label, indices in data_labeled.iteritems():
                        data_size = len(indices)
                        for j in range(self.splits):
                            split_start = \
                                int(round(float(j)*data_size/self.splits))
                            split_end = \
                                int(round(float(j+1)*data_size/self.splits))
                            split_indices_test[j].extend(
                                [i for i in indices[split_start: split_end]
                                 if self.data[i][0].specs['wdef_name']
                                 in self.wdefs_test])
                            split_indices_train[j].extend(
                                [i for i in indices
                                 if i not in split_indices_test[j]])
                else: # len(class_labels) == 1
                    # have not more splits than instances!
                    if self.splits > data_size:
                        self.splits = data_size
                        self._log("Reducing number of splits to %s since no "
                                  "more instances of one of the classes are "
                                  "available." % self.splits)
                    # determine the splits of the data
                    for j in range(self.splits):
                        split_start = \
                            int(round(float(j)*data_size/self.splits))
                        split_end = \
                            int(round(float(j+1)*data_size/self.splits))
                        # means half-open interval [split_start, split_end)
                        split_indices_test[j].extend(
                            wdef_data['wdef_train_test'][split_start:split_end])
                        split_indices_train[j].extend(
                            [i for i in wdef_data['wdef_train_test']
                             if i not in split_indices_test[j]])
            # windows exclusive to train (test) are used in every split
            for i in range(self.splits):
                split_indices_train[i].extend(wdef_data['wdef_train'])
                split_indices_test[i].extend(wdef_data['wdef_test'])
        elif self.split_method == 'time':
            first_window_start = self.data[0][0].start_time
            last_window_start = self.data[-1][0].start_time
            # ensure, that time can never be greater than self.splits*time!
            time = round((last_window_start-first_window_start)
                         / self.splits+0.5)
            # divide the data according to the time
            data_time = {0: []}
            time_fold = 0
            for (index, (window, label)) in enumerate(self.data):
                if window.start_time > time_fold*time+time:
                    time_fold += 1
                    data_time[time_fold]=[index]
                else:
                    data_time[time_fold].append(index)
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            for i in range(self.splits):
                split_indices_test[i].extend(
                    [index for index in data_time[i]
                     if self.data[index][0].specs['wdef_name']
                     in self.wdefs_test])
                for j in range(self.splits):
                    split_indices_train[i].extend(
                        [index for index in data_time[j]
                         if j != i and self.data[index][0].specs['wdef_name']
                         in self.wdefs_train])
        elif self.split_method == 'count':
            if self.wdefs_split == None:
                self.wdefs_split = self.wdefs_train_test
            # Bug fix: `l` (total number of counted windows) was only
            # computed when num_split_instances was None, but it is also
            # read below when splits != 1 — which raised a NameError
            # whenever num_split_instances was user-supplied.
            l = len([ind for ind, (win, lab) in enumerate(self.data)
                     if win.specs['wdef_name'] in self.wdefs_split])
            if self.num_split_instances == None:
                self.num_split_instances = round(float(l)/self.splits)
            # divide the data according to *num_split_instances*
            data_count = {0:[]}
            count = -1
            count_fold = 0
            if self.splits == 1 and \
                    len([i for i in range(len(self.data))
                         if self.data[i][0].specs['wdef_name']
                         in self.wdefs_split]) == self.num_split_instances:
                train_end = len(self.data)
            else:
                for (ind, (win, lab)) in enumerate(self.data):
                    #print ind, win.specs['wdef_name'], lab
                    if win.specs['wdef_name'] in self.wdefs_split:
                        count += 1
                        if self.splits == 1 and \
                                count == self.num_split_instances:
                            train_end = ind
                            break
                        if count != 0 and \
                                count % self.num_split_instances == 0:
                            count_fold += 1
                            data_count[count_fold] = [ind]
                        else:
                            data_count[count_fold].append(ind)
                    else:
                        data_count[count_fold].append(ind)
            if self.splits != 1:
                # self.num_split_instances*self.splits < l, but in the case
                # when only num_split_instances is specified we can not trust
                # self.splits
                if len(data_count.keys()) == self.splits+1 or \
                        (len(data_count.keys())-1)*self.num_split_instances > l:
                    data_count[count_fold-1].extend(data_count[count_fold])
                    del data_count[count_fold]
                self.splits = len(data_count.keys())
                split_indices_train = [[] for i in range(self.splits)]
                split_indices_test = [[] for i in range(self.splits)]
                for i in range(self.splits):
                    split_indices_test[i].extend(
                        [ind for ind in data_count[i]
                         if self.data[ind][0].specs['wdef_name']
                         in self.wdefs_test])
                    for j in range(self.splits):
                        split_indices_train[i].extend(
                            [ind for ind in data_count[j]
                             if j != i and
                             self.data[ind][0].specs['wdef_name']
                             in self.wdefs_train])
            else: # self.splits == 1
                split_indices_train = \
                    [[ind for ind in range(len(self.data[:train_end])) if
                      self.data[ind][0].specs['wdef_name'] in self.wdefs_train]]
                split_indices_test = \
                    [[ind for ind in range(train_end,len(self.data)) if
                      self.data[ind][0].specs['wdef_name'] in self.wdefs_test]]
        elif self.split_method == 'set_flag':
            # divide the data according to *new_set* flag in time series specs
            data_set = {0:[]}
            key_fold = 0
            for (ind, (win, lab)) in enumerate(self.data):
                if win.specs['new_set']:
                    key_fold += 1
                    data_set[key_fold]=[ind]
                else:
                    data_set[key_fold].append(ind)
            self.splits = len(data_set.keys())
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            for i in range(self.splits):
                split_indices_test[i].extend(
                    [ind for ind in data_set[i]
                     if self.data[ind][0].specs['wdef_name']
                     in self.wdefs_test])
                for j in range(self.splits):
                    split_indices_train[i].extend(
                        [ind for ind in data_set[j]
                         if j != i and self.data[ind][0].specs['wdef_name']
                         in self.wdefs_train])

        self.split_indices_train = split_indices_train
        self.split_indices_test = split_indices_test
        self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                     self.splits))
class PrintDataNode(BaseNode): """Print out formatted data. This prints out the data to support debugging. **Parameters** :print_delimiters: Separate prints with delimiters for readibility (*optional, default: True*) :print_markers: Print the markers. (*optional, default: True*) :print_shape: Print the the datas shape. (*optional, default: False*) :print_samples: Print the data. (*optional, default: True*) :print_hex: Print the data in flattened hex format. (*optional, default: False*) :print_normal: Print the data "normally". (*optional, default: True*) :numpy_printoptions: Specify numpy printoptions. Use none, if it does not apply. (*optional, default: None*) **Exemplary Call** .. code-block:: yaml - node : PrintData parameters : numpy_printoptions : precision : 12 threshold : 100 :Authors: Hendrik Woehrle ([email protected]) :Created: 2012/04/20 """ def __init__(self, print_delimiters=True, print_markers=True, print_hex=False, print_normal=True, numpy_printoptions=None, print_samples=True, print_shape=False, **kwargs): super(PrintDataNode, self).__init__(*kwargs) self.set_permanent_attributes(item=0, print_delimiters=print_delimiters, print_markers=print_markers, print_hex=print_hex, print_normal=print_normal, numpy_printoptions=numpy_printoptions, print_samples=print_samples, print_shape=print_shape) def process(self): """ Processes all data that is provided by the input node Returns a generator that yields the data after being processed by this node. """ assert (self.input_node != None), "No input node specified!" # Assert that this node has already been trained assert (not self.is_trainable() or self.get_remaining_train_phase() == 0), "Node not trained!" 
self._log("Processing data.", level=logging.DEBUG) data_generator = \ itertools.imap(lambda (data, label): self.print_data(data, label), self.input_node.process()) return data_generator def request_data_for_training(self, use_test_data): """ Returns data for training of subsequent nodes of the node chain A call to this method might involve training of the node chain up this node. If use_test_data is true, all available data is used for training, otherwise only the data that is explicitly for training. """ assert (self.input_node != None) self._log("Data for training is requested.", level=logging.DEBUG) # If we haven't computed the data for training yet if self.data_for_training == None: self._log("Producing data for training.", level=logging.DEBUG) # Train this node self.train_sweep(use_test_data) # Compute a generator the yields the train data and # encapsulate it in an object that memoizes its outputs and # provides a "fresh" method that returns a new generator that'll # yield the same sequence # This line crashes without the NodeMetaclass bug fix train_data_generator = \ itertools.imap(lambda (data, label) : self.print_data(data, label), self.input_node.request_data_for_training( use_test_data)) self.data_for_training = MemoizeGenerator(train_data_generator, caching=self.caching) self._log("Data for training finished", level=logging.DEBUG) # Return a fresh copy of the generator return self.data_for_training.fresh() def request_data_for_testing(self): """ Returns data for testing of subsequent nodes of the node chain A call to this node might involve evaluating the whole node chain up to this node. 
""" assert (self.input_node != None) self._log("Data for testing is requested.", level=logging.DEBUG) # If we haven't computed the data for testing yet if self.data_for_testing == None: # Assert that this node has already been trained assert (not self.is_trainable() or self.get_remaining_train_phase() == 0) # Compute a generator the yields the test data and # encapsulate it in an object that memoizes its outputs and # provides a "fresh" method that returns a new generator that'll # yield the same sequence self._log("Producing data for testing.", level=logging.DEBUG) test_data_generator = \ itertools.imap(lambda (data, label): self.print_data(data, label), self.input_node.request_data_for_testing()) self.data_for_testing = MemoizeGenerator(test_data_generator, caching=self.caching) self._log("Data for testing finished", level=logging.DEBUG) # Return a fresh copy of the generator return self.data_for_testing.fresh() def print_data(self, data, label): """ Print the data according to the specified constraints. 
""" if self.print_delimiters == True: print 50 * "*" if hasattr(data, "marker_name" ) and data.marker_name != None and self.print_markers: print "%s: markers: %s" % (str(type(data)), str(data.marker_name)) else: print "%s" % (str(type(data))) if issubclass(FeatureVector, type(data)): print "%04d: %s %s" % (self.item, data.tag, label) elif issubclass(TimeSeries, type(data)): print "%04d: %s %s %s" % (self.item, data.name, data.marker_name, label) # backup printoptions if self.numpy_printoptions: default_printoptions = numpy.get_printoptions() numpy.set_printoptions(**self.numpy_printoptions) if self.print_shape: print "shape:", data.shape if self.print_normal: if self.print_delimiters == True: print 25 * "-" print data if self.print_hex: if self.print_delimiters == True: print 25 * "-" print map(hex, data.flatten()) if self.print_delimiters == True: print 50 * "*" #set back default printoptions if self.numpy_printoptions: numpy.set_printoptions(default_printoptions) self.item += 1 return (data, label)
class FeatureVectorSourceNode(BaseNode):
    """ Source for samples of type :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`

    This node feeds the
    :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
    elements collected in a
    :mod:`~pySPACE.resources.dataset_defs.feature_vector` dataset into the
    :mod:`~pySPACE.environments.chains.node_chain`.

    As described in :mod:`~pySPACE.resources.dataset_defs.feature_vector`,
    the storage format has to be specified correctly in the metadata.yaml;
    datasets written by pySPACE take care of this automatically.

    **Parameters**

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : Feature_Vector_Source

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/25
    """
    input_types = ["FeatureVector"]

    def __init__(self, **kwargs):
        super(FeatureVectorSourceNode, self).__init__(**kwargs)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data """
        self.set_permanent_attributes(dataset=dataset)

    def register_input_node(self, node):
        """ Register the given node as input """
        raise Exception(
            "No nodes can be registered as inputs for source nodes")

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.
        This source node never offers a further split.
        """
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Source nodes cannot be trained
        """
        raise Exception("Source nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the feature vector windows usable for training

        When *use_test_data* is set, no separate training set exists and
        the test data is handed out instead.
        """
        if use_test_data:
            # Nothing was dedicated explicitly for training — reuse the
            # test data.
            return self.request_data_for_testing()

        # A dataset containing a single run serves as input for every run
        # (the order is randomized later in the chain); otherwise use the
        # data belonging to the current run number.
        if self.dataset.meta_data["runs"] > 1:
            access_key = (self.run_number, self.current_split, "train")
        else:
            access_key = (0, self.current_split, "train")

        # Hand out the stored training windows if the current split/run
        # provides any; an exhausted iterator otherwise.
        if access_key in self.dataset.data.keys():
            self._log(
                "Accessing input dataset's training feature vector windows.")
            source_iterator = self.dataset.get_data(*access_key).__iter__()
        else:
            self._log("No training data available.")
            source_iterator = iter([])
        self.data_for_training = MemoizeGenerator(source_iterator,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the feature vector windows usable for testing """
        # Read the data only once; afterwards replay the memoized copy.
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")

            # Single-run datasets are reused for every run; otherwise pick
            # the data of this run number.
            if self.dataset.meta_data["runs"] > 1:
                access_key = (self.run_number, self.current_split, "test")
            else:
                access_key = (0, self.current_split, "test")

            self.data_for_testing = MemoizeGenerator(
                self.dataset.get_data(*access_key).__iter__(),
                caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def get_metadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node. """
        metadata = self.dataset.meta_data
        return metadata.get(key)
class TrainTestSplitterNode(BaseNode):
    """ Split data into one training and one test data set with a fixed ratio

    The relative size of the two sets is controlled via the parameter
    train_ratio.

    .. warning:: the class ratio is not retained

    .. todo:: introduce stratified parameter as in CV_Splitter

    **Parameters**

        :train_ratio:
            The ratio of the overall available data that is assigned to
            the training set. The remaining data (1-train_ratio) is used
            for testing.

            (*optional, default: 0.5*)

        :num_train_instances:
            Instead of specifying a train_ratio, this option allows to
            specify the absolute number of training instances of class
            *class_label* that should be in the training set. All instances
            that occur until *num_train_instances* are found are used for
            training. The remaining data are used for testing.

            (*optional, default: None*)

        :class_label:
            If *num_train_instances*-option is used, this string determines
            the class of which training examples are count.

        :random:
            If *False*, the order of the data is retained. I.e. the
            train_ratio instances are used for training and the remaining
            as test data. If *True*, the two sets are sampled randomly
            from the data without taking into consideration the data's
            order.

            (*optional, default: True*)

        :reverse:
            If *True* (only allowed together with ``random == False``),
            the data order is inverted before the *num_train_instances*
            are counted.

            (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : TrainTestSplitter
            parameters :
                train_ratio : 0.7
                random : False

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/08 (Documentation, old node)
    :LastChange: 2011/11/14 (Documentation) Anett Seeland
    """

    def __init__(self, train_ratio=0.5, random=True, num_train_instances=None,
                 class_label='Target', reverse=False, **kwargs):
        super(TrainTestSplitterNode, self).__init__(**kwargs)
        # Reversing the deterministic order and random shuffling are
        # mutually exclusive.
        assert(not(random and reverse)), \
            "Reverse ordering makes no sense when randomization is active!"
        # train_data/test_data start as None; the actual split is computed
        # lazily on the first data request (see _create_split).
        self.set_permanent_attributes(train_ratio=train_ratio,
                                      random=random,
                                      num_train_instances=num_train_instances,
                                      class_label=class_label,
                                      reverse=reverse,
                                      train_data=None,
                                      test_data=None)

    def is_split_node(self):
        """ Returns whether this is a split node. """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking. This node creates only a
        single fixed split, so there is never a further one.
        """
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        .. note:: *use_test_data* is ignored here; the split computed by
                  ``_create_split`` alone determines the training set.
        """
        # Create split lazily when required
        if self.train_data == None:
            self._create_split()

        # Create training data generator
        # (rebuilt on every call; MemoizeGenerator provides repeatable
        # iteration via fresh())
        self.data_for_training = \
            MemoizeGenerator(instance for instance in self.train_data)

        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes """
        # Create split lazily when required
        if self.test_data == None:
            self._create_split()

        # Create test data generator
        self.data_for_testing = \
            MemoizeGenerator(instance for instance in self.test_data)

        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data. """
        self._log("Splitting data into train and test data")
        # The upstream training set must be empty — a splitter may only be
        # applied to data that has not been split before.
        train_data = \
            list(self.input_node.request_data_for_training(use_test_data=False))

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain.
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is already "
                            "split)")

        # Create generator instead of loading all data:
        # streaming fast path that takes the first num_train_instances
        # items for training and keeps the rest as a lazy generator.
        # NOTE(review): unlike the in-memory branch below, this path counts
        # ALL instances (not only those of *class_label*) and ignores
        # *reverse* — confirm this asymmetry is intended.
        if self.num_train_instances and not (self.random):
            self.train_data = []
            input_generator=self.input_node.request_data_for_testing()
            for i in range(self.num_train_instances):
                self.train_data.append(input_generator.next())
            self.test_data = input_generator
            return

        # Gather all test data
        test_data = list(self.input_node.request_data_for_testing())

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        data = train_data + test_data
        data_size = len(data)

        # Randomize order if randomization is not switched off
        if self.random:
            # Seed with the run number so the shuffle is reproducible
            # per run.
            r = random.Random(self.run_number)
            r.shuffle(data)

        if self.num_train_instances!=None:
            if self.reverse:
                data = data[::-1]
            # If exactly num_train_instances examples of class_label exist,
            # the whole data becomes the training set ...
            if len([i for i in range(len(data)) \
                    if data[i][1]==self.class_label])==self.num_train_instances:
                train_end = data_size
            else:
                # ... otherwise cut directly after the
                # num_train_instances-th occurrence of class_label.
                counter = 0
                for (index, (window, label)) in enumerate(data):
                    # print "Label: ", label, "Zeitpunkt: ", window.start_time
                    if label == self.class_label:
                        counter += 1
                        if counter == self.num_train_instances:
                            train_end = index+1
                            break
                # Fails when fewer than num_train_instances occurrences were
                # found (the loop breaks as soon as enough are seen).
                # NOTE(review): the message reads as if too many were
                # requested, and train_end would be unbound here as well —
                # verify the intended wording.
                assert(self.num_train_instances==counter), \
                    "Too many instances to select."
        else:
            # Split data into train and test data according train_ratio
            train_end = int(round(data_size * self.train_ratio))

        self.train_data=data[0:train_end]
        self.test_data=data[train_end:]
class PrintDataNode(BaseNode): """Print out formatted data. This prints out the data to support debugging. **Parameters** :print_delimiters: Separate prints with delimiters for readibility (*optional, default: True*) :print_markers: Print the markers. (*optional, default: True*) :print_shape: Print the the datas shape. (*optional, default: False*) :print_samples: Print the data. (*optional, default: True*) :print_hex: Print the data in flattened hex format. (*optional, default: False*) :print_normal: Print the data "normally". (*optional, default: True*) :numpy_printoptions: Specify numpy printoptions. Use none, if it does not apply. (*optional, default: None*) **Exemplary Call** .. code-block:: yaml - node : PrintData parameters : numpy_printoptions : precision : 12 threshold : 100 :Authors: Hendrik Woehrle ([email protected]) :Created: 2012/04/20 """ def __init__(self, print_delimiters = True, print_markers = True, print_hex = False, print_normal = True, numpy_printoptions = None, print_samples = True, print_shape = False, **kwargs): super(PrintDataNode, self).__init__(*kwargs) self.set_permanent_attributes(item = 0, print_delimiters = print_delimiters, print_markers = print_markers, print_hex = print_hex, print_normal = print_normal, numpy_printoptions = numpy_printoptions, print_samples = print_samples, print_shape = print_shape ) def process(self): """ Processes all data that is provided by the input node Returns a generator that yields the data after being processed by this node. """ assert(self.input_node != None), "No input node specified!" # Assert that this node has already been trained assert(not self.is_trainable() or self.get_remaining_train_phase() == 0), "Node not trained!" 
self._log("Processing data.", level=logging.DEBUG) data_generator = \ itertools.imap(lambda (data, label): self.print_data(data, label), self.input_node.process()) return data_generator def request_data_for_training(self, use_test_data): """ Returns data for training of subsequent nodes of the node chain A call to this method might involve training of the node chain up this node. If use_test_data is true, all available data is used for training, otherwise only the data that is explicitly for training. """ assert(self.input_node != None) self._log("Data for training is requested.", level = logging.DEBUG) # If we haven't computed the data for training yet if self.data_for_training == None: self._log("Producing data for training.", level = logging.DEBUG) # Train this node self.train_sweep(use_test_data) # Compute a generator the yields the train data and # encapsulate it in an object that memoizes its outputs and # provides a "fresh" method that returns a new generator that'll # yield the same sequence # This line crashes without the NodeMetaclass bug fix train_data_generator = \ itertools.imap(lambda (data, label) : self.print_data(data, label), self.input_node.request_data_for_training( use_test_data)) self.data_for_training = MemoizeGenerator(train_data_generator, caching=self.caching) self._log("Data for training finished", level = logging.DEBUG) # Return a fresh copy of the generator return self.data_for_training.fresh() def request_data_for_testing(self): """ Returns data for testing of subsequent nodes of the node chain A call to this node might involve evaluating the whole node chain up to this node. 
""" assert(self.input_node != None) self._log("Data for testing is requested.", level = logging.DEBUG) # If we haven't computed the data for testing yet if self.data_for_testing == None: # Assert that this node has already been trained assert(not self.is_trainable() or self.get_remaining_train_phase() == 0) # Compute a generator the yields the test data and # encapsulate it in an object that memoizes its outputs and # provides a "fresh" method that returns a new generator that'll # yield the same sequence self._log("Producing data for testing.", level = logging.DEBUG) test_data_generator = \ itertools.imap(lambda (data, label): self.print_data(data, label), self.input_node.request_data_for_testing()) self.data_for_testing = MemoizeGenerator(test_data_generator, caching=self.caching) self._log("Data for testing finished", level = logging.DEBUG) # Return a fresh copy of the generator return self.data_for_testing.fresh() def print_data(self, data, label): """ Print the data according to the specified constraints. 
""" if self.print_delimiters == True: print 50 *"*" if hasattr(data,"marker_name") and data.marker_name != None and self.print_markers: print "%s: markers: %s" % (str(type(data)), str(data.marker_name)) else : print "%s" % (str(type(data))) if issubclass(FeatureVector, type(data)): print "%04d: %s %s" % (self.item, data.tag, label) elif issubclass(TimeSeries, type(data)): print "%04d: %s %s %s" % (self.item, data.name, data.marker_name, label) # backup printoptions if self.numpy_printoptions: default_printoptions = numpy.get_printoptions() numpy.set_printoptions(**self.numpy_printoptions) if self.print_shape: print "shape:", data.shape if self.print_normal: if self.print_delimiters == True: print 25 *"-" print data if self.print_hex: if self.print_delimiters == True: print 25 *"-" print map(hex,data.flatten()) if self.print_delimiters == True: print 50 *"*" #set back default printoptions if self.numpy_printoptions: numpy.set_printoptions(default_printoptions) self.item += 1 return (data, label)