Пример #1
0
    def request_data_for_testing(self):
        """ Returns the windowed data for testing of subsequent nodes

        On the first call the window specification is loaded from
        ``self.windower_spec_file``, the complete test data of the input node
        is collected and handed to ``self.window_stream``, and whatever
        ``self.marker_windower`` emits is wrapped in a ``MemoizeGenerator``
        that is stored in ``self.data_for_testing``.

        Every call returns ``self.data_for_testing.fresh()``.
        """
        if self.data_for_testing is not None:
            # Everything was set up by an earlier call
            return self.data_for_testing.fresh()

        # Use the window definition of the test phase windower file
        window_spec = Windower._load_window_spec(self.windower_spec_file,
                                                 self.local_window_conf)
        self.window_definition = window_spec

        # The input node's test data is materialized completely before it
        # is turned into a stream of windows
        collected = list(self.input_node.request_data_for_testing())
        self.window_stream(collected)

        # Generator that emits the (window, label) pairs
        # NOTE(review): presumably window_stream() prepares
        # self.marker_windower -- confirm against its implementation
        windows = ((window, label)
                   for window, label in self.marker_windower)
        self.data_for_testing = MemoizeGenerator(windows)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #2
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes of the node chain

        A call to this method might involve training of the node chain up this
        node. If use_test_data is true, all available data is used for
        training, otherwise only the data that is explicitly for training.

        *use_test_data* is passed on unchanged to ``self.train_sweep`` and to
        the input node's ``request_data_for_training``.

        The training data is produced only once; it is stored in
        ``self.data_for_training`` and every call returns a fresh copy of
        that memoized generator.
        """
        assert self.input_node is not None

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence.
            # A generator expression is used instead of itertools.imap with
            # a tuple-unpacking lambda, since neither exists in Python 3.
            # NOTE(review): the original author remarked that this step
            # crashes without the NodeMetaclass bug fix.
            input_data = \
                self.input_node.request_data_for_training(use_test_data)
            train_data_generator = (self.print_data(data, label)
                                    for (data, label) in input_data)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #3
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        If *use_test_data* is true, the result of
        ``self.request_data_for_testing()`` is returned directly.

        Otherwise the ``"train"`` entry of the input dataset for the current
        split is looked up. If it exists, its data is wrapped in a new
        ``MemoizeGenerator``; if not, an immediately exhausted generator is
        wrapped instead. The wrapper is stored in ``self.data_for_training``
        and a fresh copy of it is returned.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")

        # Check if there is training data for the current split and run.
        # Membership is tested on the mapping itself; building the key list
        # first would only add a linear scan.
        if key in self.dataset.data:
            self._log("Accessing input dataset's training feature vector windows.")
            training_data = iter(self.dataset.get_data(*key))
        else:
            # Use a generator that is immediately exhausted, since
            # this node does not provide any data that is explicitly
            # dedicated for training
            self._log("No training data available.")
            training_data = (x for x in ())

        self.data_for_training = MemoizeGenerator(training_data,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #4
0
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        A call to this node might involve evaluating the whole node chain
        up to this node.

        On the first call every ``(data, label)`` pair of the input node's
        test data is passed through ``self.print_data`` and the resulting
        generator is memoized in ``self.data_for_testing``. Every call
        returns a fresh copy of that memoized generator.
        """
        assert self.input_node is not None

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert (not self.is_trainable()
                    or self.get_remaining_train_phase() == 0)
            # Compute a generator that yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            # Generator expression instead of itertools.imap with a
            # tuple-unpacking lambda: both are unavailable in Python 3
            input_data = self.input_node.request_data_for_testing()
            test_data_generator = (self.print_data(data, label)
                                   for (data, label) in input_data)
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #5
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The ``"test"`` entry of the input dataset for the current split is
        read on the first call and memoized in ``self.data_for_testing``;
        every call returns a fresh copy of that memoized generator.

        .. note:: This method itself contains no fallback to the training
                  data when no test data is available.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #6
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        On the first call the window definitions are handed to the dataset
        via ``set_window_defs`` and the ``"test"`` data of the current split
        is wrapped in a ``MemoizeGenerator`` stored in
        ``self.data_for_testing``. Every call returns a fresh copy of it.
        """
        self._log("Requesting test data...")

        if self.data_for_testing is not None:
            # The data was already read by an earlier call
            return self.data_for_testing.fresh()

        self._log("Start streaming.")

        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)

        # Datasets with a single run serve that run for every run number
        run = self.run_number if self.dataset.meta_data["runs"] > 1 else 0
        key = (run, self.current_split, "test")

        # Generator that emits the windows
        windows = ((window, label)
                   for window, label in self.dataset.get_data(*key))
        self.data_for_testing = MemoizeGenerator(windows,
                                                 caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #7
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        On the first call the node is trained via ``self.train_sweep`` and
        every ``(data, label)`` pair of ``self.external_training_set`` is
        mapped to ``(self.execute(data), label)``. The resulting generator is
        memoized in ``self.data_for_training``; every call returns a fresh
        copy of it.

        *use_test_data* is only passed on to ``self.train_sweep``.
        """
        assert self.input_node is not None

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence.
            # Generator expression instead of itertools.imap with a
            # tuple-unpacking lambda: both are unavailable in Python 3
            train_data_generator = (
                (self.execute(data), label)
                for (data, label) in self.external_training_set)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #8
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        With *use_test_data* set, the call is forwarded to
        ``self.request_data_for_testing()``. Otherwise the ``"train"`` entry
        of the input dataset for the current split is wrapped in a new
        ``MemoizeGenerator`` (an immediately exhausted generator is wrapped
        when the dataset has no such entry), stored in
        ``self.data_for_training`` and returned as a fresh copy.
        """
        # TODO:Is all this really necessary?
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")

        # Check if there is training data for the current split and run;
        # the mapping is queried directly instead of scanning a key list
        if key in self.dataset.data:
            self._log("Accessing input dataset's training prediction vectors.")
            source = iter(self.dataset.get_data(*key))
        else:
            # Use a generator that is immediately exhausted, since
            # this node does not provide any data that is explicitly
            # dedicated for training
            self._log("No training data available.")
            source = (x for x in ())

        self.data_for_training = MemoizeGenerator(source,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #9
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The ``"test"`` entry of the input dataset for the current split is
        read on the first call and memoized in ``self.data_for_testing``.
        Every call returns a fresh copy of that memoized generator.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #10
0
    def process(self):
        """ Processes all data that is provided by the input node

        The output of the input node's ``process()`` is passed through
        ``self.execute`` and fed as a stream into a ``TimeSeriesClient``. A
        ``MarkerWindower`` is then created on top of that client, and the
        ``(sample, label)`` pairs it emits are memoized in
        ``self.data_for_testing``.

        Returns a fresh copy of that memoized generator.
        """
        assert self.input_node is not None, "No input node specified!"
        # Assert that this node has already been trained
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0), "Node not trained!"

        # Generator expression instead of itertools.imap with a
        # tuple-unpacking lambda: both are unavailable in Python 3
        data_generator = ((self.execute(data), label)
                          for (data, label) in self.input_node.process())

        self.client = TimeSeriesClient(ts_stream=data_generator)

        self.client.connect()
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            stridems=self.nullmarker_stride_ms)

        # NOTE(review): marker_windower was assigned just above, so this
        # fallback looks unreachable -- confirm whether it can be dropped
        if self.marker_windower is None:
            self.window_stream()

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)

        self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #11
0
 def request_data_for_testing(self):
     """ Returns the data for testing of subsequent nodes

     The split is created lazily via ``self._create_split()`` when
     ``self.split_indices_test`` has not been set yet. A new
     ``MemoizeGenerator`` over the elements of ``self.data`` selected by the
     test indices of the current split is created on every call, stored in
     ``self.data_for_testing`` and returned as a fresh copy.
     """
     # Create split lazily when required. An identity check is used because
     # "== None" is evaluated by the container's own equality operator.
     if self.split_indices_test is None:
         self._create_split()

     # Create test data generator
     test_indices = self.split_indices_test[self.current_split]
     self.data_for_testing = MemoizeGenerator(
         self.data[i] for i in test_indices)

     return self.data_for_testing.fresh()
Пример #12
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The split is created lazily via ``self._create_split()`` when
        ``self.train_data`` has not been set yet. A new ``MemoizeGenerator``
        over ``self.train_data`` is created on every call, stored in
        ``self.data_for_training`` and returned as a fresh copy.

        *use_test_data* is not evaluated by this method.
        """
        # Create split lazily when required
        if self.train_data is None:
            self._create_split()

        # Create training data generator
        self.data_for_training = \
            MemoizeGenerator(instance for instance in self.train_data)

        return self.data_for_training.fresh()
Пример #13
0
    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The cv-splits are created lazily via ``self._create_splits()`` when
        ``self.split_indices`` has not been set yet. A new
        ``MemoizeGenerator`` over the elements of ``self.data`` selected by
        the indices of the current split is created on every call, stored in
        ``self.data_for_testing`` and returned as a fresh copy.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        test_indices = self.split_indices[self.current_split]
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in test_indices)

        return self.data_for_testing.fresh()
Пример #14
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The first call configures the dataset's window definitions through
        ``set_window_defs`` and memoizes the ``"test"`` data of the current
        split in ``self.data_for_testing``. Each call hands out a fresh copy
        of the memoized generator.
        """
        self._log("Requesting test data...")

        already_read = self.data_for_testing is not None
        if not already_read:
            self._log("Start streaming.")

            window_settings = dict(
                window_definition=self.window_definition,
                nullmarker_stride_ms=self.nullmarker_stride_ms,
                no_overlap=self.no_overlap,
                data_consistency_check=self.data_consistency_check)
            self.dataset.set_window_defs(**window_settings)

            # A dataset with only one run provides that run for all runs
            if self.dataset.meta_data["runs"] > 1:
                run_index = self.run_number
            else:
                run_index = 0
            key = (run_index, self.current_split, "test")

            # Generator that emits the windows
            window_pairs = ((sample, label)
                            for sample, label in self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(window_pairs,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #15
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The ``"test"`` entry of the input dataset for the current split is
        read once and memoized in ``self.data_for_testing``; each call hands
        out a fresh copy of the memoized generator.
        """
        if self.data_for_testing is not None:
            # Already read by an earlier call
            return self.data_for_testing.fresh()

        self._log("Accessing input dataset's test time series windows.")

        # A dataset that consists of one single run serves as input for all
        # runs (relying on later randomization of the order); otherwise the
        # data of the current run number is used
        run = self.run_number if self.dataset.meta_data["runs"] > 1 else 0
        key = (run, self.current_split, "test")

        windows = self.dataset.get_data(*key).__iter__()
        self.data_for_testing = MemoizeGenerator(windows,
                                                 caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
 def request_data_for_training(self, use_test_data):
     """ Returns data for training of subsequent nodes

     The first call trains the node via ``self.train_sweep`` and maps every
     ``(data, label)`` pair of ``self.external_training_set`` to
     ``(self.execute(data), label)``. The generator is memoized in
     ``self.data_for_training`` and each call returns a fresh copy of it.

     *use_test_data* is only passed on to ``self.train_sweep``.
     """
     assert self.input_node is not None

     self._log("Data for training is requested.", level=logging.DEBUG)

     # If we haven't computed the data for training yet
     if self.data_for_training is None:
         self._log("Producing data for training.", level=logging.DEBUG)
         # Train this node
         self.train_sweep(use_test_data)

         # Compute a generator that yields the train data and
         # encapsulate it in an object that memoizes its outputs and
         # provides a "fresh" method that returns a new generator that'll
         # yield the same sequence.
         # itertools.imap and tuple-unpacking lambdas do not exist in
         # Python 3, hence the generator expression
         train_data_generator = (
             (self.execute(data), label)
             for (data, label) in self.external_training_set)

         self.data_for_training = MemoizeGenerator(train_data_generator,
                                                   caching=self.caching)

     self._log("Data for training finished", level=logging.DEBUG)
     # Return a fresh copy of the generator
     return self.data_for_training.fresh()
Пример #17
0
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        A call to this node might involve evaluating the whole node chain
        up to this node.

        The first call passes every ``(data, label)`` pair of the input
        node's test data through ``self.print_data`` and memoizes the
        resulting generator in ``self.data_for_testing``. Each call returns a
        fresh copy of the memoized generator.
        """
        assert self.input_node is not None

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert (not self.is_trainable() or
                    self.get_remaining_train_phase() == 0)
            # Compute a generator that yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            # itertools.imap and tuple-unpacking lambdas do not exist in
            # Python 3, hence the generator expression
            upstream = self.input_node.request_data_for_testing()
            test_data_generator = (self.print_data(data, label)
                                   for (data, label) in upstream)
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #18
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes of the node chain

        A call to this method might involve training of the node chain up this
        node. If use_test_data is true, all available data is used for
        training, otherwise only the data that is explicitly for training.

        *use_test_data* is forwarded to ``self.train_sweep`` and to the input
        node's ``request_data_for_training``. The data is produced on the
        first call only and memoized in ``self.data_for_training``; each call
        returns a fresh copy of the memoized generator.
        """
        assert self.input_node is not None

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence.
            # itertools.imap and tuple-unpacking lambdas do not exist in
            # Python 3, hence the generator expression.
            # NOTE(review): the original author remarked that this step
            # crashes without the NodeMetaclass bug fix.
            upstream = \
                self.input_node.request_data_for_training(use_test_data)
            train_data_generator = (self.print_data(data, label)
                                    for (data, label) in upstream)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #19
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The cv-splits are created lazily via ``self._create_splits()`` when
        ``self.split_indices`` has not been set yet. All elements of
        ``self.data`` whose index is not listed in the current cv-split are
        wrapped in a new ``MemoizeGenerator`` on every call, stored in
        ``self.data_for_training`` and returned as a fresh copy.

        *use_test_data* is not evaluated by this method.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split. The test indices
        # are captured once as a set, so the split is fixed at request time
        # and each membership test is a constant-time lookup.
        test_indices = frozenset(self.split_indices[self.current_split])
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in range(len(self.data))
            if i not in test_indices)

        return self.data_for_training.fresh()
    def process(self):
        """ Processes all data that is provided by the input node

        Whatever the input node's ``process()`` yields is run through
        ``self.execute`` and streamed into a ``TimeSeriesClient``. A
        ``MarkerWindower`` is created on top of the connected client and the
        ``(sample, label)`` pairs it emits are memoized in
        ``self.data_for_testing``.

        Returns a fresh copy of that memoized generator.
        """
        assert self.input_node is not None, "No input node specified!"
        # Assert that this node has already been trained
        assert (not self.is_trainable() or
                self.get_remaining_train_phase() == 0), "Node not trained!"

        # itertools.imap and tuple-unpacking lambdas do not exist in
        # Python 3, hence the generator expression
        upstream = self.input_node.process()
        data_generator = ((self.execute(data), label)
                          for (data, label) in upstream)

        self.client = TimeSeriesClient(ts_stream=data_generator)

        self.client.connect()
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            stridems=self.nullmarker_stride_ms)

        # NOTE(review): marker_windower has just been assigned, so this
        # fallback appears to be unreachable -- confirm before removing it
        if self.marker_windower is None:
            self.window_stream()

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)

        self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #21
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The key of the requested data is built in the same way as for the
        training data, only with ``"test"`` as last element. The data is read
        on the first call and memoized in ``self.data_for_testing``; each
        call returns a fresh copy of the memoized generator.

        .. note:: This method itself does not fall back to the training data
                  when no test data is available.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
    def request_data_for_testing(self):
        """ Returns the windowed data for testing of subsequent nodes

        The first call loads the window specification from
        ``self.windower_spec_file``, collects the input node's test data,
        passes it to ``self.window_stream`` and memoizes the pairs emitted by
        ``self.marker_windower`` in ``self.data_for_testing``. Each call
        hands out a fresh copy of the memoized generator.
        """
        needs_setup = self.data_for_testing is None
        if needs_setup:
            # Set window definition for test phase windower file
            self.window_definition = Windower._load_window_spec(
                self.windower_spec_file, self.local_window_conf)

            # The test data of the input node is read completely first
            upstream_data = list(self.input_node.request_data_for_testing())

            # Create stream of windows
            self.window_stream(upstream_data)

            # Generator that emits the windows
            emitted = ((sample, label)
                       for sample, label in self.marker_windower)
            self.data_for_testing = MemoizeGenerator(emitted)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #23
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        .. note::
              This method works differently in InstanceSelectionNode
              than in other nodes: Only *train_percentage_selected* percent
              of the available data are returned.

        A ``train_percentage_selected`` above 100 is logged as an error and
        reduced to 100. On the first call the node is trained, the training
        data of the input node is grouped by label, each group is shuffled
        with a ``random.Random`` seeded with ``self.run_number`` and
        truncated:

        * if ``self.reduce_class`` is not set or the percentage is 100,
          every class is truncated to the percentage,
        * otherwise only the class equal to ``self.reduce_class`` is
          truncated and all other classes are kept completely.

        The retained instances are passed through ``self.execute`` and
        memoized in ``self.data_for_training``; each call returns a fresh
        copy of the memoized generator.

        *use_test_data* is forwarded to ``self.train_sweep`` and to the input
        node's ``request_data_for_training``.
        """
        assert self.input_node is not None
        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100."
                      % self.train_percentage_selected,
                      level=logging.ERROR)
            self.train_percentage_selected = 100
        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in \
                    self.input_node.request_data_for_training(use_test_data):
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of training data"
                      % self.train_percentage_selected,
                      level=logging.DEBUG)
            r = random.Random(self.run_number)

            # Without a dedicated class to reduce (or with 100 percent)
            # every class is treated alike
            reduce_every_class = (not self.reduce_class
                                  or self.train_percentage_selected == 100)

            # Retain only *train_percentage_selected* percent of the data
            retained_instances = []
            for label, instances in all_instances.items():
                r.shuffle(instances)
                if reduce_every_class or self.reduce_class == label:
                    # Divide by a float: with an integer percentage the
                    # quotient would otherwise be truncated before round()
                    # is applied (Python 2 integer division).
                    fraction = self.train_percentage_selected / 100.0
                    # A negative percentage selects nothing
                    end_index = max(0, int(round(len(instances) * fraction)))
                else:
                    end_index = len(instances)

                retained_instances.extend(
                    (instance, label) for instance in instances[:end_index])

            # Compute a generator that yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            train_data_generator = (
                (self.execute(data), label)
                for (data, label) in retained_instances)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #24
0
 def request_data_for_training(self, use_test_data):
     """ Returns data for training of subsequent nodes

     On the first call the node is trained via ``self.train_sweep``, the
     training data of the input node is grouped by label and handed to
     ``self.balance_instances``. The instances returned by that method are
     passed through ``self.execute`` and memoized in
     ``self.data_for_training``; each call returns a fresh copy of the
     memoized generator.

     *use_test_data* is forwarded to ``self.train_sweep`` and to the input
     node's ``request_data_for_training``.
     """
     assert self.input_node is not None

     # The log messages of this method used to talk about "testing",
     # although it handles the training data
     self._log("Data for training is requested.", level=logging.DEBUG)

     if self.data_for_training is None:
         self._log("Producing data for training.", level=logging.DEBUG)
         # Train this node
         self.train_sweep(use_test_data)

         # Divide available instances according to label
         all_instances = defaultdict(list)
         for instance, label in \
                 self.input_node.request_data_for_training(use_test_data):
             all_instances[label].append(instance)

         retained_instances = self.balance_instances(all_instances)

         # Compute a generator that yields the train data and
         # encapsulate it in an object that memoizes its outputs and
         # provides a "fresh" method that returns a new generator that'll
         # yield the same sequence
         train_data_generator = (
             (self.execute(data), label)
             for (data, label) in retained_instances)

         self.data_for_training = MemoizeGenerator(train_data_generator,
                                                   caching=self.caching)

     self._log("Data for training finished", level=logging.DEBUG)
     # Return a fresh copy of the generator
     return self.data_for_training.fresh()
Пример #25
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        If *use_test_data* is true, the call is delegated to
        ``request_data_for_testing``, since no additional data is dedicated
        for training in that case.

        Otherwise the dataset entry stored under
        ``(run, current_split, "train")`` is wrapped in a
        ``MemoizeGenerator``. When the dataset holds no such entry, an
        immediately exhausted generator is wrapped instead.

        Returns a fresh copy of the wrapped generator.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # A dataset with a single run serves its run 0 to every run that is
        # conducted (relying on later randomization of the order); otherwise
        # the data of the current run number is used
        multiple_runs = self.dataset.meta_data["runs"] > 1
        run_index = self.run_number if multiple_runs else 0
        split_key = (run_index, self.current_split, "train")

        if split_key in self.dataset.data.keys():
            self._log(
                "Accessing input dataset's training feature vector windows.")
            stream = self.dataset.get_data(*split_key).__iter__()
        else:
            # No data is explicitly dedicated for training: hand out an
            # iterator over an empty sequence
            self._log("No training data available.")
            stream = (x for x in [].__iter__())

        self.data_for_training = MemoizeGenerator(stream,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #26
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        With *use_test_data* set, the result of ``request_data_for_testing``
        is returned directly. Without it, the dataset is asked for the
        entry ``(run, current_split, "train")``; that entry (or, if it is
        missing, an empty generator) is wrapped in a ``MemoizeGenerator``
        which is stored in ``data_for_training``.

        Returns a fresh copy of the stored generator.
        """
        if use_test_data:
            # No additional data was dedicated for training, so the test
            # data is handed out instead
            return self.request_data_for_testing()

        # If the input dataset consists of one single run only, run 0 is
        # used as input for all runs; otherwise the current run number
        if self.dataset.meta_data["runs"] > 1:
            run_index = self.run_number
        else:
            run_index = 0
        train_key = (run_index, self.current_split, "train")

        # Check if there is training data for the current split and run
        has_train_data = train_key in self.dataset.data.keys()
        if has_train_data:
            self._log("Accessing input dataset's training feature vector windows.")
            source = self.dataset.get_data(*train_key).__iter__()
        else:
            # An iterator that is immediately exhausted
            self._log("No training data available.")
            source = (x for x in [].__iter__())
        self.data_for_training = MemoizeGenerator(source, caching=self.caching)

        return self.data_for_training.fresh()
    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        On the first call the pairs delivered by ``generate_random_data``
        are wrapped in a caching ``MemoizeGenerator`` that is stored in
        ``data_for_testing``. Later calls reuse the stored wrapper.

        Returns a fresh copy of the stored generator.
        """
        # Identity test: only a missing wrapper triggers the generation,
        # independent of how the stored object implements ``==``
        if self.data_for_testing is None:
            generated_data = self.generate_random_data()

            # Create a generator that emits the (sample, label) pairs
            test_data_generator = ((sample, label)
                                   for (sample, label) in generated_data)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #28
0
    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        On the first call, 23 ``TimeSeries`` objects are created (2x2 arrays
        filled with the values 0..22, channels "X" and "Y", sampling
        frequency 2), each paired with a label drawn at random from "A" and
        "B". The list is kept in ``time_series`` and wrapped in a caching
        ``MemoizeGenerator``. Later calls reuse the stored wrapper.
        """
        # Data has already been created: hand out a fresh copy
        if self.data_for_testing is not None:
            return self.data_for_testing.fresh()

        labelled_series = []
        for i in range(23):
            series = TimeSeries(input_array=numpy.ones((2, 2)) * i,
                                channel_names=["X", "Y"],
                                sampling_frequency=2)
            labelled_series.append((series, random.choice(["A", "B"])))
        self.time_series = labelled_series

        # Create a generator that emits the windows
        windows = ((sample, label) for (sample, label) in self.time_series)
        self.data_for_testing = MemoizeGenerator(windows, caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #29
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        The window definition of the training spec file is loaded on every
        call. If training data has been stored before, a fresh copy of it
        is returned. Otherwise:

        * with *use_test_data*, the result of ``request_data_for_testing``
          is stored in ``data_for_training`` and returned as it is;
        * without it, the training data of the input node is read into a
          list, windowed via ``window_stream`` and the resulting windows are
          wrapped in a caching ``MemoizeGenerator``. An empty input yields
          a wrapper around an empty generator.
        """
        # set window definition for train phase windower file
        self.window_definition = Windower._load_window_spec(
            self.windower_spec_file_train, self.local_window_conf)

        self._log("Requesting train data...")
        if self.data_for_training is not None:
            return self.data_for_training.fresh()

        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            # NOTE(review): the stored object is whatever
            # request_data_for_testing() returns; the branch above later
            # calls .fresh() on it -- confirm that object supports that
            self.data_for_training = self.request_data_for_testing()
            return self.data_for_training

        # Get the training data (with labels) of the input node
        incoming = list(
            self.input_node.request_data_for_training(
                use_test_data=use_test_data))
        if incoming == []:
            # Nothing to window: store an immediately exhausted generator
            self.data_for_training = MemoizeGenerator(
                (x for x in [].__iter__()), caching=True)
            return self.data_for_training.fresh()

        # create the stream of windows
        self.window_stream(incoming)

        # Create a generator that emits the windows
        windows = ((sample, label)
                   for (sample, label) in self.marker_windower)
        self.data_for_training = MemoizeGenerator(windows, caching=True)
        return self.data_for_training.fresh()
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        Every call first loads the window definition from the training spec
        file. Stored training data is handed out as a fresh copy. If
        nothing is stored yet, the data is produced either from the test
        data (*use_test_data* true) or by windowing the training data of
        the input node, stored in ``data_for_training``, and a fresh copy is
        returned.
        """
        # set window definition for train phase windower file
        spec = Windower._load_window_spec(self.windower_spec_file_train,
                                          self.local_window_conf)
        self.window_definition = spec

        self._log("Requesting train data...")
        already_available = self.data_for_training is not None
        if already_available:
            return self.data_for_training.fresh()

        if not use_test_data:
            # Get the training data (with labels) of the input node
            upstream = self.input_node.request_data_for_training(
                use_test_data=use_test_data)
            train_data = list(upstream)
            if train_data == []:
                # No training data: store an exhausted generator
                empty = (x for x in [].__iter__())
                self.data_for_training = MemoizeGenerator(empty,
                                                          caching=True)
            else:
                # create the stream of windows
                self.window_stream(train_data)

                # Create a generator that emits the windows
                emitted = ((sample, label)
                           for (sample, label) in self.marker_windower)
                self.data_for_training = MemoizeGenerator(emitted,
                                                          caching=True)
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training
            # NOTE(review): .fresh() is called below on the object returned
            # by request_data_for_testing() -- confirm it provides .fresh()
            self.data_for_training = self.request_data_for_testing()

        return self.data_for_training.fresh()
Пример #31
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The split is created lazily via ``_create_split`` when
        ``train_data`` is still ``None``. Each call wraps the instances of
        ``train_data`` in a new ``MemoizeGenerator`` that is stored in
        ``data_for_training``.

        *use_test_data* is not used by this method.

        Returns a fresh copy of the stored generator.
        """
        # Create split lazily when required. The identity test avoids an
        # element-wise or otherwise overloaded ``==`` on the stored data.
        if self.train_data is None:
            self._create_split()

        # Create training data generator
        self.data_for_training = MemoizeGenerator(
            instance for instance in self.train_data)

        return self.data_for_training.fresh()
Пример #32
0
    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The cv-splits are created lazily via ``_create_splits`` when
        ``split_indices`` is still ``None``. Each call wraps the elements of
        ``data`` that are indexed by ``split_indices[current_split]`` in a
        new ``MemoizeGenerator`` stored in ``data_for_testing``.

        Returns a fresh copy of the stored generator.
        """
        # Create cv-splits lazily when required (identity test, so an
        # overloaded ``==`` of the stored indices is never consulted)
        if self.split_indices is None:
            self._create_splits()

        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices[self.current_split])

        return self.data_for_testing.fresh()
Пример #33
0
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        On the first call the test instances of the input node are grouped
        by label and handed to ``balance_instances``. The instances it
        retains are passed lazily through ``execute`` and wrapped in a
        ``MemoizeGenerator`` stored in ``data_for_testing``. Later calls
        reuse the stored wrapper.

        Returns a fresh copy of the stored generator.
        """
        assert self.input_node is not None

        self._log("Data for testing is requested.", level=logging.DEBUG)

        data_missing = self.data_for_testing is None
        if data_missing:
            # This node must not have any training phase left
            assert not (self.is_trainable()
                        and self.get_remaining_train_phase() != 0)

            # Group the available instances by their label
            grouped = defaultdict(list)
            upstream = self.input_node.request_data_for_testing()
            for sample, tag in upstream:
                grouped[tag].append(sample)

            kept = self.balance_instances(grouped)

            # The generator below yields the test data; wrapping it in a
            # MemoizeGenerator memoizes its outputs and provides "fresh",
            # which returns a new generator yielding the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            executed = ((self.execute(data), label)
                        for (data, label) in kept)
            self.data_for_testing = MemoizeGenerator(executed,
                                                     caching=self.caching)

        self._log("Data for testing finished", level=logging.DEBUG)
        return self.data_for_testing.fresh()
Пример #34
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The cv-splits are created lazily via ``_create_splits`` when
        ``split_indices`` is still ``None``. Each call wraps all elements of
        ``data`` whose index is not contained in
        ``split_indices[current_split]`` in a new ``MemoizeGenerator``
        stored in ``data_for_training``.

        *use_test_data* is not used by this method.

        Returns a fresh copy of the stored generator.
        """
        # Create cv-splits lazily when required (identity test, so an
        # overloaded ``==`` of the stored indices is never consulted)
        if self.split_indices is None:
            self._create_splits()

        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in range(len(self.data))
            if i not in self.split_indices[self.current_split])

        return self.data_for_training.fresh()
Пример #35
0
class SimpleTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ A simple test class for unit tests

    The node does not read a stored dataset: its test data is created on
    demand in :meth:`request_data_for_testing`, and a dummy dataset with one
    run and no data entries is registered in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super(SimpleTimeSeriesSourceNode, self).__init__(*args, **kwargs)

        # We have to create a dummy dataset; it only carries the two
        # attributes assigned below
        class DummyObject(object):
            pass

        placeholder = DummyObject()
        placeholder.meta_data = {'runs': 1}
        placeholder.data = {}

        self.set_permanent_attributes(dataset=placeholder, run_number=0)

    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        The first call creates 23 ``TimeSeries`` objects (2x2 arrays filled
        with 0..22, channels "X" and "Y", sampling frequency 2), pairs each
        with a label drawn at random from "A" and "B", stores the list in
        ``time_series`` and wraps it in a caching ``MemoizeGenerator``.
        """
        # The data has been created before: hand out a fresh copy
        if self.data_for_testing is not None:
            return self.data_for_testing.fresh()

        labelled = []
        for i in range(23):
            series = TimeSeries(input_array=numpy.ones((2, 2)) * i,
                                channel_names=["X", "Y"],
                                sampling_frequency=2)
            labelled.append((series, random.choice(["A", "B"])))
        self.time_series = labelled

        # Create a generator that emits the windows
        emitted = ((sample, label) for (sample, label) in self.time_series)
        self.data_for_testing = MemoizeGenerator(emitted, caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #36
0
class SimpleTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ A simple test class for unit tests

    A dummy dataset (one run, no data entries) is registered on
    construction; the test data itself is generated inside
    :meth:`request_data_for_testing`.
    """

    def __init__(self, *args, **kwargs):
        super(SimpleTimeSeriesSourceNode, self).__init__(*args, **kwargs)

        # Stand-in for a real dataset object: a bare object that gets the
        # attributes ``meta_data`` and ``data``
        class DummyObject(object):
            pass

        dummy = DummyObject()
        dummy.meta_data = {'runs': 1}
        dummy.data = {}
        first_run = 0

        self.set_permanent_attributes(dataset=dummy, run_number=first_run)

    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        The data is produced once: 23 pairs of a ``TimeSeries`` (2x2 array
        with the constant value i, channels "X"/"Y", sampling frequency 2)
        and a label chosen at random from "A" and "B". The pairs are kept
        in ``time_series`` and served through a caching
        ``MemoizeGenerator``.
        """
        data_missing = self.data_for_testing is None
        if data_missing:
            pairs = []
            for i in range(23):
                constant_series = TimeSeries(
                    input_array=numpy.ones((2, 2)) * i,
                    channel_names=["X", "Y"],
                    sampling_frequency=2)
                chosen_label = random.choice(["A", "B"])
                pairs.append((constant_series, chosen_label))
            self.time_series = pairs

            # Create a generator that emits the windows
            stream = ((sample, label)
                      for (sample, label) in self.time_series)
            self.data_for_testing = MemoizeGenerator(stream, caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #37
0
    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        The first call wraps the pairs delivered by
        ``generate_random_data`` in a caching ``MemoizeGenerator`` and
        stores it in ``data_for_testing``; later calls reuse that wrapper.

        Returns a fresh copy of the stored generator.
        """
        # Identity test: the data is generated only while nothing is
        # stored, independent of how the stored object implements ``==``
        if self.data_for_testing is None:
            generated_data = self.generate_random_data()

            # Create a generator that emits the (sample, label) pairs
            test_data_generator = ((sample, label)
                                   for (sample, label) in generated_data)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #38
0
    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        When no data is stored yet, 23 ``TimeSeries`` objects are built
        (2x2 arrays holding the constant i for i in 0..22, channels "X" and
        "Y", sampling frequency 2) and each is paired with a label drawn at
        random from "A" and "B". The pairs are stored in ``time_series``
        and wrapped in a caching ``MemoizeGenerator``.
        """
        if self.data_for_testing is not None:
            # Nothing to create: return a fresh copy of the generator
            return self.data_for_testing.fresh()

        collected = []
        for i in range(23):
            ts = TimeSeries(input_array=numpy.ones((2, 2)) * i,
                            channel_names=["X", "Y"],
                            sampling_frequency=2)
            collected.append((ts, random.choice(["A", "B"])))
        self.time_series = collected

        # Create a generator that emits the windows
        pair_stream = ((sample, label)
                       for (sample, label) in self.time_series)
        self.data_for_testing = MemoizeGenerator(pair_stream, caching=True)

        return self.data_for_testing.fresh()
Пример #39
0
class TimeSeriesSourceNode(BaseNode):
    """ Source for windowed :class:`~pySPACE.resources.data_types.time_series.TimeSeries` saved in pickle format via :class:`~pySPACE.missions.nodes.sink.time_series_sink.TimeSeriesSinkNode`

    The node takes no parameters of its own. The dataset it reads from is
    registered via :meth:`set_input_dataset`.

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : TimeSeriesSource

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/25
    """
    input_types = ["TimeSeries"]

    def __init__(self, **kwargs):
        super(TimeSeriesSourceNode, self).__init__(**kwargs)

        # No dataset is attached until set_input_dataset is called
        self.set_permanent_attributes(dataset=None)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data """
        self.set_permanent_attributes(dataset=dataset)

    def register_input_node(self, node):
        """ Always raises: source nodes do not accept input nodes """
        raise Exception("No nodes can be registered as inputs for source nodes")

    def use_next_split(self):
        """
        Use the next split of the data into training and test data.

        Always returns False: if the input dataset has more than one
        split/run, the splits are computed in parallel, so no further
        splits are provided here.

        This method is useful for benchmarking
        """
        return False

    def train_sweep(self, use_test_data):
        """
        Always raises an exception.

        .. note:: Source nodes cannot be trained
        """
        raise Exception("Source nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """
        Returns the time windows that can be used for training of subsequent nodes

        With *use_test_data* set, the result of
        :meth:`request_data_for_testing` is returned. Otherwise the dataset
        entry ``(run, current_split, "train")`` -- or an empty generator if
        the dataset has no such entry -- is wrapped in a
        ``MemoizeGenerator`` and a fresh copy of it is returned.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # If the input dataset consists only of one single run, it is used
        # as input for all runs to be conducted (relying on later
        # randomization of the order); otherwise the data of this run
        # number is used
        several_runs = self.dataset.meta_data["runs"] > 1
        if several_runs:
            key = (self.run_number, self.current_split, "train")
            self._log("Run %s." % self.run_number)
        else:
            key = (0, self.current_split, "train")
            self._log("Run %s. Using input data of run 0." % self.run_number)

        # Check if there is training data for the current split and run
        if key in self.dataset.data.keys():
            self._log("Accessing input dataset's training time series windows.")
            train_stream = self.dataset.get_data(*key).__iter__()
        else:
            # This node does not provide any data that is explicitly
            # dedicated for training: use an immediately exhausted iterator
            self._log("No training data available.")
            train_stream = (x for x in [].__iter__())
        self.data_for_training = MemoizeGenerator(train_stream,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """
        Returns the data that can be used for testing of subsequent nodes

        On the first call the dataset entry ``(run, current_split, "test")``
        is wrapped in a ``MemoizeGenerator`` stored in
        ``data_for_testing``; later calls reuse the stored wrapper.
        """
        # The test data has been read before
        if self.data_for_testing is not None:
            return self.data_for_testing.fresh()

        self._log("Accessing input dataset's test time series windows.")
        # A single-run dataset provides its run 0 for all runs; otherwise
        # the data for this run number is used
        run_index = self.run_number \
            if self.dataset.meta_data["runs"] > 1 else 0
        key = (run_index, self.current_split, "test")

        test_stream = self.dataset.get_data(*key).__iter__()
        self.data_for_testing = MemoizeGenerator(test_stream,
                                                 caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def get_metadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node. """
        meta_data = self.dataset.meta_data
        return meta_data.get(key)

    def __del__(self):
        """ Drop the reference to the dataset when the node is deleted """
        del self.dataset
        self.dataset = None
Пример #40
0
class StreamWindowingNode(BaseNode):
    """Get a stream of time series objects and window them inside a flow.

    Node that interprets a stream of incoming time series objects as
    a raw data stream.
    The markers stored in marker_name attribute are used as the markers
    for a :class:`~pySPACE.missions.support.windower.MarkerWindower`.
    This should be done *before* any splitter, since all incoming windows
    are regarded as parts of a consecutive data stream.

    **Parameters**

     :windower_spec_file:
         The window specification file for the
         :class:`~pySPACE.missions.support.windower.MarkerWindower`.
         Used for testing and training, if windower_spec_file_train
         is not specified.

     :windower_spec_file_train:
         A separate window file for training only.
         If not specified, windower_spec_file is used for training
         and testing.

         (*optional, default: None*)

     :local_window_conf:
         Passed as second argument to ``Windower._load_window_spec``
         whenever a window specification file is loaded.

         (*optional, default: False*)

     :nullmarker_stride_ms:
         Passed as ``nullmarker_stride_ms`` to every
         :class:`~pySPACE.missions.support.windower.MarkerWindower`
         created by this node.

         (*optional, default: 1000*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : Stream_Windowing
            parameters :
                windower_spec_file : "example_lrp_window_spec.yaml"

    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/07/09
    """
    def __init__(self,
                 windower_spec_file,
                 windower_spec_file_train=None,
                 local_window_conf=False,
                 nullmarker_stride_ms=1000,
                 *args,
                 **kwargs):
        super(StreamWindowingNode, self).__init__(*args, **kwargs)

        # Without a dedicated training spec, training uses the same window
        # specification as testing
        if windower_spec_file_train is None:
            windower_spec_file_train = windower_spec_file

        self.set_permanent_attributes(
            client=None,
            marker_windower=None,
            window_definition=None,
            local_window_conf=local_window_conf,
            windower_spec_file=windower_spec_file,
            windower_spec_file_train=windower_spec_file_train,
            nullmarker_stride_ms=nullmarker_stride_ms)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        The window definition of the training spec file is loaded on every
        call. Stored training data is handed out as a fresh copy. If
        nothing is stored yet, the training data of the input node is
        windowed (*use_test_data* false) or the result of
        :meth:`request_data_for_testing` is stored and returned
        (*use_test_data* true).
        """

        # set window definition for train phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file_train,
                                       self.local_window_conf)

        self._log("Requesting train data...")
        if self.data_for_training is not None:
            return self.data_for_training.fresh()

        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            # NOTE(review): the stored object is what
            # request_data_for_testing() returns (already the result of a
            # .fresh() call); later calls invoke .fresh() on it -- confirm
            # that object supports that
            self.data_for_training = self.request_data_for_testing()
            return self.data_for_training

        # Get the training data (with labels) of the input node
        train_data = list(
            self.input_node.request_data_for_training(
                use_test_data=use_test_data))
        # If the training data is an empty list
        if train_data == []:
            self.data_for_training = MemoizeGenerator(
                (x for x in [].__iter__()), caching=True)
            return self.data_for_training.fresh()

        # create the stream of windows
        self.window_stream(train_data)

        # Create a generator that emits the windows
        train_data_generator = ((sample, label)
                                for (sample, label) in self.marker_windower)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=True)
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        On the first call the window definition of ``windower_spec_file``
        is loaded, the test data of the input node is read into a list and
        windowed via :meth:`window_stream`, and the emitted windows are
        wrapped in a ``MemoizeGenerator`` stored in ``data_for_testing``.
        Later calls reuse the stored wrapper.
        """
        if self.data_for_testing is None:
            # set window definition for test phase windower file
            self.window_definition = \
                Windower._load_window_spec(self.windower_spec_file,
                                           self.local_window_conf)
            test_data = list(self.input_node.request_data_for_testing())

            # create stream of windows
            self.window_stream(test_data)

            # Create a generator that emits the windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in self.marker_windower)

            self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this
        node.

        The items delivered by ``input_node.process()`` are passed lazily
        through ``execute`` and fed as a stream into a
        :class:`~pySPACE.missions.support.windower.MarkerWindower`; the
        windows it emits are wrapped in a ``MemoizeGenerator`` that is
        stored in ``data_for_testing``.
        """
        assert self.input_node is not None, "No input node specified!"
        # Assert  that this node has already been trained
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0), "Node not trained!"

        # Load a window definition only if none is set yet; the training
        # spec is used while the node is in training
        if self.window_definition is None:
            if self.is_training() \
                    and self.windower_spec_file_train is not None:
                spec_file = self.windower_spec_file_train
            else:
                spec_file = self.windower_spec_file
            self.window_definition = \
                Windower._load_window_spec(spec_file, self.local_window_conf)

        # A generator expression keeps the stream lazy and, unlike a
        # tuple-unpacking lambda, is valid syntax on Python 2 and Python 3
        data_generator = ((self.execute(data), label)
                          for (data, label) in self.input_node.process())

        self.client = TimeSeriesClient(ts_stream=data_generator)
        self.client.set_window_defs(self.window_definition)

        self.client.connect()
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms)

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)

        self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def window_stream(self, data):
        """ Set up client and windower for the given iterable *data*

        Creates a ``TimeSeriesClient`` that streams *data* and a
        ``MarkerWindower`` working on that client with the current
        ``window_definition``. Both are stored on the node (``client``,
        ``marker_windower``); nothing is returned.
        """
        self.client = TimeSeriesClient(ts_stream=iter(data))

        self.client.connect()
        self.client.set_window_defs(self.window_definition)
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms)

    def __getstate__(self):
        """ Return a pickable state for this object

        The window definition is reset to None before the state of the
        base class is requested.
        """
        self.window_definition = None
        return super(StreamWindowingNode, self).__getstate__()

    def get_output_type(self, input_type, as_string=True):
        """ Return the output type: "TimeSeries" or the TimeSeries class

        *input_type* is not evaluated. With *as_string* the type name is
        returned as a string, otherwise the class object itself.
        """
        from pySPACE.resources.data_types.time_series import TimeSeries
        if as_string:
            return "TimeSeries"
        else:
            return TimeSeries
Пример #41
0
class TrainTestSplitterNode(BaseNode):
    """ Split data into one training and one test data set with a fixed ratio

    The relative
    size of the two sets is controlled via the parameter train_ratio.

    .. warning:: the class ratio is not retained

    .. todo::
        introduce stratified parameter as in CV_Splitter

    **Parameters**

     :train_ratio:
         The ratio of the overall available data that is assigned to the
         training set. The remaining data (1-train_ratio) is used for testing.

         (*optional, default: 0.5*)

     :num_train_instances:
         Instead of specifying a train_ratio, this option allows to specify the
         absolute number of training instances of class *class_label* that
         should be in the training set. All instances that occur until
         *num_train_instances* are found are used for training. The remaining
         data are used for testing.

         (*optional, default: None*)

     :class_label:
         If *num_train_instances*-option is used, this string determines the
         class of which training examples are counted.

         (*optional, default: 'Target'*)

     :random:
         If *False*, the order of the data is retained. I.e. the train_ratio
         instances are used for training and the remaining as test data. If
         *True*, the two sets are sampled randomly from the data without
         taking into consideration the data's order.

         (*optional, default: True*)

     :reverse:
         If *True*, the order of the data is inverted before the
         *num_train_instances* training instances are counted. Must not be
         combined with *random*.

         .. note:: With *random* set to *False* and a non-zero
                   *num_train_instances*, the split is taken directly from
                   the input stream (see ``_create_split``) and the reversal
                   is not applied.

         (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : TrainTestSplitter
            parameters :
                  train_ratio : 0.7
                  random : False

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/08 (Documentation, old node)
    :LastChange: 2011/11/14 (Documentation) Anett Seeland
    """

    def __init__(self, train_ratio=0.5, random=True,
                 num_train_instances=None, class_label='Target', reverse=False,
                 **kwargs):
        super(TrainTestSplitterNode, self).__init__(**kwargs)
        assert(not(random and reverse)),"Reverse ordering makes no sense when randomization is active!"
        self.set_permanent_attributes(train_ratio=train_ratio,
                                      random=random,
                                      num_train_instances=num_train_instances,
                                      class_label=class_label,
                                      reverse=reverse,
                                      train_data=None,
                                      test_data=None)

    def is_split_node(self):
        """ Returns whether this is a split node. """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.
        This node provides a single split only, so the result is always False.

        This method is useful for benchmarking
        """
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the training part of the split for subsequent nodes

        The split is created on first use. *use_test_data* is not evaluated.
        """
        # Create split lazily when required
        if self.train_data is None:
            self._create_split()

        # Create training data generator
        self.data_for_training = \
                MemoizeGenerator(instance for instance in self.train_data)

        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the test part of the split for subsequent nodes

        The split is created on first use.
        """
        # Create split lazily when required
        if self.test_data is None:
            self._create_split()

        # Create test data generator
        self.data_for_testing = \
                MemoizeGenerator(instance for instance in self.test_data)

        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data.

        The result is stored in ``self.train_data`` and ``self.test_data``.

        Raises an Exception if the input node already delivers training data,
        since that means the data has been split before.

        If *num_train_instances* is non-zero and *random* is False, the first
        *num_train_instances* items of the input test stream (independent of
        their label) become the training data and the remaining, not yet
        consumed, stream becomes the test data.
        """
        self._log("Splitting data into train and test data")
        train_data = list(self.input_node.request_data_for_training(use_test_data=False))

        # If there is already a  non-empty training set,
        # it means that we are not  the first split node in the node chain.
        if train_data:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a  data set that is already "
                            "split)")

        # Create generator instead of loading all data
        if self.num_train_instances and not self.random:
            self.train_data = []
            # The request method has to be *called* to obtain the generator
            input_generator = self.input_node.request_data_for_testing()
            for _ in range(self.num_train_instances):
                self.train_data.append(next(input_generator))
            self.test_data = input_generator
            return

        # Gather all test data
        test_data = list(self.input_node.request_data_for_testing())

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        data = train_data + test_data
        data_size = len(data)

        # Randomize order if randomization is not switched off
        if self.random:
            r = random.Random(self.run_number)
            r.shuffle(data)

        if self.num_train_instances is not None:
            if self.reverse:
                data = data[::-1]
            num_class_instances = sum(1 for item in data
                                      if item[1] == self.class_label)
            if num_class_instances == self.num_train_instances:
                # Exactly the requested number is available: train on all data
                train_end = data_size
            else:
                counter = 0
                for (index, (window, label)) in enumerate(data):
                    if label == self.class_label:
                        counter += 1
                    if counter == self.num_train_instances:
                        train_end = index+1
                        break
                assert(self.num_train_instances==counter), \
                            "Too many instances to select."
        else:
            # Split data into train and test data according train_ratio
            train_end = int(round(data_size * self.train_ratio))

        self.train_data=data[0:train_end]
        self.test_data=data[train_end:]
class RandomTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ Generate random data and act as a source for windowed TimeSeries

    This node acts as a source for windowed TimeSeries. The TimeSeries
    are generated randomly according to the given parameters and
    forwarded.

    The time series are generated according to the given generating function,
    and the class label by a uniform distribution according with a given threshold
    Only two classes are supported by now.

    **Parameters**

        :num_instances:
            The number of instances to be generated.

            (*optional, default: 20*)

        :generating_function_class_0:
            A function to generate data for class 0.
            Receives an index, which states the
            number of already generated samples.

            (*optional, default: lambda i: numpy.ones((2,2))*i*)

        :generating_function_class_1:
            A function to generate data for class 1.
            Receives an index, which states the
            number of already generated samples.

            (*optional, default: lambda i: numpy.ones((2,2))*i*)

        :channel_names: Channel names of the time series objects.

            (*optional, default: ["X", "Y"]*)

        :class_labels: The class labels of the generated time series.
            Exactly two labels are required.

            (*optional, default: ['A', 'B']*)

        :class_choice_function:
            A callable without arguments. Its return value is compared
            against *choice_threshold* to select the class of each instance.

            (*optional, default: random.random*)

        :choice_threshold:
            The threshold class assignment. The classes are
            generated randomly by generating a random number r
            between 0 and 1. If r < threshold, the class label is
            class_labels[0], and class_labels[1] otherwise.

            (*optional, default: 0.33*)

        :sampling_frequency:
            Sampling frequency of the generated time series.

            (*optional, default: 2*)

        :random:
            If true, the order of the data is randomly shuffled.

            (*optional, default: True*)

            .. note:: NOTE(review): *random* is not a named argument of
                      ``__init__`` and is not read anywhere in this class;
                      if given, it ends up in ``**kwargs`` - confirm whether
                      the base class handles it.

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : RandomTimeSeriesSource

    :Author: Hendrik Woehrle ([email protected])
    :Created: 2010/09/22
    """
    def __init__(self,
                 num_instances=20,
                 generating_function_class_0=lambda i: numpy.ones((2, 2)) * i,
                 generating_function_class_1=lambda i: numpy.ones((2, 2)) * i,
                 channel_names=["X", "Y"],
                 class_labels=['A', 'B'],
                 class_choice_function=random.random,
                 choice_threshold=0.33,
                 sampling_frequency=2,
                 **kwargs):
        super(RandomTimeSeriesSourceNode, self).__init__(**kwargs)

        # We have to create a dummy collection
        class DummyObject(object):
            pass

        collection = DummyObject()
        collection.meta_data = {'runs': 1}
        collection.data = {}

        # only binary classification supported by now
        assert (len(class_labels) == 2)

        # The list defaults above are shared between calls; they are only
        # stored here and never modified in this class.
        self.set_permanent_attributes(
            collection=collection,
            num_instances=num_instances,
            generating_function_class_0=generating_function_class_0,
            generating_function_class_1=generating_function_class_1,
            channel_names=channel_names,
            class_labels=class_labels,
            class_choice_function=class_choice_function,
            choice_threshold=choice_threshold,
            sampling_frequency=sampling_frequency)

    def generate_random_data(self):
        """ Create *num_instances* labeled time series

        For every index the class choice function is evaluated once. Values
        below *choice_threshold* select the generating function and label of
        class 0, all other values those of class 1.

        Returns a list of (TimeSeries, label) tuples. This method is invoked
        by the train and test data generation functions.
        """
        generated_data = []

        for i in range(self.num_instances):
            choice = self.class_choice_function()

            if choice < self.choice_threshold:
                input_array = self.generating_function_class_0(i)
                label = self.class_labels[0]
            else:
                input_array = self.generating_function_class_1(i)
                label = self.class_labels[1]

            generated_data.append(
                (TimeSeries(input_array=input_array,
                            channel_names=self.channel_names,
                            sampling_frequency=self.sampling_frequency),
                 label))
        return generated_data

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The data is generated on the first call and memoized, so every call
        returns a fresh generator over the same instances.
        """
        # If we haven't generated the data for testing yet
        if self.data_for_testing is None:

            generated_data = self.generate_random_data()

            # Create a generator that emits the generated instances
            test_data_generator = ((sample, label) \
                                     for (sample, label) in generated_data)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        If *use_test_data* is true, the test data is returned. Otherwise a
        separate set of instances is generated on the first call and
        memoized for later calls.
        """
        if use_test_data:
            return self.request_data_for_testing()

        # If we haven't generated the data for training yet
        if self.data_for_training is None:

            generated_data = self.generate_random_data()

            # Create a generator that emits the generated instances
            train_data_generator = ((sample, label) \
                                     for (sample, label) in generated_data)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=True)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def get_metadata(self, key):
        """ This source node does not contain collection meta data. """
        return None
class StreamWindowingNode(BaseNode):
    """Get a stream of time series objects and window them inside a flow.

    Node that interprets a stream of incoming time series objects as
    a raw data stream.
    The markers stored in marker_name attribute are used as the markers
    for a :class:`~pySPACE.missions.support.windower.MarkerWindower`.
    This should done *before* any splitter, since all incoming windows
    are regarded as parts of a consecutive data stream.

    **Parameters**

     :windower_spec_file:
         The window specification file for the
         :class:`~pySPACE.missions.support.windower.MarkerWindower`.
         Used for testing and training, if windower_spec_file_train
         is not specified.

     :windower_spec_file_train:
         A separate window file for training only.
         If not specified, windower_spec_file is used for training
         and testing.

         (*optional, default: None*)

     :local_window_conf:
         Forwarded as second argument to ``Windower._load_window_spec``
         whenever a window specification file is loaded.

         (*optional, default: False*)

     :nullmarker_stride_ms:
         Forwarded to the ``MarkerWindower`` as *stridems*.

         (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : Stream_Windowing
            parameters :
                windower_spec_file : "example_lrp_window_spec.yaml"

    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/07/09
    """
    def __init__(self,
                 windower_spec_file,
                 windower_spec_file_train = None,
                 local_window_conf=False,
                 nullmarker_stride_ms=None,
                 *args,
                 **kwargs):
        super(StreamWindowingNode, self).__init__(*args, **kwargs)

        if windower_spec_file_train is None:
            windower_spec_file_train = windower_spec_file

        self.set_permanent_attributes(client = None,
                                      marker_windower = None,
                                      window_definition = None,
                                      local_window_conf = local_window_conf,
                                      windower_spec_file = windower_spec_file,
                                      windower_spec_file_train = windower_spec_file_train,
                                      nullmarker_stride_ms=nullmarker_stride_ms)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        The training window specification is loaded on every call. If
        windowed training data has been produced before, a fresh generator
        over it is returned. If *use_test_data* is true, the windowed test
        data is returned instead. Otherwise the training data of the input
        node is windowed and memoized.
        """
        # set window definition for train phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file_train,
                                       self.local_window_conf)

        self._log("Requesting train data...")
        if self.data_for_training is not None:
            return self.data_for_training.fresh()

        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training. The result already is a fresh
            # generator, so it must neither be stored as memoizing object
            # nor asked for another fresh() copy.
            return self.request_data_for_testing()

        # Get training data (with labels)
        train_data = \
            list(self.input_node.request_data_for_training(use_test_data=use_test_data))
        # If there is no training data, provide an empty memoized stream
        if train_data == []:
            self.data_for_training=MemoizeGenerator(
                (x for x in [].__iter__()), caching=True)
            return self.data_for_training.fresh()

        # create stream of windows
        self.window_stream(train_data)

        # Create a generator that emits the windows
        train_data_generator = ((sample, label) for (sample, label)
                                in self.marker_windower)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=True)
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        On the first call the test window specification is loaded, the test
        data of the input node is windowed and the result is memoized. Every
        call returns a fresh generator over the windows.
        """
        if self.data_for_testing is None:
            # set window definition for test phase windower file
            self.window_definition = \
                Windower._load_window_spec(self.windower_spec_file,
                                           self.local_window_conf)
            test_data = list(self.input_node.request_data_for_testing())

            # create stream of windows
            self.window_stream(test_data)

            # Create a generator that emits the windows
            test_data_generator = ((sample, label) \
                                   for (sample, label) in self.marker_windower)

            self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this
        node. The result replaces ``self.data_for_testing``.

        NOTE(review): ``self.window_definition`` is whatever the last
        ``request_data_for_*`` call loaded (None if neither was called
        before) - confirm that callers guarantee a prior request.
        """
        assert(self.input_node is not None), "No input node specified!"
        # Assert  that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0), "Node not trained!"

        # Lazily apply this node to the output of the input node
        data_generator = ((self.execute(data), label)
                          for (data, label) in self.input_node.process())

        self.client = TimeSeriesClient(ts_stream = data_generator)

        self.client.connect()
        self.marker_windower = MarkerWindower(data_client=self.client,
                                              windowdefs=self.window_definition,
                                              stridems=self.nullmarker_stride_ms)

        # Create a generator that emits the windows
        test_data_generator = ((sample, label) \
                               for (sample, label) in self.marker_windower)

        self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def window_stream(self, data):
        """ Create the windower for the given data

        Creates a windower that splits the given data into windows
        based on the window definitions provided
        and assigns correct labels to these windows. Client and windower are
        stored in ``self.client`` and ``self.marker_windower``.
        """
        self.client = TimeSeriesClient(ts_stream = iter(data))

        self.client.connect()
        self.marker_windower = MarkerWindower(data_client=self.client,
                                              windowdefs=self.window_definition,
                                              stridems=self.nullmarker_stride_ms)
Пример #44
0
class InstanceSelectionNode(BaseNode):
    """Retain only a certain percentage of the instances

    The node InstanceSelectionNode forwards only
    *train_percentage_selected* percent of the training instances passed to
    it to the successor node and only
    *test_percentage_selected* percent of the test instances. The forwarded
    instances are selected randomly but so that the class ratio is kept.

    If *reduce_class* is used, only the chosen class is reduced, without
    keeping the class ratio. So the total amount of reduced data does not match
    the percentage values.

    **Parameters**
        :train_percentage_selected:
            The percentage of training instances which
            is forwarded to successor node.

            (*optional, default: 100*)

        :test_percentage_selected:
            The percentage of test instances which
            is forwarded to successor node.

            (*optional, default: 100*)

        :reduce_class:
            If you want only to reduce one class, choose this parameter
            otherwise, both classes are reduced in a balanced fashion.

            (*optional, default: False*)

    **Exemplary call**

    .. code-block:: yaml

        -
            node : InstanceSelection
            parameters :
                train_percentage_selected : 80
                test_percentage_selected : 100
                reduce_class : Standard

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/31
    """
    def __init__(self, train_percentage_selected=100,
                 test_percentage_selected=100,
                 reduce_class=False,
                 **kwargs):
        super(InstanceSelectionNode, self).__init__(**kwargs)

        self.set_permanent_attributes(
            train_percentage_selected=train_percentage_selected,
            test_percentage_selected=test_percentage_selected,
            reduce_class=reduce_class)

    def _select_instances(self, all_instances, percentage):
        """ Pick *percentage* percent of the instances of every label

        *all_instances* maps each label to the list of its instances; the
        lists are shuffled in place. If *reduce_class* is set, only the
        instances of that class are reduced and all other classes are kept
        completely.

        Returns a shuffled list of (instance, label) tuples. The random
        generator is seeded with the run number.
        """
        r = random.Random(self.run_number)
        retained_instances = []
        for label, instances in all_instances.items():
            # enable random choice of samples
            r.shuffle(instances)
            if self.reduce_class and not (self.reduce_class == label):
                # Another class is the one to be reduced: keep everything
                end_index = len(instances)
            else:
                # Divide by a float so that the quotient is not truncated
                # by integer division before it is rounded
                end_index = int(round(len(instances) * percentage / 100.0))

            retained_instances.extend(zip(instances[0:end_index],
                                          [label]*end_index))
        # mix up samples between the different labels
        r.shuffle(retained_instances)
        return retained_instances

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        A percentage above 100 is logged as error and reduced to 100. With
        100 percent the request is delegated to the parent class. Otherwise
        the selection is computed on the first call and memoized.

        .. note::
              This method works differently in InstanceSelectionNode
              than in other nodes: Only *percentage_selected* of the available
              data are returned.
        """
        assert(self.input_node is not None)
        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100." %
                      self.train_percentage_selected,
                      level=logging.ERROR)
            self.train_percentage_selected = 100
        self._log("Data for training is requested.", level=logging.DEBUG)

        if self.train_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_training(
                use_test_data)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of training data" %
                      self.train_percentage_selected,
                      level=logging.DEBUG)
            # Retain only *percentage_selected* percent of the data
            retained_instances = self._select_instances(
                all_instances, self.train_percentage_selected)

            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that will
            # yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        A percentage above 100 is logged as error and reduced to 100. With
        100 percent the request is delegated to the parent class. Otherwise
        the selection is computed on the first call and memoized.
        """
        assert(self.input_node is not None)
        if self.test_percentage_selected > 100:
            self._log("Test percentage of %f reduced to 100." %
                      self.test_percentage_selected,
                      level=logging.ERROR)
            self.test_percentage_selected = 100
        self._log("Data for testing is requested.", level=logging.DEBUG)

        if self.test_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_testing()

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert  that this node has already been trained
            assert(not self.is_trainable() or
                   self.get_remaining_train_phase() == 0)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of test data" %
                      self.test_percentage_selected,
                      level=logging.DEBUG)
            # Retain only *percentage_selected* percent of the data
            retained_instances = self._select_instances(
                all_instances, self.test_percentage_selected)

            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def _execute(self, time_series):
        """ Pass the kept instance through unchanged """
        return time_series  # We don't do anything with the kept instances
Пример #45
0
class ConsumeTrainingDataNode(BaseNode):
    """ Split training data for internal usage and usage of successor nodes

    This node allows to handle situations where some model needs to be trained
    and later on evaluated on the given training data (using test data may not
    be allowed for certain reasons). Simply training and evaluating the model
    on the same data is not an option, since the evaluation would have a strong
    optimistic bias (model is well adapted to the data it was trained on).

    One example of such a situation is when a node chain is trained on the
    data that should be combined later on with an ensemble of node chains
    trained on historic data. The ensemble training should not happen on the
    same data as training.

    This node therefore splits the training data into two parts: one for
    internal use (training the model) and one for usage of successor nodes
    (model evaluation). The ratio of training data that should be used
    internally can be controlled with the argument *consumption_rate* (a value
    between 0.0 and 1.0).

    .. note::
            When defining this node in the pySPACE YAML syntax, "wrapped_node"
            can be the definition of a node in YAML syntax (see below).
            The node object is then created automatically based on this
            definition.

    **Parameters**

     :wrapped_node:
         The node that is trained with the internally used training data.

     :consumption_rate:
        The rate of training data that is used internally for training
        *wrapped_node*. The remaining data is supplied for the successor nodes.

     :random_seed:
        The seed of the random generator. Defaults to 0.


    **Exemplary Call**


    .. code-block:: yaml

        -
            node: ConsumeTrainingData
            parameters :
                 consumption_rate : 0.8
                 wrapped_node :
                      node : Flow_Node
                      parameters :
                           input_dim : 64
                           output_dim : 1
                           nodes :
                              ......

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/08/06
    """
    def __init__(self,
                 wrapped_node,
                 consumption_rate,
                 random_seed=0,
                 *args,
                 **kwargs):
        # Has to be available before the base class is initialized:
        # necessary to determine whether this node is trainable.
        self.wrapped_node = wrapped_node
        super(ConsumeTrainingDataNode, self).__init__(*args, **kwargs)

        self.set_permanent_attributes(wrapped_node=wrapped_node,
                                      consumption_rate=consumption_rate,
                                      internal_training_set=[],
                                      external_training_set=[],
                                      r=random.Random(random_seed))

    @staticmethod
    def node_from_yaml(node_spec):
        """ Creates a node based on the node_spec to overwrite default

        *node_spec* must contain a "parameters" mapping with a "wrapped_node"
        entry; that entry is turned into a node object with
        ``BaseNode.node_from_yaml`` and all remaining parameters are passed
        to the constructor as keyword arguments.

        The given *node_spec* is not modified.
        """
        assert ("parameters" in node_spec
                and "wrapped_node" in node_spec["parameters"]), \
            "ConsumeTrainingDataNode requires specification of a wrapped node!"
        # Work on a shallow copy: removing "wrapped_node" from the caller's
        # specification would make the specification unusable a second time.
        parameters = dict(node_spec["parameters"])
        wrapped_node = BaseNode.node_from_yaml(parameters.pop("wrapped_node"))
        # Create the node object
        return ConsumeTrainingDataNode(wrapped_node=wrapped_node,
                                       **parameters)

    def is_trainable(self):
        """ Returns whether this node is trainable (as the wrapped node) """
        return self.wrapped_node.is_trainable()

    def is_supervised(self):
        """ Returns whether the wrapped node requires supervised training """
        return self.wrapped_node.is_supervised()

    def _get_train_set(self, use_test_data=False):
        """ Returns the data that can be used for training

        The training data of the input node is grouped by label. For every
        label the instances are shuffled with the random generator of this
        node; the leading *consumption_rate* share is put into
        ``internal_training_set`` and the rest into ``external_training_set``.
        Both attributes are rebuilt on every call and only the internal set
        is returned.
        """
        # We take data that is provided by the input node for training
        # NOTE: This might involve training of the preceding nodes
        train_set = list(
            self.input_node.request_data_for_training(use_test_data))

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in train_set:
            all_instances[label].append(instance)

        # Split into training data used internally and training data that is
        # available for successor nodes
        self.internal_training_set = []
        self.external_training_set = []
        for label, instances in all_instances.items():
            self.r.shuffle(instances)
            split_index = int(round(len(instances) * self.consumption_rate))
            self.internal_training_set.extend(
                [(instance, label) for instance in instances[:split_index]])
            self.external_training_set.extend(
                [(instance, label) for instance in instances[split_index:]])

        return self.internal_training_set

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        On the first call the node is trained via ``train_sweep``. Afterwards
        the instances of ``external_training_set`` (i.e. the part of the
        training data that was *not* consumed internally) are passed through
        ``execute`` and wrapped into a ``MemoizeGenerator``. Every call
        returns a fresh generator over that same sequence.
        """
        assert (self.input_node is not None)

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            train_data_generator = (
                (self.execute(data), label)
                for (data, label) in self.external_training_set)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def _train(self, data, label):
        """ Trains the wrapped nodes on the given data vector *data* """
        self.wrapped_node.train(data, label)

    def _stop_training(self):
        """ Finish the training of the node."""
        self.wrapped_node.stop_training()

    def _execute(self, data):
        """ Executes the node on the given data vector *data* """
        return self.wrapped_node.execute(data)

    def store_state(self, result_dir, index=None):
        """ Stores this node in the given directory *result_dir*

        Storing is delegated to the wrapped node. The *index* given by the
        caller is forwarded (it used to be silently replaced by ``None``).
        """
        self.wrapped_node.store_state(result_dir, index=index)

    def get_output_type(self, input_type, as_string=True):
        """ Return the output type

        The method calls the corresponding method in the wrapped node
        """
        return self.wrapped_node.get_output_type(input_type, as_string)
Пример #46
0
class CrossValidationSplitterNode(BaseNode):
    """ Perform (stratified) cross-validation

    During benchmarking, n pairs of training and test data are generated, where
    n is configurable via the parameter splits. The n test datasets are pairwise
    disjunct. Internally, the available data is partitioned into n pairwise
    disjunct sets s_1, ..., s_n of equal size (the "splits"). The i-th pair of
    training and test data is generated by using s_i as test data and the
    union of the remaining datasets as training data.

    The partitioning is stratified per default, i.e. the splits have the same
    class ratio as the overall dataset. Per default, the partitioning is based
    on shuffling the data randomly. In this case, the partitioning of the data
    into s_1, ..., s_n is determined solely based on the run number (used as
    random seed), yielding the same split for the same run_number and different
    ones for two different run_numbers.

    **Parameters**

      :splits:
            The number of splits created internally. If n data points exist and
            m splits are created, each of these splits consists of approx. n/m
            data points.

            (*optional, default: 10*)

      :stratified:
         If true, the cross-validation is stratified, i.e. the overall
         class-ratio is retained in each split (as good as possible).

         (*optional, default: True*)

      :random:
         If true, the order of the data is randomly shuffled.

         (*optional, default: True*)

      :time_dependent:
         If True splitting is done separately for different (= not
         overlapping) time windows to ensure that instances corresponding to the
         same marker will be in the same split.

         .. note:: Stratification is only allowed here if there is only one
                   class label for one marker.

         (*optional, default: False*)

      :stratified_class:

         If *time_dependent* is True and *stratified_class* is specified
         stratification is only done for the specified class label (String).
         The other class is filling the split preserving the time order of the
         data. This also means that *random* has no effect here.

         (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : CV_Splitter
            parameters :
                  splits : 10
                  stratified : True

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/12/16
    """
    def __init__(self,
                 splits=10,
                 stratified=True,
                 random=True,
                 time_dependent=False,
                 stratified_class=None,
                 *args,
                 **kwargs):
        super(CrossValidationSplitterNode, self).__init__(*args, **kwargs)

        self.set_permanent_attributes(
            splits=int(splits),  # how many splits
            current_split=0,  # current split for testing
            split_indices=None,  # created lazily by _create_splits
            run_number=-1,
            random=random,
            stratified=stratified,
            stratified_class=stratified_class,
            time_dependent=time_dependent)

    def is_split_node(self):
        """ Return whether this is a split node """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.

        This method is useful for benchmarking
        """
        if self.current_split + 1 < self.splits:
            self.current_split = self.current_split + 1
            self._log("Benchmarking with split %s/%s" %
                      (self.current_split + 1, self.splits))
            return True
        else:
            return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The training data consists of all stored instances whose index is
        not part of the current split. The splits are created on the first
        request. *use_test_data* is not evaluated by this node.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split. The indices are
        # put into a set once: this gives constant time lookups and pins the
        # generator to the split that is current at request time.
        test_indices = set(self.split_indices[self.current_split])
        self.data_for_training = MemoizeGenerator(
            instance for (index, instance) in enumerate(self.data)
            if index not in test_indices)

        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The test data consists of exactly those stored instances whose index
        belongs to the current split. The splits are created on the first
        request.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices[self.current_split])

        return self.data_for_testing.fresh()

    def _split_bounds(self, split_number, data_size):
        """ Return the half-open interval [start, end) of split *split_number*

        *data_size* elements are distributed as evenly as possible over
        ``self.splits`` consecutive intervals.
        """
        split_start = int(round(
            float(split_number) * data_size / self.splits))
        split_end = int(round(
            float(split_number + 1) * data_size / self.splits))
        return split_start, split_end

    def _create_splits(self):
        """ Create the split of the data for n-fold  cross-validation

        All data of the input node is stored in ``self.data`` and
        ``self.split_indices`` is set to a list with one list of data indices
        per split (the indices reserved for testing in that split).

        Raises an exception if the input node already delivers a non-empty
        training set, i.e. if the data has been split before.
        """
        self._log("Creating %s splits for cross validation" % self.splits)

        # Get training and test data (with labels)
        train_data = list(
            self.input_node.request_data_for_training(use_test_data=False))
        test_data = list(self.input_node.request_data_for_testing())

        # If there is already a non-empty training set,
        # it means that we are not the first split node in the node chain
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is "
                            "already split)")

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data

        # Determine which data points are reserved for testing in which
        # cross validation run
        if self.time_dependent:
            split_indices = self._create_time_dependent_splits()
        elif self.stratified:
            split_indices = self._create_stratified_splits()
        else:
            split_indices = self._create_unstratified_splits()

        self.split_indices = split_indices

        self._log("Benchmarking with split %s/%s" %
                  (self.current_split + 1, self.splits))

    def _create_time_dependent_splits(self):
        """ Return splits in which overlapping windows stay together

        The data is sorted by start time and grouped into "markers": a new
        marker starts whenever a window starts after the end of its
        predecessor. Whole markers are then distributed over the splits.

        If windows with different labels share one marker, a warning is
        issued and *stratified* and *stratified_class* are switched off.
        """
        import warnings

        # sort the data according to start_time
        self.data.sort(key=lambda swindow: swindow[0].start_time)
        # divide the data with respect to the time_point
        data_time = dict()  # marker number -> indices of its windows
        label_marker = dict()  # label -> markers that carry this label
        last_window_end_time = 0.0
        marker = -1
        for (index, (window, label)) in enumerate(self.data):
            # The very first window always opens a marker; otherwise a
            # window starting at (or before) time 0 would be appended to a
            # marker that does not exist yet.
            if marker < 0 or window.start_time > last_window_end_time:
                marker += 1
                data_time[marker] = [index]
                if self.stratified or self.stratified_class:
                    label_marker.setdefault(label, []).append(marker)
            else:
                data_time[marker].append(index)
                # check label consistency for later stratification
                if (self.stratified or self.stratified_class) and \
                        self.data[data_time[marker][0]][1] != label:
                    warnings.warn(
                        "Since there are several class labels"
                        " for one marker stratification is set to False.",
                        UserWarning)
                    self.stratified = False
                    self.stratified_class = None
            last_window_end_time = window.end_time

        if self.stratified:  # each marker has only one label
            # not more splits then markers of every class!
            assert (min(
                [len(markers)
                 for markers in label_marker.values()]) >= self.splits), \
                "More splits requested than markers of one class available"
            split_indices = [[] for _ in range(self.splits)]
            # determine the splits of the data
            for label, markers in label_marker.items():
                data_size = len(markers)
                # Set random seed and randomize the order of the data
                if self.random:
                    r = random.Random(self.run_number)
                    r.shuffle(markers)
                for j in range(self.splits):
                    split_start, split_end = \
                        self._split_bounds(j, data_size)
                    for i in range(split_start, split_end):
                        split_indices[j].extend(data_time[markers[i]])

        elif self.stratified_class:
            class_markers = label_marker[self.stratified_class]
            data_size = len(class_markers)
            # Every split needs at least one marker of the stratified class,
            # otherwise the fill-up below has no anchor for that split.
            assert (data_size >= self.splits), \
                "More splits requested than markers of the stratified class"
            split_indices = [[] for _ in range(self.splits)]
            # determine the splits of the data (time order is preserved)
            for j in range(self.splits):
                split_start, split_end = self._split_bounds(j, data_size)
                for i in range(split_start, split_end):
                    split_indices[j].extend(data_time[class_markers[i]])
            # fill up with other classes: every split receives the foreign
            # instances located before its last stratified instance and
            # behind the last stratified instance of the preceding split
            last_max_index = 0
            for split_list in split_indices:
                max_index = max(split_list)
                for i in range(last_max_index, max_index):
                    if self.data[i][1] != self.stratified_class:
                        split_list.append(i)
                last_max_index = max_index + 1
            # the remaining tail of the data goes into the last split
            for i in range(last_max_index, len(self.data)):
                if self.data[i][1] != self.stratified_class:
                    split_indices[-1].append(i)

        else:
            # we should not have more splits then (marker)time points
            data_size = len(data_time)
            assert (data_size >= self.splits), \
                "More splits requested than markers available"

            # Set random seed and randomize the order of the data
            indices = list(data_time.keys())
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(indices)

            # determine the splits of the data
            split_indices = []
            for i in range(self.splits):
                split_indices.append([])
                split_start, split_end = self._split_bounds(i, data_size)
                for j in range(split_start, split_end):
                    split_indices[i].extend(data_time[indices[j]])

        # avoid sorted labels by sorting time dependent
        return [sorted(split_list) for split_list in split_indices]

    def _create_stratified_splits(self):
        """ Return splits that retain the class ratio of the whole data

        If one class has fewer instances than splits are requested,
        ``self.splits`` is reduced to the size of the smallest class.
        """
        # divide the data with respect to the class_label
        data_labeled = dict()
        for (index, (window, label)) in enumerate(self.data):
            data_labeled.setdefault(label, []).append(index)

        # we should not have more splits then instances of every class!
        min_nr_per_class = min(
            [len(indices) for indices in data_labeled.values()])
        if self.splits > min_nr_per_class:
            self.splits = min_nr_per_class
            self._log("Reducing number of splits to %s since no more "
                      "instances of one of the classes are available." %
                      self.splits,
                      level=logging.CRITICAL)

        split_indices = [[] for _ in range(self.splits)]
        # determine the splits of the data
        for label, indices in data_labeled.items():
            data_size = len(indices)
            # Set random seed and randomize the order of the data
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(indices)
            for j in range(self.splits):
                split_start, split_end = self._split_bounds(j, data_size)
                split_indices[j].extend(indices[split_start:split_end])
        # avoid sorted labels
        for j in range(self.splits):
            r = random.Random(self.run_number)
            r.shuffle(split_indices[j])
        return split_indices

    def _create_unstratified_splits(self):
        """ Return splits that ignore the class labels """
        data_size = len(self.data)
        # We cannot have more splits than data points
        assert (data_size >= self.splits), \
            "More splits requested than data points available"

        # Set random seed and randomize the order of the data
        indices = list(range(data_size))
        if self.random:
            r = random.Random(self.run_number)
            r.shuffle(indices)

        # Determine the splits of the data
        split_indices = []
        for i in range(self.splits):
            split_start, split_end = self._split_bounds(i, data_size)
            split_indices.append(indices[split_start:split_end])
        return split_indices
Пример #47
0
class SimpleSourceTemplateNode(BaseNode):
    """ A simple template that illustrates the basic principles of a source node

    In `pySPACE`, source nodes are used at the beginning of the node chain.
    The source nodes are responsible for the input of data, be it from a
    static source or from a live stream.

    It is very important to note that these nodes just serve the purpose of
    providing the node chain with an input dataset and do not perform any
    changes on the data itself. That being said, these nodes **do not**
    have an **input node** and are **not trainable**!

    In the following we will discuss the general strategy for building a new
    source node for a static input data set which has been saved to disk.
    In the case of more complicated inputs, please consult the documentation of
    :mod:`~pySPACE.missions.nodes.source.external_generator_source.ExternalGeneratorSourceNode`
    and :mod:`~pySPACE.missions.nodes.source.time_series_source.Stream2TimeSeriesSourceNode`
    """
    def __init__(self, **kwargs):
        """ Initialize some values to 0 or `None`

        The initialization routine of the source node is basically completely
        empty. Should you feel the need to do something in this part of the
        code, you can initialize the ``input_dataset`` to ``None``. This
        attribute will then later be changed when the ``set_input_dataset``
        method is called.

        If the user wants to generate the dataset inside the SourceNode,
        this should be done in the ``__init__`` method though. A good example
        of this practice can be found in the
        :mod:`~pySPACE.missions.nodes.source.random_time_series_source.RandomTimeSeriesSourceNode`
        """
        super(SimpleSourceTemplateNode, self).__init__(**kwargs)

        self.set_permanent_attributes(dataset=None)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data

        This method is the beginning of the node. Put simply, this method
        starts the feeding process of your node chain by telling the node chain
        where to get the data from.
        """
        self.set_permanent_attributes(dataset=dataset)

    def _dataset_key(self, train_test):
        """ Return the (run, split, *train_test*) key for the input dataset

        If the input dataset consists only of one single run, we use this
        as input for all runs to be conducted (i.e. we rely on later
        randomization of the order). Otherwise we use the data for the
        current run number.
        """
        if self.dataset.meta_data["runs"] > 1:
            run = self.run_number
        else:
            run = 0
        return (run, self.current_split, train_test)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        This method streams training data and sends it to the subsequent nodes.
        If one looks at the tutorial related to building new nodes (available in
        the tutorial section), one can see exactly where the ``request_data``
        methods are put to use.

        If *use_test_data* is true, the test data is returned, since no
        additional data is dedicated for training in that case. Otherwise
        the training data stored in the dataset for the current run and
        split is returned; if the dataset holds no such data, the returned
        generator is empty.

        The following example is one that was extracted from the
        :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`

        which should (in theory at least) be implementable for all types of
        data.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        key = self._dataset_key("train")
        # Check if there is training data for the current split and run
        if key in self.dataset.data:
            self._log("Accessing input dataset's training feature vector windows.")
            train_data = iter(self.dataset.get_data(*key))
        else:
            # Use an iterator over an empty sequence (i.e. an iterator that
            # is immediately exhausted), since this node does not provide
            # any data that is explicitly dedicated for training
            self._log("No training data available.")
            train_data = iter([])
        self.data_for_training = MemoizeGenerator(train_data,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The principle of obtaining the testing data is the same as for the
        training data: the dataset is asked for the data stored under the
        current run and split. In contrast to the training data there is no
        check whether such data exists and no fallback; the data is read
        once and memoized for all later requests.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            key = self._dataset_key("test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def getMetadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node

        At some point in time, you might need to know the metadata of some
        specific input in your input and this is when you would use this method.
        """
        return self.dataset.meta_data.get(key)

    def use_next_split(self):
        """ Return False

        The method will always return `False` since the SourceNode
        should (in the case of more than 1 split) execute the splits in
        parallel and not in series.
        """
        return False
Пример #48
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes

        A *train_percentage_selected* above 100 is reduced to 100 (and
        logged as error). With 100 percent the request is simply handled by
        the parent class.

        Otherwise the node is trained, the training data of the input node
        is grouped by label and shuffled (seeded with the run number), and
        only the selected percentage of every class is kept. If
        *reduce_class* is set, only that class is reduced and all other
        classes are kept completely. The kept instances are shuffled again,
        executed and memoized, so every call returns a fresh generator over
        the same sequence.

        .. note::
              This method works differently in InstanceSelectionNode
              than in other nodes: Only *percentage_selected* of the available
              data are returned.
        """
        assert(self.input_node is not None)
        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100." %
                      self.train_percentage_selected,
                      level=logging.ERROR)
            self.train_percentage_selected = 100
        self._log("Data for training is requested.", level=logging.DEBUG)

        if self.train_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_training(
                use_test_data)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of training data" %
                      self.train_percentage_selected,
                      level=logging.DEBUG)
            r = random.Random(self.run_number)
            # Retain only *percentage_selected* percent of the data
            retained_instances = []

            for label, instances in all_instances.items():
                # enable random choice of samples
                r.shuffle(instances)
                if self.reduce_class and self.reduce_class != label:
                    # only *reduce_class* is reduced, this class is kept
                    end_index = len(instances)
                else:
                    # Divide by a float: with an integer percentage the
                    # division would truncate before rounding takes place.
                    end_index = int(round(
                        len(instances) * self.train_percentage_selected
                        / 100.0))

                retained_instances.extend(zip(instances[0:end_index],
                                              [label] * end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)
            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that will
            # yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
Пример #49
0
class CrossValidationSplitterNode(BaseNode):
    """ Perform (stratified) cross-validation
    
    During benchmarking, n pairs of training and test data are generated, where
    n is configurable via the parameter splits. The n test datasets are pairwise
    disjoint. Internally, the available data is partitioned into n pairwise 
    disjoint sets s_1, ..., s_n of equal size (the "splits"). The i-th pair of 
    training and test data is generated by using s_i as test data and the 
    union of the remaining datasets as training data.
    
    The partitioning is stratified per default, i.e. the splits have the same 
    class ratio as the overall dataset. Per default, the partitioning is based 
    on shuffling the data randomly. In this case, the partitioning of the data 
    into s_1, ..., s_n is determined solely based on the run number (used as 
    random seed), yielding the same split for the same run_number and different 
    ones for two different run_numbers.
    
    **Parameters**
    
      :splits:
            The number of splits created internally. If n data points exist and
            m splits are created, each of these splits consists of approx. n/m
            data points. 
            
            (*optional, default: 10*)
        
      :stratified:
         If true, the cross-validation is stratified, i.e. the overall 
         class-ratio is retained in each split (as good as possible). 
         
         (*optional, default: True*)
         
      :random:
         If true, the order of the data is randomly shuffled. 
         
         (*optional, default: True*)
         
      :time_dependent:
         If True splitting is done separately for different (= not 
         overlapping) time windows to ensure that instances corresponding to the
         same marker will be in the same split.
         
         .. note:: Stratification is only allowed here if there is only one 
                   class label for one marker.
         
         (*optional, default: False*)

      :stratified_class:
         
         If *time_dependent* is True and *stratified_class* is specified 
         stratification is only done for the specified class label (String).
         The other class is filling the split preserving the time order of the 
         data. This also means that *random* has no effect here.

         (*optional, default: None*)

    **Exemplary Call**
    
    .. code-block:: yaml
    
        -
            node : CV_Splitter
            parameters :
                  splits : 10
                  stratified : True
    
    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/12/16
    """
    
    def __init__(self, splits=10, stratified=True, random=True,
                 time_dependent=False, stratified_class=None, *args, **kwargs):
        super(CrossValidationSplitterNode, self).__init__(*args, **kwargs)
        
        self.set_permanent_attributes(
            splits=int(splits),  # how many splits
            current_split=0,  # index of the split currently used for testing
            split_indices=None,  # created lazily by _create_splits
            run_number=-1,  # used as random seed when creating the splits
            random=random,
            stratified=stratified,
            stratified_class=stratified_class,
            time_dependent=time_dependent)

    def is_split_node(self):
        """ Return whether this is a split node """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.
        
        Returns True if more splits are available, otherwise False.
        
        This method is useful for benchmarking
        """
        if self.current_split + 1 < self.splits:
            self.current_split = self.current_split + 1
            self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                         self.splits))
            return True
        else:
            return False
    
    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.
        
        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")
        
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        All samples that are not reserved for testing by the current
        cross-validation split are forwarded. The splits are created on the
        first request. *use_test_data* is not evaluated by this node.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()
            
        # Snapshot the test indices of the current split as a set: this gives
        # constant-time membership tests and binds the generator to the split
        # that is active now, even if it is consumed after use_next_split().
        test_indices = set(self.split_indices[self.current_split])
        
        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split
        self.data_for_training = MemoizeGenerator(
                sample for index, sample in enumerate(self.data)
                    if index not in test_indices)
        
        return self.data_for_training.fresh()
    
    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        Only the samples reserved for testing by the current cross-validation
        split are forwarded. The splits are created on the first request.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()
        
        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
                self.data[i] for i in self.split_indices[self.current_split])
        
        return self.data_for_testing.fresh()

    def _create_splits(self):
        """ Create the split of the data for n-fold  cross-validation

        Collects all data of the input node in memory (*self.data*) and stores
        in *self.split_indices* one list of test indices per split.

        Raises an Exception if the input node already delivers training data,
        i.e. if the data has been split before.
        """
        self._log("Creating %s splits for cross validation" % self.splits)
                  
        # Get training and test data (with labels)
        train_data = \
          list(self.input_node.request_data_for_training(use_test_data=False))
        test_data = list(self.input_node.request_data_for_testing())
        
        # If there is already a non-empty training set, 
        # it means that we are not the first split node in the node chain
        if len(train_data) > 0:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is "
                            "already split)")
        
        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data
        
        # Determine which data points are reserved for testing in which
        # cross validation run; time_dependent takes precedence
        if self.time_dependent:
            split_indices = self._create_time_dependent_splits()
        elif self.stratified:
            split_indices = self._create_stratified_splits()
        else:
            split_indices = self._create_unstratified_splits()

        self.split_indices = split_indices
        
        self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                     self.splits))

    def _split_bounds(self, split, data_size):
        """ Return the half-open interval [start, end) covered by *split*

        *data_size* elements are distributed as evenly as possible over
        *self.splits* consecutive intervals.
        """
        split_start = int(round(float(split) * data_size / self.splits))
        split_end = int(round(float(split + 1) * data_size / self.splits))
        return split_start, split_end

    def _group_windows_by_marker(self):
        """ Sort the data by start time and group overlapping windows

        Returns the pair (data_time, label_marker): *data_time* maps a marker
        number to the indices of the windows belonging to it, *label_marker*
        maps a class label to the marker numbers that carry this label (only
        filled while stratification is active).

        If windows of one marker have different labels, a warning is issued
        and stratification is switched off on the node.
        """
        import warnings

        # sort the data according to start_time
        self.data.sort(key=lambda swindow: swindow[0].start_time)
        # divide the data with respect to the time_point
        data_time = dict()
        label_marker = dict()
        last_window_end_time = 0.0
        marker = -1
        for (index, (window, label)) in enumerate(self.data):
            # The very first window always opens a marker group; otherwise a
            # first window starting at time 0 would find no group to join.
            if marker < 0 or window.start_time > last_window_end_time:
                marker += 1
                data_time[marker] = [index]
                if self.stratified or self.stratified_class:
                    label_marker.setdefault(label, []).append(marker)
            else:
                data_time[marker].append(index)
                # check label consistency for later stratification
                if (self.stratified or self.stratified_class) and \
                              self.data[data_time[marker][0]][1] != label:
                    warnings.warn(
                        "Since there are several class labels"
                        " for one marker stratification is set to False.",
                        UserWarning)
                    self.stratified = False
                    self.stratified_class = None
            last_window_end_time = window.end_time
        return data_time, label_marker

    def _create_time_dependent_splits(self):
        """ Create splits that keep all windows of one marker together """
        data_time, label_marker = self._group_windows_by_marker()

        # The grouping may have switched stratification off, so the flags
        # must be evaluated only now.
        if self.stratified:  # each marker has only one label
            split_indices = self._fill_splits_per_label(data_time,
                                                        label_marker)
        elif self.stratified_class:
            split_indices = self._fill_splits_for_class(data_time,
                                                        label_marker)
        else:
            # we should not have more splits then (marker)time points
            data_size = len(data_time)
            assert data_size >= self.splits, \
                "More splits requested than markers available"
        
            # Set random seed and randomize the order of the data
            markers = list(data_time)
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(markers)
            
            # determine the splits of the data    
            split_indices = []
            for i in range(self.splits):
                split_indices.append([])
                split_start, split_end = self._split_bounds(i, data_size)
                for j in range(split_start, split_end):
                    split_indices[i].extend(data_time[markers[j]])

        # avoid sorted labels by sorting time dependent
        return [sorted(split_list) for split_list in split_indices]

    def _fill_splits_per_label(self, data_time, label_marker):
        """ Distribute the markers of every label evenly over the splits """
        # not more splits then markers of every class!
        assert min([len(markers) for markers in
                    label_marker.values()]) >= self.splits, \
            "More splits requested than markers of one class available"
        split_indices = [[] for i in range(self.splits)]
        # determine the splits of the data    
        for label, markers in label_marker.items():
            data_size = len(markers)
            # Set random seed and randomize the order of the data
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(markers)
            for j in range(self.splits):
                split_start, split_end = self._split_bounds(j, data_size)
                for i in range(split_start, split_end):
                    split_indices[j].extend(data_time[markers[i]])
        return split_indices

    def _fill_splits_for_class(self, data_time, label_marker):
        """ Stratify only *stratified_class*, fill up in time order

        The markers of the stratified class are distributed evenly over the
        splits in time order. All other windows are assigned to the split
        whose stratified windows follow them in time; the windows after the
        last stratified window go to the last split.

        Raises a ValueError if fewer markers of the stratified class than
        splits exist, since at least one split would stay empty.
        """
        class_markers = label_marker[self.stratified_class]
        data_size = len(class_markers)
        if data_size < self.splits:
            raise ValueError(
                "Only %s markers of class %s available for %s splits"
                % (data_size, self.stratified_class, self.splits))

        split_indices = [[] for i in range(self.splits)]
        # determine the splits of the data
        for j in range(self.splits):
            split_start, split_end = self._split_bounds(j, data_size)
            for i in range(split_start, split_end):
                split_indices[j].extend(data_time[class_markers[i]])

        # fill up with other classes
        last_max_index = 0
        for split_list in split_indices:
            max_index = max(split_list)
            for i in range(last_max_index, max_index):
                if self.data[i][1] != self.stratified_class:
                    split_list.append(i)
            last_max_index = max_index + 1
        for i in range(last_max_index, len(self.data)):
            if self.data[i][1] != self.stratified_class:
                split_indices[-1].append(i)

        self._log("time_dependent && stratified_class:\n%s"
                  % [sorted(split_list) for split_list in split_indices],
                  level=logging.DEBUG)
        return split_indices

    def _create_stratified_splits(self):
        """ Create splits that retain the overall class ratio

        If one class has fewer instances than splits are requested, the
        number of splits (*self.splits*) is reduced accordingly.
        """
        # divide the data with respect to the class_label 
        data_labeled = dict()
        for (index, (window, label)) in enumerate(self.data):
            data_labeled.setdefault(label, []).append(index)
        
        # we should not have more splits then instances of every class!
        min_nr_per_class = min([len(data) for data in data_labeled.values()])
        if self.splits > min_nr_per_class:
            self.splits = min_nr_per_class
            self._log("Reducing number of splits to %s since no more "
                      "instances of one of the classes are available." 
                      % self.splits, level=logging.CRITICAL)

        split_indices = [[] for i in range(self.splits)]
        # determine the splits of the data    
        for label, indices in data_labeled.items():
            data_size = len(indices)
            # Set random seed and randomize the order of the data
            if self.random:
                r = random.Random(self.run_number)
                r.shuffle(indices)
            for j in range(self.splits):
                split_start, split_end = self._split_bounds(j, data_size)
                split_indices[j].extend(indices[split_start: split_end])
        # avoid sorted labels
        for j in range(self.splits):
            r = random.Random(self.run_number)
            r.shuffle(split_indices[j])
        return split_indices

    def _create_unstratified_splits(self):
        """ Create splits without regarding class labels """
        data_size = len(self.data)
        # We cannot have more splits than data points
        assert data_size >= self.splits, \
            "More splits requested than data points available"

        # Set random seed and randomize the order of the data
        indices = list(range(data_size))
        if self.random:
            r = random.Random(self.run_number)
            r.shuffle(indices)
            
        # Determine the splits of the data
        split_indices = []
        for i in range(self.splits):
            split_start, split_end = self._split_bounds(i, data_size)
            split_indices.append(indices[split_start: split_end]) 
        return split_indices
Пример #50
0
class ReduceOverrepresentedClassNode(BaseNode):
    """ Reject instances to balance categories for classification

    The node forwards only a reduced number 
    of the training and test instances of the bigger class
    to get a balanced ratio of the
    classes. The forwarded instances are selected randomly.
    All data of the underrepresented class is
    forwarded.
    
    **Parameters**

        The node has no parameters of its own. Keyword arguments are
        passed on to the base node.

    **Exemplary call**

    .. code-block:: yaml
    
        -
            node : Reduce_Overrepresented_Class
            
    :Author: Hendrik Woehrle ([email protected])
    :Created: 2010/09/22

    """
    def __init__(self, **kwargs):
        super(ReduceOverrepresentedClassNode, self).__init__(**kwargs)

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes
        
        The training data of the input node is grouped by label, balanced
        with :func:`balance_instances` and memoized, so that repeated
        requests yield the same sequence.
        """
        assert(self.input_node is not None)
        
        self._log("Data for training is requested.", level=logging.DEBUG)
        
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            
            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)
            
            retained_instances = self.balance_instances(all_instances)
            
            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that will
            # yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)
                    
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)
        
        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator  
        return self.data_for_training.fresh()
    
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        The test data of the input node is grouped by label, balanced
        with :func:`balance_instances` and memoized, so that repeated
        requests yield the same sequence.
        """
        assert(self.input_node is not None)
        
        self._log("Data for testing is requested.", level=logging.DEBUG)
        
        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert  that this node has already been trained
            assert(not self.is_trainable() or 
                   self.get_remaining_train_phase() == 0)
            
            # Divide available instances according to label
            all_instances = defaultdict(list)
            
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)
            
            retained_instances = self.balance_instances(all_instances)
            
            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that will
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
                    
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
    
    def _execute(self, time_series):
        return time_series # We don't do anything with the kept instances

    def balance_instances(self, all_instances):
        """ Reject instances such that every class has the same size

        *all_instances* maps a label to the list of its instances. From every
        class as many randomly chosen instances are retained as the smallest
        class contains. The random generator is seeded with *run_number*.

        Returns a shuffled list of (instance, label) tuples. The lists in
        *all_instances* are not modified.
        """
        retained_instances = []
        if not all_instances:
            return retained_instances

        # count the number of instances of the smallest class
        min_num_instances_per_class = min(
            len(instances) for instances in all_instances.values())

        r = random.Random(self.run_number)
        # retain only the number of instances that corresponds 
        # to the size of smaller class 
        for label, instances in all_instances.items():
            # shuffle a copy to leave the caller's lists untouched
            shuffled_instances = list(instances)
            r.shuffle(shuffled_instances)
            retained_instances.extend(
                (instance, label) for instance in
                shuffled_instances[0:min_num_instances_per_class])
        # mix up samples between the different labels
        r.shuffle(retained_instances)
        return retained_instances
Пример #51
0
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes

        Only *test_percentage_selected* percent of the test instances of the
        input node are forwarded. A percentage above 100 is reduced to 100;
        with 100 percent the request is delegated to the base class.

        If *reduce_class* is set, only the instances with this label are
        reduced and all instances of the other labels are retained.
        Otherwise every class is reduced by the same percentage. The
        selection is random, seeded with *run_number*, and memoized so that
        repeated requests yield the same sequence.
        """
        assert(self.input_node is not None)
        if self.test_percentage_selected > 100:
            self._log("Test percentage of %f reduced to 100." %
                      self.test_percentage_selected,
                      level=logging.ERROR)
            self.test_percentage_selected = 100
        self._log("Data for testing is requested.", level=logging.DEBUG)

        if self.test_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_testing()

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert  that this node has already been trained
            assert(not self.is_trainable() or 
                   self.get_remaining_train_phase() == 0)
            
            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)
                
            self._log("Keeping only %s percent of test data" %
                      self.test_percentage_selected,
                      level=logging.DEBUG)
            r = random.Random(self.run_number)
            
            # Retain only *percentage_selected* percent of the data
            retained_instances = []
            for label, instances in all_instances.items():
                # enable random choice of samples
                r.shuffle(instances)
                if self.reduce_class and self.reduce_class != label:
                    # only reduce_class is reduced, keep this class complete
                    end_index = len(instances)
                else:
                    # Divide by a float: with an integer percentage the
                    # division would truncate before rounding takes place.
                    end_index = int(round(len(instances) *
                                    self.test_percentage_selected / 100.0))

                retained_instances.extend(zip(instances[0:end_index],
                                              [label]*end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)
            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
                    
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
Пример #52
0
class TransferSplitterNode(BaseNode):
    """ Allow to split data into training and test data sets according to different window definitions
    
    Splits the available data into disjunct training and test sets. The transfer
    of different training and test window definitions is supported. The node 
    was implemented with several use cases in mind:
    
    - The training set contains instances of 'Standard' and 'Target' stimuli 
      but the test set of 'Target' and 'MissedTarget' stimuli.
      
    - The training set contains instances of 'LRP' with different training times
      and 'NoLRPs', but the test set should contain sliding windows. Cross
      validation should be supported to use the node together with parameter
      optimization node.
      
    - The use of merged data sets should be possible.
      
    **Parameters**
    
     :wdefs_train:
         A list with window definition names (specified in the window spec file
         when the raw data was segmented). All windows that belong to one of the
         window definition are considered when the training set(s) is(/are)
         determined.
         
     :wdefs_test:
         A list with window definition names (specified in the window spec file
         when the raw data was segmented). All windows that belong to one of the
         window definition are considered when the testing set(s) is(/are)
         determined.
         
     :split_method:
         One of the following Strings: 'all_data', 'time', 'count', 'set_flag'.
         
         - all_data    :
                             All possible data is used in every split. This 
                             results in splitting only window definitions that
                             occur in both, *wdefs_train* AND *wdefs_test*. 
                             Window definitions that only occur in either 
                             *wdefs_train* or *wdefs_test* are retained in every
                             split.

         - time        :
                             The data is sorted and split according to time.
                             For that (*start_time* of last window - 
                             *start_time* of first window)/*nr_of_splits*) is 
                             determined. Since time in eeg data is relative for 
                             every set, ensure that each input collection 
                             consists only of one data set (is not a merge of 
                             several sets) or that the change_option has been
                             used.

         - count      :
                             The data is split according to 
                             *num_split_instances*. By default only windows 
                             specified in both, *wdefs_train* and *wdefs_test*,
                             are counted. With the parameter *wdefs_split* the
                             window definitions that are counted can be
                             specified.
                             If *num_split_instances* is not specified, *splits*
                             determines how many instances of *wdefs_split* are
                             in one split.

         - set_flag    :
                             When the data has been merged with the concatenate
                             operation before, a flag 'new_set' has been inserted
                             to the time series specs. Splits are based on this
                             flag, i.e. the splits behave like a inter-set
                             cross validation. For example you merged 3 sets: 
                             'A', 'B', 'C', then there are 3 splits generated:
                             'A'+'B' vs 'C', 'A'+'C' vs 'B' and 'B'+'C' vs 'A'.

     :random:
         If True, the data is randomized before splitting.
         
         .. note:: It is not guaranteed that overlapping windows will be in the
                   same split for split methods 'time' and 'all_data'!
                  
         (*optional, default: False*)
                  
     :splits:
         The number of splits created internally and the number of train-test
         pairs.
         
         (*optional, default: 10*)
         
     :num_split_instances:
         If *split_method* is 'count', *num_split_instances* specifies how many
         instances will be in one split. After splitting one split is evaluated
         according to *wdefs_test* for the test data set and the remaining 
         splits according to *wdefs_train*. The test split is iterated. If
         the total number of instances that are counted is not divisible by
         *num_split_instances* the last split will contain the remaining
         instances.
         If in addition *splits* is set to 1, only one train-test pair is 
         created with *num_split_instances* in the training set.
         
         (*optional, default: None*)
         
     :wdefs_split:
         A list with window definition names (specified in the window spec file
         when the raw data was segmented). All windows that belong to one of the
         window definition are counted when *split_method* was set to 'count'.
         
         (*optional, default: None*)
         
     :reverse:
         If this option is True, the data is split in reverse ordering.
         
         (*optional, default: False*)
    
    **Exemplary Call**    
    
    .. code-block:: yaml
    
        -
            node : TransferSplitter
            parameters :
                wdefs_train : ['s2', 's1']
                wdefs_test : ['s5', 's2']
                split_method : "all_data"
                splits : 5
    
    :Author: Anett Seeland ([email protected])
    :Created: 2011/04/10
    :LastChange: 2011/11/14 (traintest functionality)
    """
    
    def __init__(self, wdefs_train, wdefs_test, split_method, wdefs_train_test = None,
                 splits=10, random=False, num_split_instances=None, wdefs_split=None,
                 reverse=False, sort=False, *args, **kwargs):
        """ Store the splitting configuration as permanent attributes

        If *wdefs_train_test* is not given, it defaults to the window
        definitions that occur in both *wdefs_train* and *wdefs_test*
        (in the order of *wdefs_train*). If *sort* is True, the data is
        sorted by the start time of the windows before it is split.

        The split indices are not computed here; they are initialized with
        None and created lazily on the first data request.
        """
        super(TransferSplitterNode, self).__init__(*args, **kwargs)

        if wdefs_train_test is None:
            # This has to be a plain list: a trailing comma after the
            # comprehension would wrap it into a 1-tuple and every later
            # ``wdef_name in self.wdefs_train_test`` test would be False.
            wdefs_train_test = [wdef for wdef in wdefs_train
                                if wdef in wdefs_test]

        self.set_permanent_attributes(wdefs_train = wdefs_train,
                                       wdefs_test = wdefs_test,
                                     split_method = split_method,
                                           splits = splits,
                                           random = random,
                              num_split_instances = num_split_instances,
                                      wdefs_split = wdefs_split,
                                          reverse = reverse,
                                             sort = sort,
                                    current_split = 0,
                                 wdefs_train_test = wdefs_train_test,
                              split_indices_train = None,
                               split_indices_test = None)

    def is_split_node(self):
        """ Tell the caller that this node splits the data (always True) """
        return True

    def use_next_split(self):
        """ Advance to the next train-test split, if there is one

        Returns True (after incrementing *current_split* and logging the new
        split number) when another split is available, otherwise False and
        the current split is left untouched.

        This method is useful for benchmarking
        """
        upcoming = self.current_split + 1
        if not upcoming < self.splits:
            # the current split already is the last one
            return False

        self.current_split = upcoming
        # splits are reported 1-based
        self._log("Benchmarking with split %s/%s" % (upcoming + 1,
                                                     self.splits))
        return True

    
    def train_sweep(self, use_test_data):
        """ Reject every training request by raising an Exception

        .. note:: Split nodes cannot be trained
        """
        message = "Split nodes cannot be trained"
        raise Exception(message)
    
    def request_data_for_training(self, use_test_data):
        """ Returns the training data of the current split

        The split is created lazily on the first request. A new memoizing
        generator over the training indices of *current_split* is built on
        every call. *use_test_data* is not evaluated by this method.
        """
        # Create split lazily when required
        if self.split_indices_train is None:
            self._create_split()

        # Create training data generator
        self.data_for_training = MemoizeGenerator(
             self.data[i] for i in self.split_indices_train[self.current_split])

        return self.data_for_training.fresh()
    
    def request_data_for_testing(self):
        """ Returns the test data of the current split

        The split is created lazily on the first request. A new memoizing
        generator over the test indices of *current_split* is built on
        every call.
        """
        # Create split lazily when required
        if self.split_indices_test is None:
            self._create_split()

        # Create test data generator
        self.data_for_testing = MemoizeGenerator(
              self.data[i] for i in self.split_indices_test[self.current_split])

        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data. """
        self._log("Splitting data into train and test data")
                  
        # Get training and test data
        # note: return the data in a list can double the memory requirements!
        train_data = list(self.input_node.request_data_for_training(
                                                         use_test_data = False))
        test_data = list(self.input_node.request_data_for_testing())
        
        # If there is already a  non-empty training set, 
        # it means that we are not the first split node in the node chain.
        if len(train_data) > 0:
            if len(test_data)==0:
                # If there was an All_Train_Splitter before, filter according
                # to wdef_train and return all training data
                self.split_indices_train = \
                            [[ind for ind, (win, lab) in enumerate(train_data) \
                                 if win.specs['wdef_name'] in self.wdefs_train]]
                self.split_indices_test = [[]]
                self.splits = 1
                self.data = train_data
                self._log("Using all data for training.")
                return
            else:    
                raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a data set that is already "
                            "splitted)")
        
        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        self.data = train_data + test_data
        del train_data, test_data
        if self.reverse:
            self.data = self.data[::-1]
        
        # sort the data according to the start time
        if self.sort or self.split_method == 'time':
            self.data.sort(key=lambda swindow: swindow[0].start_time)
        # randomize the data if needed
        if self.random:
            r = random.Random(self.run_number)
            if self.split_method == 'set_flag':
                self.random = False
                # TODO: log this
            elif self.split_method == 'count':
                if self.wdefs_split == None:
                    self.wdefs_split = self.wdefs_train_test
                # divide the data with respect to the time
                data_time = dict()
                marker = -1
                last_window_endtime = 0
                for ind, (win, lab) in enumerate(self.data):
                    if win.start_time < last_window_endtime:
                        # overlapping windows or start of a new set
                        if win.end_time < last_window_endtime:
                            # new set
                            marker += 1
                            data_time[marker]=[(win,lab)]
                        else:
                            # overlapping windows
                            data_time[marker].append((win,lab))
                    else:
                        marker += 1
                        data_time[marker]=[(win,lab)]
                    last_window_endtime = win.end_time
                # randomize order of events by simultaneously keep the order of
                # sliding windows in each event
                data_random = data_time.values()
                r.shuffle(data_random)
                self.data = []
                for l in data_random: self.data.extend(l)
                del data_random, data_time, l
            else:
                r.shuffle(self.data)
            
        if self.split_method == 'all_data':
            # divide the data with respect to *wdef_train*, *wdef_test* and
            # *wdef_train_test*
            wdef_data = {'wdef_train_test':[],'wdef_train':[],'wdef_test':[]}
            class_labels = []
            for (index, (window, label)) in enumerate(self.data):
                if window.specs['wdef_name'] in self.wdefs_train_test:
                    wdef_data['wdef_train_test'].append(index)
                    if label not in class_labels:
                        class_labels.append(label)
                elif window.specs['wdef_name'] in self.wdefs_train:
                    wdef_data['wdef_train'].append(index)
                elif window.specs['wdef_name'] in self.wdefs_test:
                    wdef_data['wdef_test'].append(index)
                else:
                    import warnings
                    warnings.warn("Found window definition %s, which is " \
                                  "neither in *wdefs_train* nor in " \
                                  "*wdefs_test*. Window %s will be ignored!" \
                                  % (window.specs['wdef_name'],window.tag))
            # check if splitting makes sense
            if wdef_data['wdef_train_test']==[] and self.splits>1:
                raise Exception('No instances to split, i.e train-test window'\
                                ' definitions are disjunct!')
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            # calculate splits
            if wdef_data['wdef_train_test']!=[]:
                data_size = len(wdef_data['wdef_train_test'])

                # ensure stratified splits if there are several classes 
                if len(class_labels)>1:
                    # divide the data with respect to the class_label 
                    data_labeled = dict()
                    for index in wdef_data['wdef_train_test']:
                        if not data_labeled.has_key(self.data[index][1]):
                            data_labeled[self.data[index][1]] = [index]
                        else:
                            data_labeled[self.data[index][1]].append(index)   
                    
                    # have not more splits than instances of every class!
                    min_nr_per_class = min([len(data) for data in \
                                                         data_labeled.values()])
                    if self.splits > min_nr_per_class:
                        self.splits = min_nr_per_class
                        self._log("Reducing number of splits to %s since no " \
                                  "more instances of one of the classes are " \
                                  "available." % self.splits)

                    # determine the splits of the data    
                    for label, indices in data_labeled.iteritems():
                        data_size = len(indices)
                        for j in range(self.splits):
                            split_start = \
                                      int(round(float(j)*data_size/self.splits))
                            split_end = \
                                    int(round(float(j+1)*data_size/self.splits))
                            split_indices_test[j].extend([i for i in indices[split_start: split_end]\
                                                if self.data[i][0].specs['wdef_name'] in self.wdefs_test])
                            split_indices_train[j].extend([i for i in indices \
                                             if i not in split_indices_test[j]])
                else: # len(class_labels) == 1
                    # have not more splits than instances!
                    if self.splits > data_size:
                        self.splits = data_size
                        self._log("Reducing number of splits to %s since no " \
                                  "more instances of one of the classes are " \
                                  "available." % self.splits)

                    # determine the splits of the data    
                    for j in range(self.splits):
                        split_start = \
                                  int(round(float(j)*data_size/self.splits))
                        split_end = \
                                int(round(float(j+1)*data_size/self.splits))
                        # means half-open interval [split_start, split_end)
                        split_indices_test[j].extend(
                            wdef_data['wdef_train_test'][split_start:split_end])
                        split_indices_train[j].extend([i for i in \
                                             wdef_data['wdef_train_test'] if i \
                                                  not in split_indices_test[j]])
            for i in range(self.splits):
                split_indices_train[i].extend(wdef_data['wdef_train'])
                split_indices_test[i].extend(wdef_data['wdef_test'])
                    
        elif self.split_method == 'time': 
            first_window_start = self.data[0][0].start_time
            last_window_start = self.data[-1][0].start_time
            # ensure, that time can never be greater than self.splits*time!
            time = round((last_window_start-first_window_start)/self.splits+0.5)
            # divide the data according to the time
            data_time = {0: []}
            time_fold = 0
            for (index, (window, label)) in enumerate(self.data):
                if window.start_time > time_fold*time+time:
                    time_fold += 1
                    data_time[time_fold]=[index]
                else:
                    data_time[time_fold].append(index)
                    
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            for i in range(self.splits):
                split_indices_test[i].extend([index for index in data_time[i] \
                                    if self.data[index][0].specs['wdef_name'] \
                                                            in self.wdefs_test])
                for j in range(self.splits):
                    split_indices_train[i].extend([index for index in data_time[j] \
                            if j != i and self.data[index][0].specs['wdef_name'] \
                                                           in self.wdefs_train])
        elif self.split_method == 'count':
            if self.wdefs_split == None:
                self.wdefs_split = self.wdefs_train_test
            if self.num_split_instances == None:
                l = len([ind for ind, (win, lab) \
                             in enumerate(self.data) if win.specs['wdef_name'] \
                             in self.wdefs_split])
                self.num_split_instances = round(float(l)/self.splits)
            # divide the data according to *num_split_instances*
            data_count = {0:[]}
            count = -1
            count_fold = 0
            if self.splits==1 and len([i for i in range(len(self.data)) \
                    if self.data[i][0].specs['wdef_name'] in self.wdefs_split])\
                        == self.num_split_instances:
                train_end = len(self.data)
            else:
                for (ind, (win, lab)) in enumerate(self.data):
                    #print ind, win.specs['wdef_name'], lab
                    if win.specs['wdef_name'] in self.wdefs_split:
                        count += 1
                        if self.splits == 1 and \
                                              count == self.num_split_instances:
                            train_end = ind
                            break
                        if count != 0 and count % self.num_split_instances == 0:
                            count_fold += 1
                            data_count[count_fold] = [ind]
                        else:
                            data_count[count_fold].append(ind)
                    else:
                        data_count[count_fold].append(ind)
                 
            if self.splits != 1:
                # self.num_split_instances*self.splits < l, but in the case 
                # when only num_split_instances is specified we can not trust 
                # self.splits
                if len(data_count.keys()) == self.splits+1 or \
                        (len(data_count.keys())-1)*self.num_split_instances > l:
                    data_count[count_fold-1].extend(data_count[count_fold])
                    del data_count[count_fold] 
                
                self.splits = len(data_count.keys())
                
                split_indices_train = [[] for i in range(self.splits)]
                split_indices_test = [[] for i in range(self.splits)]
            
                for i in range(self.splits):
                    split_indices_test[i].extend([ind for ind in data_count[i] \
                                       if self.data[ind][0].specs['wdef_name'] \
                                                            in self.wdefs_test])
                    for j in range(self.splits):
                        split_indices_train[i].extend([ind for ind in data_count[j]\
                            if j != i and self.data[ind][0].specs['wdef_name'] \
                                                           in self.wdefs_train])
            else: # self.splits == 1
                split_indices_train = \
                    [[ind for ind in range(len(self.data[:train_end])) if \
                      self.data[ind][0].specs['wdef_name'] in self.wdefs_train]]
                split_indices_test = \
                    [[ind for ind in range(train_end,len(self.data)) if \
                       self.data[ind][0].specs['wdef_name'] in self.wdefs_test]]   
 
                    
        elif self.split_method == 'set_flag':
            # divide the data according to *new_set* flag in time series specs
            data_set = {0:[]}
            key_fold = 0
            for (ind, (win, lab)) in enumerate(self.data):
                if win.specs['new_set']:
                    key_fold += 1
                    data_set[key_fold]=[ind]
                else:
                    data_set[key_fold].append(ind)
                    
            self.splits = len(data_set.keys())
            
            split_indices_train = [[] for i in range(self.splits)]
            split_indices_test = [[] for i in range(self.splits)]
            for i in range(self.splits):
                split_indices_test[i].extend([ind for ind in data_set[i] \
                                    if self.data[ind][0].specs['wdef_name'] \
                                                            in self.wdefs_test])
                for j in range(self.splits):
                    split_indices_train[i].extend([ind for ind in data_set[j] \
                            if j != i and self.data[ind][0].specs['wdef_name'] \
                                          in self.wdefs_train])
            
        self.split_indices_train = split_indices_train
        self.split_indices_test = split_indices_test
        
        self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                     self.splits))
Пример #53
0
class PrintDataNode(BaseNode):
    """Print out formatted data.

    This prints out the data to support debugging.

    **Parameters**

        :print_delimiters:
            Separate prints with delimiters for readability

            (*optional, default: True*)

        :print_markers:
            Print the markers.

            (*optional, default: True*)

        :print_shape:
            Print the shape of the data.

            (*optional, default: False*)

        :print_samples:
            Print the data.

            (*optional, default: True*)

        :print_hex:
            Print the data in flattened hex format.

            (*optional, default: False*)

        :print_normal:
            Print the data "normally".

            (*optional, default: True*)

        :numpy_printoptions:
            Specify numpy printoptions. Use none, if it does not apply.

            (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : PrintData
            parameters :
                numpy_printoptions :
                    precision : 12
                    threshold : 100


    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/04/20
    """
    def __init__(self,
                 print_delimiters=True,
                 print_markers=True,
                 print_hex=False,
                 print_normal=True,
                 numpy_printoptions=None,
                 print_samples=True,
                 print_shape=False,
                 **kwargs):
        """ Store the print configuration as permanent attributes

        All remaining keyword arguments are forwarded to the base class
        constructor. The running number of the printed item (*item*) starts
        at 0.
        """
        # forward as keyword arguments; a single star would pass the
        # dictionary keys as positional arguments
        super(PrintDataNode, self).__init__(**kwargs)

        self.set_permanent_attributes(item=0,
                                      print_delimiters=print_delimiters,
                                      print_markers=print_markers,
                                      print_hex=print_hex,
                                      print_normal=print_normal,
                                      numpy_printoptions=numpy_printoptions,
                                      print_samples=print_samples,
                                      print_shape=print_shape)

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this
        node, i.e. every (data, label) pair of the input node is printed by
        :func:`print_data` at the moment it is consumed.
        """
        assert (self.input_node is not None), "No input node specified!"
        # Assert  that this node has already been trained
        assert (not self.is_trainable()
                or self.get_remaining_train_phase() == 0), "Node not trained!"
        self._log("Processing data.", level=logging.DEBUG)
        # Lazy generator expression instead of itertools.imap with a
        # tuple-unpacking lambda: same laziness, portable syntax
        data_generator = (self.print_data(data, label)
                          for (data, label) in self.input_node.process())
        return data_generator

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes of the node chain

        A call to this method might involve training of the node chain up this
        node. If use_test_data is true, all available data is used for
        training, otherwise only the data that is explicitly for training.

        The training data of the input node is passed through
        :func:`print_data` and memoized, so that repeated requests get a
        fresh generator over the same sequence.
        """
        assert (self.input_node is not None)

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            # This line crashes without the NodeMetaclass bug fix
            train_data_generator = (
                self.print_data(data, label) for (data, label) in
                self.input_node.request_data_for_training(use_test_data))
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        A call to this node might involve evaluating the whole node chain
        up to this node.

        The test data of the input node is passed through :func:`print_data`
        and memoized, so that repeated requests get a fresh generator over
        the same sequence.
        """
        assert (self.input_node is not None)

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert  that this node has already been trained
            assert (not self.is_trainable()
                    or self.get_remaining_train_phase() == 0)
            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = (
                self.print_data(data, label) for (data, label) in
                self.input_node.request_data_for_testing())
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def print_data(self, data, label):
        """ Print the data according to the specified constraints

        Prints the type of *data* (with its markers, if available and
        requested), a header line with the running item number for feature
        vectors and time series and, depending on the configuration, the
        shape, the normal and the hexadecimal representation of *data*.
        If *numpy_printoptions* is set, these options are only active while
        this method prints and the previous options are restored afterwards,
        even if printing fails.

        Returns the unchanged tuple (*data*, *label*), so that the method can
        be used as a pass-through in a data stream.
        """
        if self.print_delimiters:
            print(50 * "*")

        if hasattr(data, "marker_name") and data.marker_name is not None \
                and self.print_markers:
            print("%s: markers: %s" % (str(type(data)), str(data.marker_name)))
        else:
            print("%s" % (str(type(data))))

        # isinstance also covers subclasses of the two data types
        if isinstance(data, FeatureVector):
            print("%04d: %s %s" % (self.item, data.tag, label))
        elif isinstance(data, TimeSeries):
            print("%04d: %s %s %s" % (self.item, data.name, data.marker_name,
                                      label))

        # backup printoptions
        default_printoptions = None
        if self.numpy_printoptions:
            default_printoptions = numpy.get_printoptions()
            numpy.set_printoptions(**self.numpy_printoptions)

        try:
            if self.print_shape:
                print("shape: %s" % (data.shape,))

            if self.print_normal:
                if self.print_delimiters:
                    print(25 * "-")
                print(data)

            if self.print_hex:
                if self.print_delimiters:
                    print(25 * "-")
                print([hex(value) for value in data.flatten()])

            if self.print_delimiters:
                print(50 * "*")
        finally:
            # set back default printoptions; get_printoptions returns a
            # dictionary, so it has to be passed as keyword arguments
            if default_printoptions is not None:
                numpy.set_printoptions(**default_printoptions)

        self.item += 1

        return (data, label)
Пример #54
0
class FeatureVectorSourceNode(BaseNode):
    """ Source for samples of type :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`

    This node reads :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
    elements
    accumulated in a :mod:`~pySPACE.resources.dataset_defs.feature_vector` and
    passes them into the :mod:`~pySPACE.environments.chains.node_chain`.
    As described in :mod:`~pySPACE.resources.dataset_defs.feature_vector` it is
    important that the storage format is correctly specified in the
    metadata.yaml.
    If the dataset has been constructed by pySPACE, this is done automatically.

    **Parameters**

    **Exemplary Call**
    
    .. code-block:: yaml

        - 
            node : Feature_Vector_Source

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/25
    """
    input_types = ["FeatureVector"]

    def __init__(self, **kwargs):
        super(FeatureVectorSourceNode, self).__init__(**kwargs)

    def set_input_dataset(self, dataset):
        """ Store *dataset* as the dataset this node reads its data from """
        self.set_permanent_attributes(dataset=dataset)

    def register_input_node(self, node):
        """ Reject *node*: a source node has no input node

        Always raises an exception.
        """
        raise Exception(
            "No nodes can be registered as inputs for source nodes")

    def use_next_split(self):
        """ Switch to the next split of the data into training and test data

        Returns True if more splits are available, otherwise False.
        This implementation always returns False.

        This method is useful for benchmarking
        """
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node

        .. note:: Source nodes cannot be trained, an exception is raised.
        """
        raise Exception("Source nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        If *use_test_data* is true, the test data is returned, since there is
        no additional data dedicated for training in that case. Otherwise
        the training data of the current run and split is looked up in the
        dataset; if the dataset holds no such entry, an immediately
        exhausted generator is returned.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # A dataset with a single run serves as input for all runs to be
        # conducted (relying on later randomization of the order); otherwise
        # the data of the current run number is used.
        run_index = \
            self.run_number if self.dataset.meta_data["runs"] > 1 else 0
        key = (run_index, self.current_split, "train")

        if key in self.dataset.data.keys():
            self._log(
                "Accessing input dataset's training feature vector windows."
            )
            samples = iter(self.dataset.get_data(*key))
        else:
            # No data is explicitly dedicated for training: hand out a
            # generator over an empty sequence
            self._log("No training data available.")
            samples = (entry for entry in ())

        self.data_for_training = MemoizeGenerator(samples,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The test data of the current run and split is read from the dataset
        on the first call and wrapped into a MemoizeGenerator, which is
        reused by later calls.
        """
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # Same run selection as for the training data: a single-run
            # dataset is used for every run
            run_index = \
                self.run_number if self.dataset.meta_data["runs"] > 1 else 0
            key = (run_index, self.current_split, "test")

            self.data_for_testing = MemoizeGenerator(
                iter(self.dataset.get_data(*key)), caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def get_metadata(self, key):
        """ Return the entry stored under *key* in the meta data of the dataset of this source node. """
        meta_data = self.dataset.meta_data
        return meta_data.get(key)
Пример #55
0
class TrainTestSplitterNode(BaseNode):
    """ Split data into one training and one test data set with a fixed ratio
    
    The relative
    size of the two sets is controlled via the parameter train_ratio.

    .. warning:: the class ratio is not retained

    .. todo::
        introduce stratified parameter as in CV_Splitter
    
    **Parameters**
    
     :train_ratio:
         The ratio of the overall available data that is assigned to the 
         training set. The remaining data (1-train_ratio) is used for testing.
         
         (*optional, default: 0.5*)
         
     :num_train_instances:
         Instead of specifying a train_ratio, this option allows to specify the
         absolute number of training instances of class *class_label* that 
         should be in the training set. All instances that occur until 
         *num_train_instances* are found are used for training. The remaining
         data are used for testing.
         
         (*optional, default: None*)
    
     :class_label:
         If the *num_train_instances* option is used, this string determines
         the class of which training examples are counted.

         (*optional, default: 'Target'*)
     
     :random:
         If *False*, the order of the data is retained. I.e. the train_ratio
         instances are used for training and the remaining as test data. If 
         *True*, the two sets are sampled randomly from the data without
         taking into consideration the data's order.
         
         (*optional, default: True*)

     :reverse:
         If *True*, the order of the data is reversed before the training
         instances are counted. This is only evaluated when
         *num_train_instances* is given and must not be combined with
         *random*.

         (*optional, default: False*)
    
    **Exemplary Call**
    
    .. code-block:: yaml
    
        -
            node : TrainTestSplitter
            parameters :
                  train_ratio : 0.7
                  random : False
    
    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2010/03/08 (Documentation, old node)
    :LastChange: 2011/11/14 (Documentation) Anett Seeland
    """

    def __init__(self, train_ratio=0.5, random=True,
                 num_train_instances=None, class_label='Target', reverse=False,
                 **kwargs):
        super(TrainTestSplitterNode, self).__init__(**kwargs)
        assert not random or not reverse, \
            "Reverse ordering makes no sense when randomization is active!"
        # The split itself (train_data / test_data) is created lazily on
        # the first data request
        self.set_permanent_attributes(train_ratio=train_ratio,
                                      random=random,
                                      num_train_instances=num_train_instances,
                                      class_label=class_label,
                                      reverse=reverse,
                                      train_data=None,
                                      test_data=None)

    def is_split_node(self):
        """ Returns whether this is a split node (always True). """
        return True

    def use_next_split(self):
        """ Use the next split of the data into training and test data.

        Returns True if more splits are available, otherwise False.
        This implementation always returns False.

        This method is useful for benchmarking
        """
        return False

    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node.

        .. note:: Split nodes cannot be trained, an exception is raised.
        """
        raise Exception("Split nodes cannot be trained")

    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The split is created on demand. The parameter *use_test_data* is
        not evaluated by this node.
        """
        if self.train_data is None:
            self._create_split()

        # Wrap the training part of the split into a new generator
        self.data_for_training = MemoizeGenerator(
            sample for sample in self.train_data)

        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The split is created on demand.
        """
        if self.test_data is None:
            self._create_split()

        # Wrap the test part of the split into a new generator
        self.data_for_testing = MemoizeGenerator(
            sample for sample in self.test_data)

        return self.data_for_testing.fresh()

    def _create_split(self):
        """ Create the split of the data into training and test data.

        The result is stored in the attributes *train_data* and *test_data*.
        An exception is raised if the input node already provides explicit
        training data.

        If *num_train_instances* is set and *random* is switched off, the
        first *num_train_instances* samples of the input are taken for
        training, independent of their label, and the remaining input
        iterator is kept as test data. In all other cases the complete data
        is loaded into memory and divided either by counting samples of
        class *class_label* or according to *train_ratio*.
        """
        self._log("Splitting data into train and test data")
        explicit_train = list(
            self.input_node.request_data_for_training(use_test_data=False))

        # A non-empty training set means that this is not the first split
        # node in the node chain.
        if explicit_train:
            raise Exception("No iterated splitting of data sets allowed\n "
                            "(Calling a splitter on a  data set that is already "
                            "split)")

        # Consume only the required samples instead of loading all data
        if self.num_train_instances and not self.random:
            self.train_data = []
            remaining = self.input_node.request_data_for_testing()
            for _ in range(self.num_train_instances):
                self.train_data.append(next(remaining))
            self.test_data = remaining
            return

        # Remember all the data and store it in memory
        # TODO: This might cause problems for large dataset
        samples = explicit_train + \
            list(self.input_node.request_data_for_testing())
        total = len(samples)

        # Randomize the order, seeded with the run number
        if self.random:
            random.Random(self.run_number).shuffle(samples)

        if self.num_train_instances is None:
            # Split data into train and test data according train_ratio
            train_end = int(round(total * self.train_ratio))
        else:
            if self.reverse:
                samples = list(reversed(samples))
            wanted = self.num_train_instances
            available = sum(1 for entry in samples
                            if entry[1] == self.class_label)
            if available == wanted:
                # Exactly the requested number of instances exists:
                # everything is used for training
                train_end = total
            else:
                found = 0
                for position, (_, label) in enumerate(samples):
                    if label == self.class_label:
                        found += 1
                    if found == wanted:
                        train_end = position + 1
                        break
                assert wanted == found, "Too many instances to select."

        self.train_data = samples[:train_end]
        self.test_data = samples[train_end:]
Пример #56
0
class PrintDataNode(BaseNode):
    """Print out formatted data.

    This prints out the data to support debugging. The data itself is passed
    on unchanged.

    **Parameters**

        :print_delimiters:
            Separate prints with delimiters for readability

            (*optional, default: True*)

        :print_markers:
            Print the markers.

            (*optional, default: True*)

        :print_shape:
            Print the shape of the data.

            (*optional, default: False*)

        :print_samples:
            Print the data.

            .. note:: This option is stored, but it is not evaluated by any
                      method of this class.

            (*optional, default: True*)

        :print_hex:
            Print the data in flattened hex format.

            (*optional, default: False*)

        :print_normal:
            Print the data "normally".

            (*optional, default: True*)

        :numpy_printoptions:
            Specify numpy printoptions. Use none, if it does not apply.
            The options are only active while a sample is printed.

            (*optional, default: None*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : PrintData
            parameters :
                numpy_printoptions :
                    precision : 12
                    threshold : 100


    :Authors: Hendrik Woehrle ([email protected])
    :Created: 2012/04/20
    """
    def __init__(self,
                 print_delimiters = True,
                 print_markers = True,
                 print_hex = False,
                 print_normal = True,
                 numpy_printoptions = None,
                 print_samples = True,
                 print_shape = False,
                 **kwargs):

        # The remaining options have to be forwarded as keyword arguments;
        # unpacking with a single star would pass the option names as
        # positional arguments instead.
        super(PrintDataNode, self).__init__(**kwargs)

        self.set_permanent_attributes(item = 0,
                                      print_delimiters = print_delimiters,
                                      print_markers = print_markers,
                                      print_hex = print_hex,
                                      print_normal = print_normal,
                                      numpy_printoptions = numpy_printoptions,
                                      print_samples = print_samples,
                                      print_shape = print_shape
                                      )

    def process(self):
        """ Processes all data that is provided by the input node

        Returns a generator that yields the data after being processed by this
        node. Each sample is printed at the moment it is drawn from the
        generator.
        """
        assert(self.input_node is not None), "No input node specified!"
        # Assert  that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0), "Node not trained!"
        self._log("Processing data.", level=logging.DEBUG)
        data_generator = (self.print_data(data, label)
                          for (data, label) in self.input_node.process())
        return data_generator

    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes of the node chain

        A call to this method might involve training of the node chain up this
        node. If use_test_data is true, all available data is used for
        training, otherwise only the data that is explicitly for training.
        """
        assert(self.input_node is not None)

        self._log("Data for training is requested.", level = logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level = logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Compute a generator the yields the train data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            train_data_generator = (
                self.print_data(data, label)
                for (data, label)
                in self.input_node.request_data_for_training(use_test_data))
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level = logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        A call to this node might involve evaluating the whole node chain
        up to this node.
        """
        assert(self.input_node is not None)

        self._log("Data for testing is requested.", level = logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert  that this node has already been trained
            assert(not self.is_trainable() or
                   self.get_remaining_train_phase() == 0)
            # Compute a generator the yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that'll
            # yield the same sequence
            self._log("Producing data for testing.", level = logging.DEBUG)
            test_data_generator = (
                self.print_data(data, label)
                for (data, label)
                in self.input_node.request_data_for_testing())
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level = logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def print_data(self, data, label):
        """ Print one sample according to the configured options

        Depending on the flags of the node, the following is written to
        stdout: delimiter lines, the type of *data* (together with its
        ``marker_name`` if it has one and ``print_markers`` is set), a
        numbered summary line for feature vectors and time series, the
        shape, the data itself and a flattened hex dump.

        If ``numpy_printoptions`` is set, these options are active only
        while the sample is printed; the previous numpy print options are
        restored afterwards, even if printing raises.

        The sample counter ``item`` is incremented once per call.

        :param data: the sample to print
        :param label: the label belonging to *data*
        :returns: the unchanged tuple ``(data, label)``
        """
        if self.print_delimiters:
            print(50 * "*")

        type_name = str(type(data))
        marker_name = getattr(data, "marker_name", None)
        if marker_name is not None and self.print_markers:
            print("%s: markers: %s" % (type_name, str(marker_name)))
        else:
            print("%s" % type_name)

        # The type check has to ask whether *data* is an instance of the
        # respective class (and not whether the class derives from the type
        # of *data*), since the attributes accessed below only exist on
        # FeatureVector and TimeSeries objects.
        if isinstance(data, FeatureVector):
            print("%04d: %s %s" % (self.item, data.tag, label))
        elif isinstance(data, TimeSeries):
            print("%04d: %s %s %s" % (self.item, data.name,
                                      data.marker_name, label))

        # backup printoptions
        saved_printoptions = None
        if self.numpy_printoptions:
            saved_printoptions = numpy.get_printoptions()
            numpy.set_printoptions(**self.numpy_printoptions)

        try:
            if self.print_shape:
                print("shape: %s" % (data.shape,))

            if self.print_normal:
                if self.print_delimiters:
                    print(25 * "-")
                print(data)

            if self.print_hex:
                if self.print_delimiters:
                    print(25 * "-")
                print([hex(value) for value in data.flatten()])

            if self.print_delimiters:
                print(50 * "*")
        finally:
            # set back default printoptions; get_printoptions returns a
            # dictionary, so it has to be passed as keyword arguments
            if saved_printoptions is not None:
                numpy.set_printoptions(**saved_printoptions)

        self.item += 1

        return (data, label)