Example #1
from skmultiflow.data import SEAGenerator, RandomTreeGenerator
from skmultiflow.trees import HoeffdingTree


def test_hoeffding_tree_coverage():
    # Cover the memory-management path: a tiny max_byte_size forces the tree
    # to estimate its memory use and deactivate leaves.
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = HoeffdingTree(max_byte_size=30,
                            memory_estimate_period=100,
                            grace_period=10,
                            leaf_prediction='mc')

    learner.partial_fit(X, y, classes=stream.target_values)

    learner.reset()

    # Cover the nominal attribute observer: the stream has only categorical
    # features (one-hot encoded into 10 binary columns).
    stream = RandomTreeGenerator(tree_random_state=1,
                                 sample_random_state=1,
                                 n_num_features=0,
                                 n_categories_per_cat_feature=2)
    stream.prepare_for_use()
    X, y = stream.next_sample(1000)
    learner = HoeffdingTree(leaf_prediction='mc',
                            nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)
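
A minimal usage sketch building on the same API (assuming an older
scikit-multiflow release where HoeffdingTree and prepare_for_use() are still
available; both were later renamed or removed): train on one chunk of the SEA
stream, then measure simple holdout accuracy on the next chunk.

import numpy as np
from skmultiflow.data import SEAGenerator
from skmultiflow.trees import HoeffdingTree

stream = SEAGenerator(random_state=1, noise_percentage=0.05)
stream.prepare_for_use()

# 'nba' = naive Bayes adaptive leaves, usually stronger than majority class.
learner = HoeffdingTree(leaf_prediction='nba')
X_train, y_train = stream.next_sample(2000)
learner.partial_fit(X_train, y_train, classes=stream.target_values)

X_test, y_test = stream.next_sample(500)
accuracy = np.mean(learner.predict(X_test) == y_test)
print("Holdout accuracy: {:.3f}".format(accuracy))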
Example #2
    # Method of a chunk-based ensemble classifier; requires numpy (np) and
    # HoeffdingTree to be imported at module level.
    def _process_chunk(self):
        """ A subroutine that runs at the end of each chunk, allowing
        the components to be trained and ensemble weights to be adjusted.
        Until the first _process_chunk call, the ensemble is not yet ready.
        At first call, the first component is learned.
        At the rest of the calls, new components are formed, and the older ones
        are trained by the given chunk.
        If the ensemble size is reached, then the lowest weighted component is
        removed from the ensemble.
        """
        new_clf = HoeffdingTree()  # with default parameters for now
        new_clf.reset()

        # Save records of previous chunk
        if self._record and self._num_of_current_classifiers > 0:
            self._record_truths_this_chunk()
            self._record_comp_preds_this_chunk()
            self._record_weights_this_chunk()

        # Case 1: No classifier in the ensemble yet, first chunk:
        if self._num_of_current_classifiers == 0:
            self._classifiers[0] = new_clf
            self._weights[0] = 1.0  # weight is 1 for the first clf
            self._num_of_current_classifiers += 1
        else:
            # First, adjust the weights of the old component classifiers
            # according to what happened in this chunk.
            self._adjust_weights()
            # Case 2: There are classifiers in the ensemble but
            # the ensemble size is still not capped.
            if (self._num_of_current_classifiers <
                    self._num_of_max_classifiers):
                # Add the new classifier to the ensemble with a weight of 1.
                self._classifiers[self._num_of_current_classifiers] = new_clf
                self._weights[self._num_of_current_classifiers] = 1.0
                self._num_of_current_classifiers += 1

            # Case 3: Ensemble size is capped; replace the component
            # with the lowest weight.
            else:
                assert (self._num_of_current_classifiers ==
                        self._num_of_max_classifiers), "Ensemble not full."
                index_of_lowest_weight = np.argmin(self._weights)
                self._classifiers[index_of_lowest_weight] = new_clf
                self._weights[index_of_lowest_weight] = 1.0

            # Normalize the weights (softmax) to keep the numbers manageable.
            self._normalize_weights_softmax()
            if self._Logging:
                print("After normalization weights: ")
                print(self._weights)
        # Ensemble maintenance is done. Now train all classifiers
        # in the ensemble from the current chunk.
        # Can be parallelized.
        data_features = self._chunk_data.get_attributes_matrix()
        data_truths = self._chunk_data.get_targets_matrix()
        data_truths = data_truths.astype(int).flatten()

        if self._Logging:
            print("Starting training the components with the current chunk...")
        for k in range(self._num_of_current_classifiers):
            if self._Logging:
                print("Training classifier {}".format(k))
            self._classifiers[k].partial_fit(data_features,
                                             data_truths,
                                             classes=self._target_values)
        if self._Logging:
            print("Training the components with the current chunk completed...")
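
For reference, the maintenance policy the docstring describes (grow the
ensemble until it is full, then evict the lowest-weight member, then
softmax-normalize) can be sketched standalone. The helper names below are
hypothetical, not part of the original class:

import numpy as np

def add_component(classifiers, weights, new_clf, max_size):
    # Grow until the ensemble is full; afterwards replace the weakest member.
    if len(classifiers) < max_size:
        classifiers.append(new_clf)
        weights.append(1.0)  # new components always enter with weight 1
    else:
        lowest = int(np.argmin(weights))
        classifiers[lowest] = new_clf
        weights[lowest] = 1.0

def softmax_normalize(weights):
    # One plausible implementation of the softmax normalization used above.
    w = np.asarray(weights, dtype=float)
    e = np.exp(w - w.max())  # subtract the max for numerical stability
    return (e / e.sum()).tolist()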