예제 #1
0
def test_naive_bayes(test_path):
    """Check NaiveBayes predictions, probabilities, info string and score.

    The learner is trained one sample at a time on 5000 samples drawn from
    ``SEAGenerator(random_state=1)``. Every 100th sample (except the very
    first) is predicted *before* the learner is updated with it, and the
    collected predictions / class probabilities are compared against stored
    reference values. Afterwards the learner is reset, batch-fitted on the
    first part of the collected samples and scored on the tail.

    :param test_path: directory containing ``data_naive_bayes_proba.npy``,
                      the reference class probabilities
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples (prequential: predict first, then train)
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
        1
    ])

    # Both sides are array('i'), so == already yields a single bool; the
    # former np.alltrue wrapper is gone from NumPy 2.0 and was not needed.
    assert y_pred == expected_predictions

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    # NOTE(review): sample 4500 falls in neither slice. The bounds are kept
    # as-is because the reference score below was presumably recorded with
    # exactly this split - confirm before changing either side.
    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()

    # X is still the last sample drawn from the stream.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
예제 #2
0
def test_clone():
    """Check that cloning a trained NaiveBayes yields an untrained learner.

    The learner is trained incrementally on 5000 samples from
    ``SEAGenerator(random_state=1)``; the test then asserts that the
    original has a non-empty ``_observed_class_distribution`` while the
    object returned by ``clone`` has an empty one.
    """
    stream = SEAGenerator(random_state=1)

    learner = NaiveBayes()

    max_samples = 5000
    wait_samples = 100

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Exercise the prediction paths every n samples. The results are not
        # checked here, so they are no longer accumulated in unused lists.
        if cnt != 0 and cnt % wait_samples == 0:
            learner.predict(X)
            learner.predict_proba(X)
        learner.partial_fit(X, y, classes=[0, 1])

    cloned = clone(learner)

    # Separate assertions so a failure shows which side is wrong.
    assert learner._observed_class_distribution != {}
    assert cloned._observed_class_distribution == {}
예제 #3
0
    def partial_fit(self, X, y=None, classes=None, weight=None):
        """
        Fit the ensemble to a data chunk
        Implement the basic Algorithm 1 as described in the paper

        A new base classifier is trained on the chunk and weighted by
        ``MSE_r - MSE_i`` (baseline error of a random classifier minus the
        classifier's own error on the chunk). The weights of the classifiers
        already in the ensemble are refreshed on the same chunk, and the new
        classifier is admitted if there is a free slot or if it beats the
        currently worst member.

        :param X: the training data (a data chunk S), a 2-D array
        :param y: the training labels
        :param classes: array-like, contains all possible labels,
                        if not provided, it will be derived from y
        :param weight:  array-like, instance weight
                        (accepted for interface compatibility; not used by
                        this method)
        :return: self
        """
        # Unpacking into two names also rejects input that is not 2-D.
        N, _ = X.shape

        # Count how often each class occurs in this chunk. The counts must be
        # aligned with `classes` and must come from y: counting the unique
        # values of `classes` itself would give 1 for every class.
        if classes is None:
            # if the classes are not provided, we derive them from y
            classes, class_count = np.unique(y, return_counts=True)
        else:
            labels = np.asarray(y)
            class_count = np.asarray([np.sum(labels == c) for c in classes])

        # (1) train classifier C' from X
        # allows a wider variety of classifiers
        if self.base_learner == "bayes":  # Naive Bayes
            C_new = NaiveBayes()
        else:  # by default, set to Hoeffding Tree
            C_new = HoeffdingTree()

        C_new.partial_fit(X, y, classes=classes)

        # (2) compute error rate/benefit of C_new via cross-validation on S

        # MSE_r: baseline error rate given by a random classifier that
        # predicts according to the class distribution of the chunk:
        #   MSE_r = sum_c p(c) * (1 - p(c)) ** 2
        class_dist = class_count / N
        MSE_r = np.sum(class_dist * (1 - class_dist) ** 2)

        # MSE_i: error rate of C_new on the chunk
        # f_ic = the probability given by C_new that x is an instance of class c
        MSE_i = self.compute_MSE(y, C_new.predict_proba(X), classes)

        # (3) derive weight w_new for C_new using (8) or (9)
        w_new = MSE_r - MSE_i

        # create a new classifier with its associated weight,
        # the unique labels of the data chunk it is trained on
        clf_new = self.WeightedClassifier(clf=C_new, weight=w_new, chunk_labels=classes)

        # (4) update the weights of each classifier in the ensemble
        for member in self.models:
            # apply Ci on S to derive its MSE
            member_mse = self.compute_MSE(
                y, member.clf.predict_proba(X), member.chunk_labels)
            # The attribute is `weight` (set by the constructor call above and
            # read in step 5); writing to `weights` left stale values in use.
            member.weight = MSE_r - member_mse

        # The in-place weight changes may have broken the heap ordering that
        # heappush/heappushpop and self.models[0] rely on; restore it.
        hq.heapify(self.models)

        # (5) C <- top K weighted classifiers in C U { C' }
        # selecting top K models by dropping the worst model,
        # i.e. the classifier with the smallest weight in C U { C' }
        if len(self.models) < self.K:
            # just push the new model in if there are still free slots
            hq.heappush(self.models, clf_new)
        elif clf_new.weight > self.models[0].weight:
            # the new model beats the bottom (worst) classifier:
            # push the new classifier and remove the bottom one
            hq.heappushpop(self.models, clf_new)
        # otherwise the new model is worse than every member: do nothing

        return self