def test_clone():
    """Check that ``clone`` yields a NaiveBayes without the learned state.

    A NaiveBayes learner is trained incrementally on samples drawn from a
    seeded SEAGenerator, then cloned. The trained learner must hold a
    non-empty ``_observed_class_distribution`` while the clone's must be
    empty.
    """
    stream = SEAGenerator(random_state=1)
    learner = NaiveBayes()
    max_samples = 5000

    # Only training matters here: predictions were previously collected into
    # buffers that no assertion ever read, so they are no longer computed.
    for _ in range(max_samples):
        X, y = stream.next_sample()
        learner.partial_fit(X, y, classes=[0, 1])

    cloned = clone(learner)

    # Two separate assertions so a failure names the side that is wrong.
    assert learner._observed_class_distribution != {}
    assert cloned._observed_class_distribution == {}
class Bayes(IncrementalClassifier):
    """IncrementalClassifier subclass that delegates to a NaiveBayes learner.

    The wrapped learner is stored in ``self.clf``.
    """

    def __init__(self):
        super().__init__()
        self.clf = NaiveBayes()

    def partial_fit(self, one_row):
        """Train the wrapped learner on a single sample.

        :param one_row: indexable whose item 0 is the feature vector and
            whose item 1 is the label
        """
        features = one_row[0]
        label = one_row[1]
        # The wrapped learner is handed a batch of exactly one sample.
        self.clf.partial_fit([features], [label])

    def predict(self, x):
        """Return the wrapped learner's prediction for ``x``."""
        prediction = self.clf.predict(x)
        return prediction
def test_naive_bayes(test_path):
    """Regression test for NaiveBayes on a seeded SEAGenerator stream.

    The learner is trained sample by sample (test-then-train every
    ``wait_samples`` samples). The recorded predictions and probabilities
    are compared with pinned values, then the learner is reset, refitted in
    batch mode and scored on a hold-out slice.

    :param test_path: directory containing ``data_naive_bayes_proba.npy``
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples (before training on the sample).
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    # NOTE: sample 4500 belongs to neither split. The pinned score depends on
    # this exact 499-sample hold-out, so the slice bounds are left untouched.
    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()

    # X still holds the last sample drawn in the training loop.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def _append_to_drift_log(text, path='drifts.txt'):
    """Append ``text`` to the drift log; the file is closed even on error."""
    with open(path, 'a+') as log_file:
        log_file.write(text)


def _widen_sample(sample_x, copies):
    """Return ``sample_x`` repeated ``copies`` times along axis 1.

    Starts from an empty ``(1, 0)`` array, so ``copies == 0`` yields an
    empty row instead of failing.
    """
    widened = np.array([[]])
    for _ in range(copies):
        widened = np.concatenate((widened, sample_x), axis=1)
    return widened


def _drifts_from_labelled_detector(detector, data, labels):
    """Feed (value, label) pairs to one detector; return drift indices."""
    drift_indices = []
    for index, (value, label) in enumerate(zip(data, labels)):
        detector.add_element(value=value, label=label)
        if detector.change_detected is True:
            drift_indices.append(index)
    return drift_indices


def _drifts_from_feature_detectors(detectors, data):
    """Feed feature j of each row to ``detectors[j]``; return drift indices.

    A row counts as a drift when at least one per-feature detector fires.
    """
    drift_indices = []
    for index, row in enumerate(data):
        any_detected = False
        # No early exit: every detector must see its value for this row.
        for detector, value in zip(detectors, row):
            detector.add_element(value)
            if detector.detected_change():
                any_detected = True
        if any_detected:
            drift_indices.append(index)
    return drift_indices


def _drifts_from_error_detector(detector, errors):
    """Feed a 0/1 error sequence to one detector; return drift indices."""
    drift_indices = []
    for index, error in enumerate(errors):
        detector.add_element(error)
        if detector.detected_change():
            drift_indices.append(index)
    return drift_indices


def main():
    """Benchmark several drift detectors on every stream in ``streams``.

    For each stream, samples are widened by repeating their features, a
    NaiveBayes model is evaluated test-then-train to obtain a 0/1 error
    sequence, and six detectors are run over the recorded data:
    Kernel_SWMEBWIN and SWMEBWIN on (sample, label) pairs, one ADWIN and one
    KSWIN per feature, and EDDM and DDM on the error sequence. Per-detector
    results are printed and appended to ``drifts.txt``; confusion-matrix
    counts are accumulated over all streams and summarised at the end.

    The plain MEBWIN detector is not part of the comparison.
    """
    n_samples = 1000000
    n_dims = 50

    # Order in which the overall summary is printed.
    summary_order = ('K-SWMEBWIN', 'SWMEBWIN', 'KSWIN', 'ADWIN', 'DDM', 'EDDM')
    totals = {name: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
              for name in summary_order}

    for stream in streams:
        print(stream.name)
        _append_to_drift_log(f'**{stream.name}**\n\n')

        stream.prepare_for_use()
        stream.next_sample()

        ddm = DDM(min_num_instances=30)
        eddm = EDDM()
        swmebwin = SWMEBWIN(classes=stream.target_values, w_size=80,
                            epsilon=0.05)
        # gamma maybe 1.0 / stream.current_sample_x.shape[1]
        k_swmebwin = Kernel_SWMEBWIN(classes=stream.target_values, w_size=80,
                                     epsilon=0.05)

        n_rand_dims = n_dims - stream.current_sample_x.size
        multiply = n_rand_dims // stream.current_sample_x.size

        # Pre-train on the first sample. It is widened exactly like every
        # later sample; previously the buffer was reset on each repetition,
        # so the model was pre-trained on a single-width sample (and the
        # name was undefined when ``multiply`` was 0).
        bayes = NaiveBayes()
        bayes.partial_fit(_widen_sample(stream.current_sample_x, multiply),
                          list(stream.current_sample_y.ravel()))

        adwin = [ADWIN(delta=0.002) for _ in range(n_dims)]
        kswin = [KSWIN(w_size=300, stat_size=30, alpha=0.0001)
                 for _ in range(n_dims)]

        # Record the widened stream and the test-then-train error sequence.
        data = []
        labels = []
        predictions = []
        for i in range(n_samples):
            current_sample_x = _widen_sample(stream.current_sample_x, multiply)
            data.append(current_sample_x.ravel())
            labels.append(stream.current_sample_y.ravel()[0])
            # 0 = correct prediction, 1 = error (input for DDM / EDDM).
            predictions.append(
                0 if bayes.predict(current_sample_x) == labels[i] else 1)
            bayes.partial_fit(current_sample_x,
                              list(stream.current_sample_y.ravel()))
            stream.next_sample()

        # (summary key, log label, log joiner, console label, runner, args)
        runs = (
            ('K-SWMEBWIN', 'K-SWMEB', ' ', 'K-SW-MEBWIN',
             _drifts_from_labelled_detector, (k_swmebwin, data, labels)),
            ('SWMEBWIN', 'SWMEB', ' ', 'SW-MEBWIN',
             _drifts_from_labelled_detector, (swmebwin, data, labels)),
            ('ADWIN', 'ADWIN', ' at ', 'ADWIN',
             _drifts_from_feature_detectors, (adwin, data)),
            ('KSWIN', 'KSWIN', ' at ', 'KSWIN',
             _drifts_from_feature_detectors, (kswin, data)),
            ('EDDM', 'EDDM', ' at ', 'EDDM',
             _drifts_from_error_detector, (eddm, predictions)),
            ('DDM', 'DDM', ' at ', 'DDM',
             _drifts_from_error_detector, (ddm, predictions)),
        )

        for key, log_label, joiner, console_label, detect, args in runs:
            start = time.time()
            drifts = detect(*args)
            elapsed = time.time() - start

            f1, tp, fp, tn, fn = confusion_matrix_stats(drifts, n_samples)
            counts = totals[key]
            counts['tp'] += tp
            counts['tn'] += tn
            counts['fp'] += fp
            # DDM used to add ``tn`` here, corrupting its overall F1.
            counts['fn'] += fn

            print(f'F1-Score: {f1}')
            print(f'{tp} true positives, {fp} false positives')
            print(f'{tn} true negatives, {fn} false negatives')

            # Log the same detection time that is printed; it was previously
            # re-measured after scoring and printing.
            _append_to_drift_log(
                f'{log_label} detected {len(drifts)} drifts in '
                f'{elapsed}{joiner}{drifts}\n\n')
            print(f'{console_label} took {elapsed} sec and detected '
                  f'{len(drifts)} drifts\n')

    # OVERALL STATISTICS
    for key in summary_order:
        counts = totals[key]
        tp, fp, tn, fn = (counts['tp'], counts['fp'],
                          counts['tn'], counts['fn'])
        print(50 * '-')
        print(f'{key}\n')
        print(f'Overall F1: {calc_f1(tp, fp, tn, fn)}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        print(50 * '-')
def partial_fit(self, X, y=None, classes=None, weight=None):
    """ Fit the ensemble to a data chunk

    Implements the basic Algorithm 1 as described in the paper: train a new
    classifier on the chunk, weight it and every existing member by
    ``MSE_r - MSE_i`` evaluated on the chunk, and keep the top K classifiers.

    :param X: the training data (a data chunk S), a 2-D array
    :param y: the training labels
    :param classes: array-like, contains all possible labels,
        if not provided, it will be derived from y
    :param weight: array-like, instance weight; accepted for interface
        compatibility but not used by this implementation
    :return: self
    """
    N, _ = X.shape
    labels = np.asarray(y)

    if classes is None:
        # Derive the labels and their frequencies from the chunk in one pass.
        classes, class_count = np.unique(labels, return_counts=True)
    else:
        # Frequencies must come from y. Counting the entries of ``classes``
        # itself (as done before) yields 1 for every class and therefore a
        # wrong baseline error.
        class_count = np.array(
            [np.count_nonzero(labels == c) for c in classes])

    # (1) train classifier C' from X
    if self.base_learner == "bayes":
        C_new = NaiveBayes()
    else:
        # by default, set to Hoeffding Tree
        C_new = HoeffdingTree()
    C_new.partial_fit(X, y, classes=classes)

    # (2) compute error rate/benefit of C_new via cross-validation on S
    # MSE_r: baseline error of a random classifier that follows the class
    # distribution observed in the chunk: sum_c p(c) * (1 - p(c))^2
    class_dist = class_count / N
    MSE_r = np.sum(class_dist * (1 - class_dist) ** 2)

    # MSE_i: error of C_new on X, based on the probability it assigns to
    # the true class of each instance
    MSE_i = self.compute_MSE(y, C_new.predict_proba(X), classes)

    # (3) derive weight w_new for C_new using (8) or (9)
    w_new = MSE_r - MSE_i

    # wrap the new classifier with its weight and the labels of its chunk
    clf_new = self.WeightedClassifier(clf=C_new, weight=w_new,
                                      chunk_labels=classes)

    # (4) update the weight of each classifier already in the ensemble.
    # The attribute is ``weight`` (as read below and set by the
    # constructor); writing ``weights`` left the ranking untouched.
    for member in self.models:
        member_mse = self.compute_MSE(
            y, member.clf.predict_proba(X), member.chunk_labels)
        member.weight = MSE_r - member_mse

    # The in-place updates can break the heap ordering; restore it so that
    # self.models[0] really is the worst classifier.
    hq.heapify(self.models)

    # (5) C <- top K weighted classifiers in C U { C' }
    if len(self.models) < self.K:
        # still a free slot: just push the new model in
        hq.heappush(self.models, clf_new)
    elif clf_new.weight > self.models[0].weight:
        # new model beats the worst member: push it and drop the worst
        hq.heappushpop(self.models, clf_new)
    # otherwise the new model is discarded

    return self