def evaluation1():
    # List of classification algorithms to be tested
    classifiers = [
        OzaBagging(base_estimator=HoeffdingTree()),
        OzaBaggingAdwin(base_estimator=HoeffdingTree())
    ]
    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    # Initialize the stream generators of the scikit-multiflow package
    cv.streams = cv.init_standard_streams() + cv.init_real_world() \
        + cv.init_reoccuring_streams()
    cv.test()
    cv.save_summary()
def unsupervised_analysis(df, nu, size, percent):
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true = 0

    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')

    start = time.time()  # needed below to report the elapsed time
    # Pretrain on the first `size` samples
    X, y = stream.next_sample(size)
    stream_clf.partial_fit(X, y, classes=stream.target_values)
    clf.fit(X)

    i = 0
    while stream.has_more_samples():
        X, y = stream.next_sample()
        if buffer.isEmpty():
            buffer.addInstance(X, y, clf.predict(X))
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X, y)
            stream_acc.append(stream_true / (i + 1))
            stream_record.append(check_true(y, y_hat))
        else:
            if buffer.driftCheck():
                # print("concept drift detected at {}".format(i))
                # Retrain the classifier on the buffered window
                stream_clf.reset()
                stream_clf.partial_fit(buffer.getCurrentData(),
                                       buffer.getCurrentLabels(),
                                       classes=stream.target_values)
                # Update the one-class SVM
                clf.fit(buffer.getCurrentData())
            # Evaluate and update the model
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X, y)
            stream_acc.append(stream_true / (i + 1))
            stream_record.append(check_true(y, y_hat))
            # Add the new sample to the window
            buffer.addInstance(X, y, clf.predict(X))
        i = i + 1

    # print(buffer.drift_count)
    elapsed = format(time.time() - start, '.4f')
    acc = format(stream_acc[-1] * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(
        nu, size, percent, acc, elapsed)
    return final_accuracy, stream_record
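# Hedged usage sketch for unsupervised_analysis(): the CSV path and the
# nu/size/percent values below are hypothetical, and dataBuffer/check_true
# are assumed to be defined alongside this function, as in the surrounding
# scripts. DataStream treats the last DataFrame column as the class label.
import pandas as pd

df = pd.read_csv('data/elec.csv')  # placeholder path
summary, record = unsupervised_analysis(df, nu=0.5, size=100, percent=0.7)
print(summary)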
def evaluation():
    # List of classification algorithms to be tested
    classifiers = [
        GLVQ(prototypes_per_class=4),
        HoeffdingTree(),
        HAT(),
        KNN(),
        SAMKNN(),
        LeverageBagging(),
        KNNAdwin(max_window_size=1000)
    ]
    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    # Initialize the stream generators of the scikit-multiflow package
    cv.streams = cv.init_standard_streams() + cv.init_real_world() \
        + cv.init_reoccuring_streams()
    cv.test()
    cv.save_summary()
def parameter_q_and_t(self):
    accuracy_of_combinations = []
    combination = []
    quantile_percent = [0.50, 0.75, 1.0]
    threshold = [0.5, 0.6, 0.7]
    test_X, test_y = get_data_batches(self.X_array, self.y_array)
    ensemble_clf = DecisionTreeClassifier()
    clf = HoeffdingTree()
    bootstrap_count = 100
    for q in quantile_percent:
        for t in threshold:
            # Train on the first batch, then evaluate on the rest
            Train_X = test_X[0]
            Train_y = test_y[0].flatten()
            clf = clf.fit(Train_X, Train_y)
            MPD3_detector = MPD3(bootstrap_count, q, t)
            ensemble = MPD3_detector.ensemble_bootstrap(Train_X, Train_y)
            batch_accuracy = []
            for i in range(len(test_X) - 1):
                index = i + 1
                prediction = clf.predict(test_X[index])
                batch_accuracy.append(accuracy_score(test_y[index], prediction))
                mpd_value = MPD3_detector.MPD_score(test_X[index], ensemble)
                if MPD3_detector.drift_check(mpd_value):
                    # Drift detected: update the classifier and the ensemble
                    Train_X = test_X[index]
                    Train_y = test_y[index].flatten()
                    clf = clf.partial_fit(Train_X, Train_y)
                    ensemble = MPD3_detector.ensemble_bootstrap(Train_X, Train_y)
            mean_accuracy = np.average(batch_accuracy)
            accuracy_of_combinations.append(mean_accuracy)
            combination.append([q, t])
    index_of_max_acc = np.argmax(accuracy_of_combinations)
    final_q, final_t = combination[index_of_max_acc]
    return final_q, final_t
def test_multi_output_learner():
    stream = MultilabelGenerator(n_samples=5150, n_features=15, n_targets=3,
                                 n_labels=4, random_state=112)
    stream.prepare_for_use()
    classifier = MultiOutputLearner(base_estimator=HoeffdingTree())

    # Pretrain on the first 150 samples
    X, y = stream.next_sample(150)
    classifier.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(classifier.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1
        classifier.partial_fit(X, y)
        cnt += 1

    perf = hamming_score(true_labels, predictions)
    expected_predictions = [[1., 1., 1.], [1., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 1., 1.], [0., 1., 1.], [1., 0., 1.], [1., 0., 1.],
                            [1., 1., 1.], [0., 0., 1.], [0., 1., 1.], [0., 1., 1.],
                            [1., 1., 1.], [0., 1., 1.], [1., 1., 0.], [1., 1., 1.],
                            [0., 1., 1.], [1., 0., 0.], [1., 0., 1.], [1., 1., 1.],
                            [1., 0., 1.], [1., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 1., 1.], [1., 1., 1.], [1., 1., 1.], [1., 0., 0.],
                            [0., 1., 1.], [1., 1., 0.], [1., 1., 1.], [0., 1., 1.],
                            [1., 1., 1.], [0., 1., 1.], [1., 0., 1.], [1., 0., 1.],
                            [0., 0., 1.], [0., 1., 1.], [1., 1., 0.], [0., 1., 1.],
                            [1., 1., 1.], [1., 1., 1.], [1., 0., 1.], [1., 1., 1.],
                            [1., 1., 1.], [1., 0., 1.], [1., 1., 1.], [1., 1., 1.],
                            [0., 1., 1.]]
    expected_correct_predictions = 32
    expected_performance = 0.8503401360544217

    assert np.alltrue(np.array_equal(predictions, expected_predictions))
    assert np.isclose(expected_performance, perf)
    assert correct_predictions == expected_correct_predictions
    assert type(classifier.predict(X)) == np.ndarray
    assert type(classifier.predict_proba(X)) == np.ndarray
    df.iloc[:, 0:df.shape[1] - 1] = scaler.fit_transform(
        df.iloc[:, 0:df.shape[1] - 1])
    return df


def check_true(y, y_hat):
    if y == y_hat:
        return 1
    else:
        return 0


df = select_data(sys.argv[1])
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()

w = int(sys.argv[2])
rho = float(sys.argv[3])
auc = float(sys.argv[4])

D3_win = D3(w, rho, stream.n_features, auc)
stream_acc = []
stream_record = []
stream_true = 0

i = 0
start = time.time()
# Pretrain on the first w * rho samples
X, y = stream.next_sample(int(w * rho))
stream_clf.partial_fit(X, y, classes=stream.target_values)
from skmultiflow.data import WaveformGenerator
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential

# 1. Create a stream
stream = WaveformGenerator()
stream.prepare_for_use()

# 2. Instantiate the HoeffdingTree classifier
ht = HoeffdingTree()

# 3. Setup the evaluator
evaluator = EvaluatePrequential(show_plot=False,
                                pretrain_size=200,
                                max_samples=20000)

# 4. Run evaluation
evaluator.evaluate(stream=stream, model=ht)
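# The same prequential setup extends to comparing several models at once.
# A minimal sketch, assuming the `metrics` and `model_names` arguments of
# scikit-multiflow's EvaluatePrequential; the stream is recreated because
# the run above already consumed it.
from skmultiflow.lazy import KNN

stream = WaveformGenerator()
stream.prepare_for_use()

evaluator = EvaluatePrequential(pretrain_size=200,
                                max_samples=20000,
                                metrics=['accuracy', 'kappa'])
evaluator.evaluate(stream=stream,
                   model=[HoeffdingTree(), KNN()],
                   model_names=['HT', 'KNN'])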
labels.columns = ['class']
n_samples = XT.shape[0] - preparatory_size

######################## CURIE ###################
lst_dim = [n_bins] * n_feats
curie = CA_VonNeumann_Classifier(bins=[], bins_margin=bins_margin,
                                 dimensions=lst_dim, cells=empties(lst_dim))
limits_automata = list(np.zeros(1))
# ca_names = ['CURIE']
mutants_time = empty_mutant(curie.dimensions)

######################## LEARNERS ###################
learners_ref = [HoeffdingTree(), KNN(), NaiveBayes()]

######################## DETECTORS ###################
detectores_ref = [DDM(), EDDM(), ADWIN(), PageHinkley(), curie]

n_pasos = len(datasets) * len(tipos) * len(learners_ref) * len(detectores_ref)

SCORES_LER = []
TIMES_LER = []
RAMS_LER = []
DETECTIONS_LER = []

for ler in range(len(learners_ref)):
    learner = deepcopy(learners_ref[ler])
### Hoeffding Tree online classifier ###

### HT online for RBF ###

HT = HoeffdingTree()
positive = 0
cnt = 1
temp_accuracy = []
itr = []
HT_RBF_prediction = []
for i in range(len(RBF_X)):
    tempx = np.array([RBF_X[i]])
    tempy = np.array([RBF_Y[i]])
    prediction = HT.predict(tempx)
    if tempy == prediction:
        positive += 1
    temp_accuracy.append(positive / cnt)
    HT_RBF_prediction.append(int(HT.predict(tempx)))
    # Train on the sample after testing it (test-then-train)
    HT.partial_fit(tempx, tempy)
    itr.append(i)
    cnt += 1
def __init__(self, alpha=0.001, drift_detector="KSWIN"):
    self.classifier = HoeffdingTree()
    self.init_drift_detection = True
    self.drift_detector = drift_detector.upper()
    self.confidence = alpha
    self.n_detections = 0
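# For context, a minimal standalone sketch of the KSWIN detection loop this
# constructor configures; the synthetic data and the lack of a reset policy
# are illustrative assumptions, not this class's actual logic.
from skmultiflow.drift_detection import KSWIN
import numpy as np

kswin = KSWIN(alpha=0.001)
data = np.concatenate([np.random.normal(0, 1, 500),
                       np.random.normal(3, 1, 500)])  # drift injected at i=500
for i, value in enumerate(data):
    kswin.add_element(value)  # feed one scalar per time step
    if kswin.detected_change():
        print("KSWIN detected change at index", i)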
def hyperparametertuning_classifiers(classifiers, scoring, cv, X_init, y_init,
                                     max_iter, knn_max_window_size):
    for cl in range(len(classifiers)):
        cl_name = classifiers[cl].__class__.__name__
        if cl_name == 'PassiveAggressiveClassifier':
            PAC_grid = {
                'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
                'max_iter': [max_iter, 100, 200, 500]
            }
            grid_cv_PAC = RandomizedSearchCV(classifiers[cl], PAC_grid,
                                             cv=cv, scoring=scoring)
            grid_cv_PAC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_PAC.best_estimator_
        elif cl_name == 'SGDClassifier':
            SGDC_grid = {
                'alpha': 10.0**-np.arange(1, 7),
                'loss': ['perceptron', 'hinge', 'log', 'modified_huber',
                         'squared_hinge'],
                'learning_rate': ['constant', 'optimal', 'invscaling',
                                  'adaptive'],
                'eta0': [0.1, 0.5, 1.0],
                'penalty': [None, 'l2', 'l1', 'elasticnet'],
                'max_iter': [max_iter, 100, 200, 500]
            }
            grid_cv_SGDC = RandomizedSearchCV(classifiers[cl], SGDC_grid,
                                              cv=cv, scoring=scoring)
            grid_cv_SGDC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_SGDC.best_estimator_
        elif cl_name == 'MLPClassifier':
            MLPC_grid = {
                'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
                'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['sgd', 'adam'],
                'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'learning_rate_init': [0.0005, 0.001, 0.005],
                'alpha': 10.0**-np.arange(1, 10),
                'batch_size': [1, 'auto'],
                'max_iter': [1, 100, 200, 500]
            }
            grid_cv_MLPC = RandomizedSearchCV(classifiers[cl], MLPC_grid,
                                              cv=cv, scoring=scoring)
            grid_cv_MLPC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MLPC.best_estimator_
        elif cl_name == 'MondrianTreeClassifier':
            MTC_grid = {
                'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
                'min_samples_split': [2, 5, 10]
            }
            grid_cv_MTC = RandomizedSearchCV(classifiers[cl], MTC_grid,
                                             cv=cv, scoring=scoring)
            grid_cv_MTC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MTC.best_estimator_
        elif cl_name == 'MondrianForestClassifier':
            MFC_grid = {
                'n_estimators': [5, 10, 25, 50, 100],
                'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
                'min_samples_split': [2, 5, 10]
            }
            grid_cv_MFC = RandomizedSearchCV(classifiers[cl], MFC_grid,
                                             cv=cv, scoring=scoring)
            grid_cv_MFC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MFC.best_estimator_
        elif cl_name == 'KNN':
            # Tune a batch KNN, then transfer the best parameters to the
            # streaming KNN from skmultiflow.lazy
            KNN_grid = {
                'n_neighbors': [5, 10, 15, 20],
                'leaf_size': [5, 10, 20, 30],
                'algorithm': ['auto'],
                'weights': ['uniform', 'distance']
            }
            grid_cv_KNN = RandomizedSearchCV(KNeighborsClassifier(), KNN_grid,
                                             cv=cv, scoring=scoring)
            grid_cv_KNN.fit(X_init, y_init)
            n_neighbors = grid_cv_KNN.best_params_['n_neighbors']
            leaf_size = grid_cv_KNN.best_params_['leaf_size']
            classifiers[cl] = lazy.KNN(n_neighbors=n_neighbors,
                                       max_window_size=knn_max_window_size,
                                       leaf_size=leaf_size)
        # elif cl_name == 'VFDR':
        #     VFDR_grid = {'ordered_rules': [True, False],
        #                  'rule_prediction': ['first_hit', 'weighted_max',
        #                                      'weighted_sum'],
        #                  'max_rules': [5, 10, 20, 30, 50],
        #                  'drift_detector': [None],
        #                  'expand_criterion': ['info_gain', 'hellinger',
        #                                       'foil_gain']}
        #     classifiers[cl] = VFDR()
        elif cl_name == 'HoeffdingTree':
            print(cl_name, ' No tuning yet!')
            classifiers[cl] = HoeffdingTree()
        elif cl_name == 'GaussianNB':
            classifiers[cl] = GaussianNB()
    return classifiers
SCORES_sgdc = []
SCORES_htc = []
SCORES_mtc = []
SCORES_pac = []
SCORES_gnbc = []
SCORES_knn = []
SCORES_mlpc = []

for ru in range(runs):
    print('-RUN=' + str(ru))

    # Define the streaming learners
    SGDC = SGDClassifier()
    HTC = HoeffdingTree()
    MTC = MondrianTreeClassifier()
    PAC = PassiveAggressiveClassifier()
    GNBC = GaussianNB()
    KNN = lazy.KNN()
    MLPC = MLPClassifier()
    classifiers = [SGDC, HTC, PAC, KNN]

    # Data
    features = pd.DataFrame(XT)
    labels = pd.DataFrame(YT)
    features.columns = columns
    labels.columns = ['class']

    # Data slicing