def unsupervised_analysis(df, nu, size, percent):
    """Run a prequential evaluation driven by a one-class-SVM drift buffer.

    A Hoeffding tree is warmed up on the first ``size`` samples of ``df`` and
    a one-class SVM is fitted on the same samples. Every following sample is
    first predicted, then used to update the tree (test-then-train), and
    finally pushed into a ``dataBuffer`` together with the SVM's prediction
    for it. Whenever the buffer reports drift, the tree is reset and both
    models are refitted on the buffer's current contents.

    Parameters
    ----------
    df : data accepted by ``DataStream``
    nu : ``nu`` parameter forwarded to ``svm.OneClassSVM``
    size : number of warm-up samples; also forwarded to ``dataBuffer``
    percent : forwarded to ``dataBuffer`` (presumably the drift threshold --
        confirm against the ``dataBuffer`` implementation)

    Returns
    -------
    tuple
        ``(summary, stream_record)`` where ``summary`` is a formatted string
        with the parameters, final accuracy (in percent) and elapsed seconds,
        and ``stream_record`` holds the per-sample ``check_true`` results.
    """
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true = 0
    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')

    # The timer must be started here: the elapsed time is computed from it
    # at the end of the function.
    start = time.time()

    X, y = stream.next_sample(size)
    stream_clf.partial_fit(X, y, classes=stream.target_values)
    clf.fit(X)

    i = 0
    while stream.has_more_samples():
        X, y = stream.next_sample()

        # Drift can only be checked once the buffer holds something.
        if not buffer.isEmpty() and buffer.driftCheck():
            # Drift detected: rebuild the classifier from the buffered window
            # and refresh the one-class SVM on the same data.
            stream_clf.reset()
            stream_clf.partial_fit(buffer.getCurrentData(),
                                   buffer.getCurrentLabels(),
                                   classes=stream.target_values)
            clf.fit(buffer.getCurrentData())

        # Test-then-train on the incoming sample.
        y_hat = stream_clf.predict(X)
        hit = check_true(y, y_hat)
        stream_true = stream_true + hit
        stream_clf.partial_fit(X, y)
        stream_acc.append(stream_true / (i + 1))
        stream_record.append(hit)

        # Add the new sample to the window with the SVM's verdict for it.
        buffer.addInstance(X, y, clf.predict(X))
        i = i + 1

    elapsed = format(time.time() - start, '.4f')
    # No sample beyond the warm-up means there is no accuracy to report.
    last_acc = stream_acc[-1] if stream_acc else float('nan')
    acc = format(last_acc * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(nu, size, percent, acc, elapsed)
    return final_accuracy, stream_record
def test_data_stream(test_path, package_path):
    """Check ``DataStream`` when built from a single DataFrame.

    The SEA CSV is loaded as one frame (no separate ``y``), so the stream
    must report ``_Y_is_defined`` as false. The test then verifies the
    stream's metadata and compares the first samples against the stored
    ``.npz`` reference.
    """
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    assert not stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must return the sample that was just drawn.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
def test_data_stream_X_y(test_path, package_path):
    """Check ``DataStream`` when features and targets are passed separately.

    The last CSV column is split off as ``y`` and the remaining columns as
    ``X``. The test verifies the stream's metadata and compares the first
    samples against the stored ``.npz`` reference.
    """
    test_file = os.path.join(package_path, 'src/skmultiflow/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 target_values'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must return the sample that was just drawn.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]
def InnerCycle_Train(X, y, inject_drift, perc_train):
    """Fit the teacher and student models on the head of the stream.

    The first ``int(perc_train * X.shape[0])`` rows are drawn from a
    ``DataStream`` built on ``X``/``y``. An ARF teacher is fitted on the true
    labels. An ARF student is fitted on the teacher's predicted labels, and
    an ``RHT`` regressor on the teacher's maximum class probability per row.

    Parameters
    ----------
    X, y : features and targets handed to ``DataStream``
    inject_drift : when truthy, ``Driftpoints``/``Swapcols`` are used to
        alter ``X`` from the chosen row onwards before the stream is built
    perc_train : fraction of rows used for training

    Returns
    -------
    dict
        Keys ``"Teacher"``, ``"Student"``, ``"StudentRegression"``,
        ``"Driftpoints"``, ``"n"``, ``"Stream"`` and ``"Xtrain"``.
    """
    # number of rows reserved for training
    ntrain = int(perc_train * X.shape[0])

    if not inject_drift:
        # no artificial drift: mark the "drift row" as the end of the data
        dpoints = {"row": X.shape[0], "cols": 0}
    else:
        # original author's note: the change point lies between 0.7 and 0.9
        # of the stream (behaviour of Driftpoints, not verified here)
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain
        # contaminate X from the change point onwards
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])

    # wrap the (possibly altered) data as a stream
    stream = DataStream(X, y)
    stream.prepare_for_use()

    # teacher: the main incremental classifier
    teacher = ARF(n_estimators=25)

    # training data is the first ntrain rows of the stream
    Xtrain, ytrain = stream.next_sample(ntrain)
    teacher.fit(Xtrain, ytrain, classes=stream.target_values)

    teacher_labels = teacher.predict(Xtrain)
    teacher_probs = teacher.predict_proba(Xtrain)
    # highest class probability for each training row
    teacher_confidence = np.array([np.max(row) for row in teacher_probs])

    # student classifier mimics the teacher's predicted labels
    student_clf = ARF(n_estimators=25)
    student_clf.fit(Xtrain, teacher_labels, classes=stream.target_values)

    # student regressor mimics the teacher's confidence
    student_regr = RHT()
    student_regr.fit(Xtrain, teacher_confidence)

    return {
        "Teacher": teacher,
        "Student": student_clf,
        "StudentRegression": student_regr,
        "Driftpoints": dpoints,
        "n": ntrain,
        "Stream": stream,
        "Xtrain": Xtrain,
    }
total_length = int(total_length) for data in response.iter_content(chunk_size=4096): dl += len(data) f.write(data) done = int(50 * dl / total_length) sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) sys.stdout.flush() data = np.load(file_name, allow_pickle=True) return data # data = download_data() #If dataset file is already downloaded data = np.load(file_name, allow_pickle=True) sam = SAMKNN() arf = HoeffdingAdaptiveTreeClassifier() stream = DataStream(data[:, 1:], data[:, 0].astype(int)) stream.prepare_for_use() evaluator = EvaluatePrequential(max_samples=10000, max_time=1000, show_plot=True, metrics=['accuracy', 'kappa']) evaluator.evaluate(stream=stream, model=[sam, arf], model_names=['Sam', 'RSLVQ'])
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):
    """Run one drift-detection experiment over a stream and collect alarms.

    A teacher ARF is fitted on the first ``int(perc_train * X.shape[0])``
    rows. A student ARF (fitted on the teacher's labels) and an ``RHT``
    regressor (fitted on the teacher's maximum class probability) are
    trained on the same rows. The remaining rows are processed one at a
    time and fed to 18 drift detectors covering supervised, delayed,
    label-scarce and unsupervised set-ups.

    Parameters
    ----------
    X, y : features and targets handed to ``DataStream``
    inject_drift : when truthy, ``Driftpoints``/``Swapcols`` alter ``X``
        before the stream is built
    perc_train : fraction of rows used for training
    window : ``window`` argument of every ``HypothesisTestDetector``
    delta : not used by this function (the detectors are built with their
        default arguments)
    pval : ``thr`` argument of every ``HypothesisTestDetector``
    prob_instance : success probability of the per-step binomial draw that
        decides whether the label is made available to the WS/DWS models
    inst_delay : number of steps a label is held back for the DS/DWS models

    Returns
    -------
    list
        ``[alarms, dpoints]``. ``alarms`` holds one list of row indices per
        detector, in the order S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN,
        WS_PHT, DWS_ADWIN, DWS_PHT, U_ADWIN, U_PHT, UR_ADWIN, UR_PHT,
        WRS_Output, TT_Output, KS_Output, WRS_Prob, TT_Prob, KS_Prob.
    """
    # number of rows reserved for training
    ntrain = int(perc_train * X.shape[0])

    if not inject_drift:
        dpoints = {"row": X.shape[0], "cols": 0}
    else:
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain
        # contaminate X from the change point onwards
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])

    stream = DataStream(X, y)
    stream.prepare_for_use()

    # ---- teacher ---------------------------------------------------------
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)
    Xtrain, ytrain = stream.next_sample(ntrain)
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)

    teacher_labels = stream_clf.predict(Xtrain)
    teacher_probs = stream_clf.predict_proba(Xtrain)
    teacher_confidence = np.array([np.max(row) for row in teacher_probs])

    # ---- student models --------------------------------------------------
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, teacher_labels, classes=stream.target_values)

    student_regr = RHT()
    student_regr.fit(Xtrain, teacher_confidence)

    # ---- drift detectors -------------------------------------------------
    # Six [ADWIN, PHT] pairs, one per monitored signal, created in the
    # order: supervised, delayed supervised, semi-supervised, delayed
    # semi-supervised, student error, student regression error.
    error_pairs = [[ADWIN(), PHT()] for _ in range(6)]
    (S_driftmodels, DS_driftmodels, WS_driftmodels, DWS_driftmodels,
     Ustd_driftmodels, Ustdreg_driftmodels) = error_pairs

    # Hypothesis-test detectors: one on the predicted label and one on the
    # class probability for each of the three tests.
    Uoutput_driftmodels = []
    Uprob_driftmodels = []
    for method in ("wrs", "tt", "ks"):
        Uoutput_driftmodels.append(
            HypothesisTestDetector(method=method, window=window, thr=pval))
        Uprob_driftmodels.append(
            HypothesisTestDetector(method=method, window=window, thr=pval))

    Driftmodels = [det for pair in error_pairs for det in pair]
    Driftmodels = Driftmodels + Uoutput_driftmodels + Uprob_driftmodels
    Driftmodels_alarms = [[] for _ in Driftmodels]

    # ---- per-scenario copies of the teacher ------------------------------
    S_clf = copy.deepcopy(stream_clf)    # always updated
    DS_clf = copy.deepcopy(stream_clf)   # always updated, with delay
    WS_clf = copy.deepcopy(stream_clf)   # updated immediately with some prob
    DWS_clf = copy.deepcopy(stream_clf)  # updated with delay with some prob
    U_clf = copy.deepcopy(stream_clf)    # never updated

    row_idx = ntrain   # absolute row index, recorded in the alarm lists
    step = 0           # number of rows processed after training

    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []

    # The last ten training rows are prepended to every single-row query;
    # only the final prediction (the new row) is kept.
    context_rows = Xtrain[-10:]

    while stream.has_more_samples():
        print(row_idx)

        Xi, yi = stream.next_sample()
        y_hist.append(yi[0])
        X_hist.append(Xi)

        ext_Xi = np.concatenate([context_rows, Xi])

        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]

        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)

        # Reduce the probability vector to a single score: the max for more
        # than two entries, the second entry for exactly two, otherwise the
        # only entry.
        n_probs = len(U_prob)
        if n_probs > 2:
            U_yhat_prob_i = np.max(U_prob)
        elif n_probs == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = U_prob[0]

        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]

        # student learns the frozen teacher's output; S_clf learns the truth
        student_clf.partial_fit(Xi, [U_yhat])
        S_clf.partial_fit(Xi, yi)

        # losses for this step
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob

        for detectors, value in ((S_driftmodels, S_err_i),
                                 (Ustd_driftmodels, student_err_i),
                                 (Ustdreg_driftmodels, student_prob_err_i),
                                 (Uoutput_driftmodels, U_yhat),
                                 (Uprob_driftmodels, U_yhat_prob_i)):
            for detector in detectors:
                detector.add_element(value)

        # 1 when this step's label is made available to WS/DWS
        put_i_available = np.random.binomial(1, prob_instance)

        if step >= inst_delay:
            # labels arrive inst_delay steps late
            lag = step - inst_delay
            DS_err_i = int(y_hist[lag] != DS_yhat_hist[lag])
            DS_clf.partial_fit(X_hist[lag], [y_hist[lag]])
            for detector in DS_driftmodels:
                detector.add_element(DS_err_i)

            if put_i_available > 0:
                DWS_err_i = int(y_hist[lag] != DWS_yhat_hist[lag])
                DWS_clf.partial_fit(X_hist[lag], [y_hist[lag]])
                for detector in DWS_driftmodels:
                    detector.add_element(DWS_err_i)

        if put_i_available > 0:
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for detector in WS_driftmodels:
                detector.add_element(WS_err_i)

        # record the row index for every detector that signals a change
        for alarms, detector in zip(Driftmodels_alarms, Driftmodels):
            if detector.detected_change():
                alarms.append(row_idx)

        row_idx += 1
        step += 1

    return [Driftmodels_alarms, dpoints]