def test_knn_adwin():
    """Regression test for KNNADWINClassifier on a drifting SEA stream.

    The learner is trained one sample at a time on 1000 samples; every 20th
    sample (except the very first) is predicted *before* being learned and the
    prediction is recorded. The recorded predictions, the number of hits, the
    state after ``reset()``, the ``get_info()`` string and the return types of
    ``predict`` / ``predict_proba`` are compared against pinned values.
    """
    stream = ConceptDriftStreamGenerator(
        stream=SEAGenerator(random_state=1),
        drift_stream=SEAGenerator(random_state=2, classification_function=2),
        random_state=1, position=250, width=10)

    learner = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=200)

    max_samples = 1000
    wait_samples = 20
    predictions = array('i')
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: predict before partial_fit)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 1, 0, 1, 1
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.data_window.size == 0

    expected_info = "KNNADWINClassifier(leaf_size=40, max_window_size=200, " \
                    "metric='euclidean', n_neighbors=8)"
    # Collapse any whitespace/newlines in get_info() before comparing.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    # X still holds the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def OzaBagging(base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None):     # pragma: no cover
    """Deprecated alias: warn, then build and return an ``OzaBaggingClassifier``.

    Parameters
    ----------
    base_estimator
        Forwarded unchanged to ``OzaBaggingClassifier``.
        NOTE: the default ``KNNADWINClassifier()`` is instantiated once, when
        this function is defined, and that same object is passed on every
        call that relies on the default.
    n_estimators: int (default=10)
        Forwarded unchanged.
    random_state: (default=None)
        Forwarded unchanged.

    Returns
    -------
    The object returned by ``OzaBaggingClassifier(...)``.
    """
    # stacklevel=2 attributes the warning to the caller's line (the code that
    # still uses the old name) instead of to this wrapper.
    warnings.warn("'OzaBagging' has been renamed to 'OzaBaggingClassifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0",
                  category=FutureWarning, stacklevel=2)
    return OzaBaggingClassifier(base_estimator=base_estimator,
                                n_estimators=n_estimators,
                                random_state=random_state)
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, sampling_rate=3, algorithm=1,
             drift_detection=True, random_state=None):
    """Store the constructor arguments, clear internal state and configure.

    Every argument is stored verbatim on the attribute of the same name.
    All internal bookkeeping attributes start as ``None``; the private
    ``__configure`` step is invoked last.

    Note that the default ``base_estimator`` instance is created once, when
    this method is defined.
    """
    super().__init__()

    # Hyper-parameters, kept exactly as passed in.
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.random_state = random_state
    self.sampling_rate = sampling_rate
    self.algorithm = algorithm
    self.drift_detection = drift_detection

    # Internal state: nothing is set until configuration/training fills it in.
    self.ensemble = self.actual_n_estimators = self.classes = None
    self._random_state = self.adwin_ensemble = None
    self.lam_sc = self.lam_pos = self.lam_neg = self.lam_sw = None
    self.epsilon = None

    self.__configure()
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, cost_positive=1, cost_negative=0.1,
             drift_detection=True, random_state=None):
    """Store the constructor arguments and clear all internal state.

    Every argument is stored verbatim on the attribute of the same name;
    the remaining attributes are initialised to ``None``.

    Note that the default ``base_estimator`` instance is created once, when
    this method is defined.
    """
    super().__init__()

    # Internal state that is unset at construction time.
    self.ensemble = self.actual_n_estimators = self.classes = None
    self._random_state = None

    # Hyper-parameters, kept exactly as passed in.
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.cost_positive = cost_positive
    self.cost_negative = cost_negative
    self.drift_detection = drift_detection
    self.random_state = random_state

    # Remaining bookkeeping attributes, also unset for now.
    self.adwin_ensemble = None
    self.lam_fn = self.lam_fp = self.lam_sum = self.lam_sw = None
    self.werr = self.epsilon = None
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None): super().__init__(base_estimator, n_estimators, random_state) # default values self.adwin_ensemble = None self.__configure()
def OnlineBoosting(base_estimator=KNNADWINClassifier(), n_estimators=10, drift_detection=True,
                   random_state=None):     # pragma: no cover
    """Deprecated alias: warn, then build and return an ``OnlineBoostingClassifier``.

    Parameters
    ----------
    base_estimator
        Forwarded unchanged to ``OnlineBoostingClassifier``.
        NOTE: the default ``KNNADWINClassifier()`` is instantiated once, when
        this function is defined, and that same object is passed on every
        call that relies on the default.
    n_estimators: int (default=10)
        Forwarded unchanged.
    drift_detection: bool (default=True)
        Forwarded unchanged.
    random_state: (default=None)
        Forwarded unchanged.

    Returns
    -------
    The object returned by ``OnlineBoostingClassifier(...)``.
    """
    # stacklevel=2 attributes the warning to the caller's line (the code that
    # still uses the old name) instead of to this wrapper.
    warnings.warn("'OnlineBoosting' has been renamed to 'OnlineBoostingClassifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0",
                  category=FutureWarning, stacklevel=2)
    return OnlineBoostingClassifier(base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    drift_detection=drift_detection,
                                    random_state=random_state)
def OnlineAdaC2(base_estimator=KNNADWINClassifier(), n_estimators=10, cost_positive=1, cost_negative=0.1,
                drift_detection=True, random_state=None):     # pragma: no cover
    """Deprecated alias: warn, then build and return an ``OnlineAdaC2Classifier``.

    Parameters
    ----------
    base_estimator
        Forwarded unchanged to ``OnlineAdaC2Classifier``.
        NOTE: the default ``KNNADWINClassifier()`` is instantiated once, when
        this function is defined, and that same object is passed on every
        call that relies on the default.
    n_estimators: int (default=10)
        Forwarded unchanged.
    cost_positive: (default=1)
        Forwarded unchanged.
    cost_negative: (default=0.1)
        Forwarded unchanged.
    drift_detection: bool (default=True)
        Forwarded unchanged.
    random_state: (default=None)
        Forwarded unchanged.

    Returns
    -------
    The object returned by ``OnlineAdaC2Classifier(...)``.
    """
    # stacklevel=2 attributes the warning to the caller's line (the code that
    # still uses the old name) instead of to this wrapper.
    warnings.warn("'OnlineAdaC2' has been renamed to 'OnlineAdaC2Classifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0",
                  category=FutureWarning, stacklevel=2)
    return OnlineAdaC2Classifier(base_estimator=base_estimator,
                                 n_estimators=n_estimators,
                                 cost_positive=cost_positive,
                                 cost_negative=cost_negative,
                                 drift_detection=drift_detection,
                                 random_state=random_state)
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None):
    """Clear internal state, store the constructor arguments and configure.

    ``base_estimator``, ``n_estimators`` and ``random_state`` are stored
    verbatim on attributes of the same name; the private ``__configure``
    step is invoked last.

    Note that the default ``base_estimator`` instance is created once, when
    this method is defined.
    """
    super().__init__()

    # Internal state, unset until configuration/training fills it in.
    self.ensemble = self.actual_n_estimators = self.classes = None
    # This is the actual random_state object used internally
    self._random_state = None

    # Hyper-parameters, kept exactly as passed in.
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.random_state = random_state

    self.__configure()
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, sampling_rate=1,
             drift_detection=True, random_state=None):
    """Store the constructor arguments and clear all internal state.

    ``random_state`` is kept on ``self._init_random_state``; the other
    arguments are stored on attributes of the same name. All remaining
    attributes start as ``None``.

    Note that the default ``base_estimator`` instance is created once, when
    this method is defined.
    """
    super().__init__()

    # Hyper-parameters, kept exactly as passed in.
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self._init_random_state = random_state
    self.sampling_rate = sampling_rate
    self.drift_detection = drift_detection

    # Internal state, unset at construction time.
    self.ensemble = self.actual_n_estimators = self.classes = None
    self._random_state = self.adwin_ensemble = None
def demo():
    """ _test_oza_bagging_adwin

    Runs an OzaBaggingADWINClassifier (two KNNADWINClassifier base estimators)
    on samples drawn from a SEAGenerator.

    The classifier is first trained on a small batch, then each of the
    ``max_samples`` (20000) following samples is predicted before it is
    learned. Progress is logged every 5% and the resulting accuracy is
    printed at the end.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    base_learner = KNNADWINClassifier(n_neighbors=8, max_window_size=2000, leaf_size=30)
    clf = OzaBaggingADWINClassifier(base_estimator=base_learner, n_estimators=2, random_state=1)

    max_samples = 20000
    train_size = 10
    # Float on purpose: the logged percentage is formatted from float arithmetic.
    progress_step = max_samples / 20

    # The target classes only need to be announced on the first partial_fit.
    classes_pending = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        classes_pending = False

    correctly_classified = 0
    for seen in range(max_samples):
        if seen % progress_step == 0:
            logging.info('%s%%', str(seen // progress_step * 5))

        X, y = stream.next_sample()
        my_pred = clf.predict(X)

        if classes_pending:
            clf.partial_fit(X, y, classes=stream.target_values)
            classes_pending = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None and y[0] == my_pred[0]:
            correctly_classified += 1

    sample_count = max_samples
    print(f'{sample_count} samples analyzed.')
    print(f'My performance: {correctly_classified / sample_count}')
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical + KNNADWINClassifier pipeline prequentially.

    Parameters
    ----------
    test_path
        Directory that contains ``data-one-hot.npz`` (arrays ``X`` and ``y``).

    The accuracy, the kappa score and the ``get_info()`` string of the
    pipeline are compared against pinned values.
    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # Read both arrays while the archive is open so the file handle is released.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']
    # ``np.int`` was a plain alias of the builtin ``int`` and has been removed
    # from NumPy, so the builtin is used directly.
    stream = DataStream(data=X, y=y.astype(int))

    # Setup transformer: five consecutive groups of five one-hot columns each
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    # Collapse any whitespace/newlines in get_info() before comparing.
    info = " ".join([line.strip() for line in pipe.get_info().split()])
    assert info == expected_info
def test_pipeline(test_path):
    """Feed one-hot test records through the event-observer pipeline and pin its metrics.

    Parameters
    ----------
    test_path
        Directory that contains ``data-one-hot.npz`` (arrays ``X`` and ``y``).

    Each sample becomes a ``{'X': (1, 25) array, 'y': (1, 1) array}`` record
    that an ``ArrayDataSource`` hands to an ``EvaluationEventObserver``. The
    accuracy and kappa values left in the reporter's buffer are compared
    against pinned values.
    """
    n_categories = 5
    n_features = n_categories * n_categories  # 25 one-hot columns per sample

    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # Indexing an NpzFile re-reads the array from the archive on every access,
    # so both arrays are pulled out once (and the file is closed) up front.
    with np.load(test_file) as data:
        X_all = data['X']
        y_all = data['y']

    data_as_dict = [
        {'X': x_row.reshape(1, n_features), 'y': np.array(y_val).reshape(1, 1)}
        for x_row, y_val in zip(X_all, y_all)
    ]

    # Setup transformer: five consecutive groups of five one-hot columns each
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNADWINClassifier', classifier)])

    train_eval_trigger = PrequentialTrigger(10)
    reporter = BufferedMetricsReporter(retrieve_metrics)
    results_observer = MetricsResultObserver(ClassificationMeasurements(), reporter)
    evaluation_event_observer = EvaluationEventObserver(pipe, train_eval_trigger,
                                                        [results_observer], [0, 1])

    data_source = ArrayDataSource(record_to_dictionary, [evaluation_event_observer], data_as_dict)
    data_source.listen_for_events()
    # NOTE(review): fixed 3 s wait before reading the buffer - presumably the
    # data source delivers events asynchronously; confirm, and prefer an
    # explicit completion signal if the API offers one.
    time.sleep(3)

    expected_accuracy = 0.5555555555555556
    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_accuracy, reporter.get_buffer()['accuracy'])
    assert np.isclose(expected_kappa, reporter.get_buffer()['kappa'])
dstream = SEAGenerator(classification_function=2, balance_classes=True, noise_percentage=0.3, random_state=333) #Retrieve five samples dstream.next_sample(5) # Output: #(array([[3.68721825, 0.48303666, 1.04530188], # [2.45403315, 8.73489354, 0.51611639], # [2.38740114, 2.03699194, 1.74533621], # [9.41738118, 4.66915281, 9.59978205], # [1.05404748, 0.42265956, 2.44130999]]), array([1, 0, 0, 1, 1])) # Instatntiate the KNN ADWIN classifier method adwin_knn_class = KNNADWINClassifier(n_neighbors=10, max_window_size=1000) # Prequential Evaluation evaluate1 = EvaluatePrequential(show_plot=False, pretrain_size=1000, max_samples=10000, metrics=['accuracy']) # Run the evaluation evaluate1.evaluate(stream=dstream, model=adwin_knn_class) ################################################### # Applying SAM-KNN Classifier on the synthetic data stream from skmultiflow.lazy import SAMKNNClassifier from skmultiflow.evaluation import EvaluatePrequential from skmultiflow.data.sea_generator import SEAGenerator
def demo(instances=2000):
    """ _test_comparison_prequential

    Prequential evaluation of two models side by side on the covtype file
    stream: a plain SGDClassifier, and a pipeline that applies
    OneHotToCategorical before a KNNADWINClassifier.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")

    # First model: linear classifier used as-is
    sgd_model = SGDClassifier()

    # Second model: one-hot columns 10-13 and 14-53 are each collapsed into a
    # single categorical attribute before reaching the KNN-ADWIN learner.
    knn_adwin = KNNADWINClassifier(n_neighbors=8, max_window_size=1000, leaf_size=30)
    one_hot_groups = [list(range(10, 14)), list(range(14, 54))]
    to_categorical = OneHotToCategorical(one_hot_groups)
    knn_pipeline = Pipeline([('one_hot_to_categorical', to_categorical),
                             ('KNNClassifier', knn_adwin)])

    # Passing a list of models turns the evaluation into a comparison task
    models = [sgd_model, knn_pipeline]

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='test_comparison_prequential.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True)

    # Evaluate
    evaluator.evaluate(stream=stream, model=models)
# NOTE(review): the help text describes a detector name, but this flag is
# --label_size with a numeric default - looks copy-pasted from another
# argument; confirm the intended wording.
# NOTE(review): no ``type=`` is given, so a value supplied on the command line
# arrives as a string while the default is the float 0.25 - confirm callers
# handle both.
parser.add_argument('-s', '--label_size', required=False, default=0.25, help="Name of Detector {KD3/Adwin/PageHinkley}")
args = parser.parse_args()

# The dataset name from the command line maps to datasets/<name>.csv
test_dataset = args.dataset
print("dataset:" + "datasets/" + test_dataset + '.csv')
stream = FileStream("datasets/" + test_dataset + '.csv')
#print(stream.get_target_values())

# Candidate classifiers, instantiated here for use further down
onlineBoosting = OnlineBoostingClassifier()
knn_adwin = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=1000)
SAMKNN = SAMKNNClassifier(n_neighbors=10, weighting='distance', max_window_size=500, stm_size_option='maxACCApprox', use_ltm=False)
learn_pp_nse = LearnPPNSEClassifier()
SGD = SGDClassifier()
rslvq = RobustSoftLearningVectorQuantization()
#CMMM2 = CMGMMClassifier(classes=stream.get_target_values(), prune_component=True, drift_detector=None)
#CMMM.train(train_dataset, 'label', 'mfcc')
# eval = WeakEvaluatePrequential( show_plot=False,
def test_knn_adwin():
    """Regression test for KNNADWINClassifier on a drifting SEA stream.

    Part 1 trains the learner one sample at a time on 1000 samples; every
    20th sample (except the very first) is predicted *before* being learned.
    The recorded predictions, the number of hits, the state after ``reset()``
    and the ``get_info()`` string are compared against pinned values.

    Part 2 restarts the stream, fits the learner on a batch and checks the
    number of correct predictions on the tail of that batch, plus the return
    types of ``predict`` / ``predict_proba``.
    """
    stream = ConceptDriftStream(
        stream=SEAGenerator(random_state=1),
        drift_stream=SEAGenerator(random_state=2, classification_function=2),
        random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=200)

    max_samples = 1000
    wait_samples = 20
    predictions = array('i')
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: predict before partial_fit)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 1, 0, 1, 1
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    expected_info = 'KNNADWINClassifier(leaf_size=40, max_window_size=200, n_neighbors=8, nominal_attributes=None)'
    # Collapse any whitespace/newlines in get_info() before comparing.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    # NOTE(review): sample 950 is in neither split (training stops at 949,
    # evaluation starts at 951). The pinned count below is tied to this
    # slicing, so it is left untouched.
    predictions = learner.predict(X[951:])
    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def demo():
    """ _test_knn_adwin

    Compares a KNNADWINClassifier against a KNeighborsClassifier on the
    ``sea_big.csv`` file stream.

    Both models are first trained on ``train`` (200) samples. Each of the
    following ``max_samples`` (10000) samples is then predicted by both
    models; only the KNNADWINClassifier keeps learning from them, while the
    KNeighborsClassifier stays as fitted on the initial batch. Progress is
    logged every 5%, and the elapsed time (measured from the start of the
    function, stream setup included) and both accuracies are printed.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_big.csv", -1, 1)

    knn = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')

    # The target classes only need to be announced on the first partial_fit.
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False

    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples / 20) == 0:
            logging.info('%s%%', str((n_samples // (max_samples / 20) * 5)))

        X, y = stream.next_sample()

        # Predict first, then learn from the same sample.
        my_pred = knn.predict(X)
        if first:
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            knn.partial_fit(X, y)

        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))