def test_half_space_trees(test_path):
    """Check HalfSpaceTrees predictions and probabilities on a SEA stream.

    The learner is queried every ``wait_samples`` samples (skipping the very
    first one) before being trained on that same sample. Predictions are
    compared with a hard-coded reference and probabilities with the array
    stored in ``test_half_space_trees.npy`` under ``test_path``.
    """
    stream = SEAGenerator(classification_function=0, noise_percentage=0.1, random_state=1)
    learner = HalfSpaceTrees(n_estimators=13, size_limit=75, anomaly_threshold=0.90,
                             depth=10, random_state=5)

    max_samples = 5000
    wait_samples = 500
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, _ = stream.next_sample()
        # Scale inputs between 0 and 1
        X = X / 10
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        # Training uses the features only; no target is passed.
        learner.partial_fit(X)

    expected_predictions = array('i', [1, 0, 0, 0, 1, 0, 0, 1, 0])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_half_space_trees.npy')
    expected_proba = np.load(test_file)
    assert np.allclose(y_proba, expected_proba)
def test_evaluate_stream_gen_speed():
    """Run EvaluateStreamGenerationSpeed twice and check its reported configuration.

    The first run is bounded by ``n_samples``; after ``set_params`` the second
    run is bounded by ``max_time``. In both cases the returned stream must
    keep the original stream's name.
    """
    sea_stream = SEAGenerator(random_state=1)
    sea_stream.prepare_for_use()
    original_name = sea_stream.name

    evaluator = EvaluateStreamGenerationSpeed(
        n_samples=100000,
        max_time=float("inf"),
        output_file=None,
        batch_size=5,
    )
    sea_stream = evaluator.evaluate(sea_stream)
    assert original_name == sea_stream.name

    initial_info = (
        'EvaluateStreamGenerationSpeed: '
        'n_samples: 100000 - max_time: inf - output_file: None - batch_size: 5'
    )
    assert evaluator.get_info() == initial_info

    updated_params = {
        'n_samples': 500000,
        'max_time': 0.05,
        'output_file': None,
        'batch_size': 1,
    }
    evaluator.set_params(updated_params)

    updated_info = (
        'EvaluateStreamGenerationSpeed: '
        'n_samples: 500000 - max_time: 0.05 - output_file: None - batch_size: 1'
    )
    assert evaluator.get_info() == updated_info

    # Stop evaluation by reaching max_time
    sea_stream = evaluator.evaluate(sea_stream)
    assert original_name == sea_stream.name

    assert evaluator.get_class_type() == 'evaluator'
def run(self):
    """Publish labelled dummy SEA samples to Kafka until ``self.stop_event`` is set.

    An ``extract`` message is sent on the admin topic first, then one labelled
    sample per second on ``testTopic``, and finally a ``flush`` message on the
    admin topic. The producer is closed even if sending raises.
    """
    model_path = b'../modelPersist/online_hoeffding_tree_persist.pkl'
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    try:
        stream = SEAGenerator()

        # Send signal to model server to extract the persisted model from the
        # pickle in the local file.
        producer.send(topic='testTopicAdmin', value=model_path, key=b'extract')
        # NOTE(review): presumably gives the model server time to load the
        # model before data starts flowing — confirm the delay is still needed.
        time.sleep(10)

        while not self.stop_event.is_set():
            dummy_data, dummy_label = stream.next_sample()
            print("Dummy Event Generated:", str(dummy_data))
            # axis=None flattens features and label into one 1-D array.
            dummy_data_and_label = np.concatenate((dummy_data, dummy_label), axis=None)
            producer.send(topic='testTopic',
                          value=dummy_data_and_label.tobytes(),
                          key=b'labeled')
            time.sleep(1)

        # Send signal to model server to persist the model.
        producer.send(topic='testTopicAdmin', value=model_path, key=b'flush')
    finally:
        # Always release the producer's resources, including on error.
        producer.close()
def test_hoeffding_adaptive_tree_mc(test_path):
    """Check HoeffdingAdaptiveTreeClassifier (majority-class leaves) on a drifting SEA stream.

    Predictions and probabilities are sampled every ``wait_samples`` samples
    and compared with stored references. The estimator's info string, model
    description and return types are checked as well, followed by a coverage
    run with a very small ``max_byte_size``.
    """
    stream = ConceptDriftStream(
        stream=SEAGenerator(random_state=1, noise_percentage=0.05),
        drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                  noise_percentage=0.05),
        random_state=1, position=250, width=10)
    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='mc')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, " \
                    "leaf_prediction='mc', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, remove_poor_atts=False, split_confidence=1e-07, " \
                    "split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    # str.split() already discards surrounding whitespace, so joining its
    # pieces collapses any line wrapping in the info string.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 398.0, 1.0: 1000.0}\n'
    assert learner.get_model_description() == expected_model_1

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Coverage run: train a fresh tree with a very small max_byte_size.
    stream.restart()
    X, y = stream.next_sample(5000)
    learner = HoeffdingAdaptiveTreeClassifier(max_byte_size=30, leaf_prediction='mc',
                                              grace_period=10)
    learner.partial_fit(X, y)
def test_oracle_better():
    """Check that DYNSE with the Oracle selector scores higher than with Rank.

    Both configurations are evaluated with a holdout evaluator on the same
    generator; the accuracy is parsed from the summary each evaluation prints.
    """

    def _holdout_accuracy(stream, selector):
        """Evaluate DYNSE with ``selector`` and return the accuracy it prints."""
        evaluator = dyn2selHoldout(n_wait=100, max_samples=1000, test_size=100)
        method = DYNSEMethod(NaiveBayes(), 100, selector)

        with StringIO() as buffer:
            with redirect_stdout(buffer):
                evaluator.evaluate(stream, method)
            out = buffer.getvalue()

        # Fail with a clear message rather than parsing garbage when the
        # summary does not contain the expected line.
        assert "Accuracy" in out, "evaluation summary does not report an accuracy"

        # Keep the text from "Accuracy" on, skip past ": ", cut at end of line.
        acc = out[out.find("Accuracy"):]
        acc = acc[acc.find(":") + 2:]
        acc = acc[:acc.find("\n")]
        return float(acc)

    gen = SEAGenerator(random_state=42)
    gen.prepare_for_use()

    acc_rank = _holdout_accuracy(gen, Rank())
    acc_oracle = _holdout_accuracy(gen, Oracle())

    assert acc_oracle > acc_rank
def test_hoeffding_tree_model_information():
    """Check the measurements and textual description of a trained HoeffdingTreeClassifier."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    # Accessed as an attribute, exactly as in the original test.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    # NOTE(review): leading spaces inside these literals are part of the
    # expected output; confirm they match get_model_description() exactly.
    # Parentheses replace the backslash continuations, including the dangling
    # one that followed the last literal.
    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        " if Attribute 1 <= 5.440182925299016:\n"
        " Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        " if Attribute 1 > 5.440182925299016:\n"
        " Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        " Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )

    assert expected_description == learner.get_model_description()
def test_evaluate_delayed_coverage(tmpdir):
    """Run EvaluatePrequentialDelayed end to end with a summary file, for coverage.

    Nothing is asserted; the test passes when the evaluation completes.
    """
    from skmultiflow.data import SEAGenerator
    from skmultiflow.bayes import NaiveBayes

    n_samples = 1000

    # Draw the whole data set up front and attach random timestamps to it.
    generator = SEAGenerator(random_state=1)
    features, targets = generator.next_sample(n_samples)
    timestamps = generate_random_dates(seed=1, samples=n_samples)

    temporal_stream = TemporalDataStream(features, targets, timestamps, ordered=False)
    classifier = NaiveBayes()

    summary_path = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    evaluator = EvaluatePrequentialDelayed(
        max_samples=n_samples,
        metrics=['running_time', 'model_size'],
        data_points_for_classification=True,
        output_file=summary_path,
    )

    evaluator.evaluate(stream=temporal_stream, model=classifier, model_names=['NB'])
def test_clone():
    """Check that ``clone`` does not carry over a trained model's observed class distribution."""
    stream = SEAGenerator(random_state=1)
    learner = NaiveBayes()

    max_samples = 5000
    for _ in range(max_samples):
        X, y = stream.next_sample()
        learner.partial_fit(X, y, classes=stream.target_values)

    cloned = clone(learner)

    # Two separate assertions so a failure shows which side is wrong.
    assert learner._observed_class_distribution != {}
    assert cloned._observed_class_distribution == {}
def test_knn():
    """Check KNN predictions on a SEA stream, incrementally and after a batch ``fit``.

    The learner is first trained sample by sample and queried every
    ``wait_samples`` samples. It is then reset, fitted on the first 4500
    collected samples and evaluated on a later slice.
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    correct_predictions = 0
    X_batch = []
    y_batch = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    expected_info = 'KNN(leaf_size=40, max_window_size=2000, n_neighbors=8, nominal_attributes=None)'
    assert learner.get_info() == expected_info

    # The reported configuration must be unchanged after a reset.
    learner.reset()
    assert learner.get_info() == expected_info

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    learner.fit(X_batch[:4500], y_batch[:4500], classes=[0, 1])
    predictions = learner.predict(X_batch[4501:4550])

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0
    ])
    assert np.all(predictions == expected_predictions)

    correct_predictions = sum(predictions == y_batch[4501:4550])
    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_knn_adwin():
    """Check KNNADWINClassifier on a drifting SEA stream, incrementally and after ``fit``."""
    stream = ConceptDriftStream(
        stream=SEAGenerator(random_state=1),
        drift_stream=SEAGenerator(random_state=2, classification_function=2),
        random_state=1, position=250, width=10)

    learner = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=200)

    max_samples = 1000
    wait_samples = 20
    predictions = array('i')
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.data_window.size == 0

    expected_info = "KNNADWINClassifier(leaf_size=40, max_window_size=200, " \
                    "metric='euclidean', n_neighbors=8)"
    # Collapse any line wrapping in the info string before comparing.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_perceptron(test_path):
    """Check PerceptronMask predictions, probabilities and info string on a SEA stream.

    Probabilities are compared with ``data_perceptron_proba.npy`` under
    ``test_path``. The final section re-trains the reset learner in batch
    mode purely for coverage.
    """
    stream = SEAGenerator(random_state=1)

    learner = PerceptronMask(random_state=1)

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = "PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, " \
                    "eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5, " \
                    "n_jobs=None, penalty=None, random_state=1, shuffle=True, tol=0.001, " \
                    "validation_fraction=0.1, verbose=0, warm_start=False)"
    # Collapse any line wrapping in the info string before comparing.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    # Coverage tests
    learner.reset()
    if sklearn_version.startswith("0.21"):
        # Root cause of failure (TypeError: an integer is required) is in the fit() method
        # in sklearn 0.21.0. This is a workaround until a fix is made available in sklearn
        learner.partial_fit(X=np.asarray(X_batch[:4500]), y=np.asarray(y_batch[:4500]),
                            classes=stream.target_values)
    else:
        learner.fit(X=np.asarray(X_batch[:4500]), y=np.asarray(y_batch[:4500], dtype=int))
    learner.predict(X=X_batch[4501:])  # Run for coverage

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_standardize():
    """Check WeightedKNNClassifier with ``standardize=True``.

    The first part feeds three hand-written samples and checks the running
    mean and standard deviation exposed by the learner. The second part
    checks predictions on a SEA stream against a hard-coded reference.
    """
    learner = WeightedKNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
                                    standardize=True)

    # Test for implementing moving average
    example_features = np.array(
        [np.array([[1, 2]]), np.array([[2, 4]]), np.array([[3, 9]])])
    example_targets = np.array([[1], [1], [1]])

    for features, target in zip(example_features, example_targets):
        learner.partial_fit(features, target)

    moving_average = learner.get_mean
    assert type(moving_average) is np.ndarray
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(moving_average == np.array([[2, 5]]))

    moving_sd = learner.get_sd
    assert type(moving_sd) is np.ndarray
    # Only the integer part of the standard deviation is compared.
    assert np.all(moving_sd.astype(int) == np.array([[0, 2]]))

    stream = SEAGenerator(random_state=1)
    learner = WeightedKNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
                                    standardize=True)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1
    ])
    assert np.all(predictions == expected_predictions)
def test_hoeffding_tree_coverage():
    """Exercise HoeffdingTree memory management and nominal-attribute paths.

    Nothing is asserted; the test passes when both training runs complete.
    """
    # Cover memory management
    sea = SEAGenerator(random_state=1, noise_percentage=0.05)
    sea.prepare_for_use()
    features, labels = sea.next_sample(5000)

    small_tree = HoeffdingTree(
        max_byte_size=30,
        memory_estimate_period=100,
        grace_period=10,
        leaf_prediction='mc',
    )
    small_tree.partial_fit(features, labels, classes=sea.target_values)
    small_tree.reset()

    # Cover nominal attribute observer
    categorical = RandomTreeGenerator(
        tree_random_state=1,
        sample_random_state=1,
        n_num_features=0,
        n_categories_per_cat_feature=2,
    )
    categorical.prepare_for_use()
    features, labels = categorical.next_sample(1000)

    nominal_tree = HoeffdingTree(leaf_prediction='mc', nominal_attributes=list(range(10)))
    nominal_tree.partial_fit(features, labels, classes=categorical.target_values)
def test_extremely_fast_decision_tree_coverage():
    """Exercise ExtremelyFastDecisionTreeClassifier memory management and nominal attributes.

    Nothing is asserted; the test passes when both training runs complete.
    """
    # Cover memory management
    sea = SEAGenerator(random_state=1, noise_percentage=0.05)
    sea.prepare_for_use()
    features, labels = sea.next_sample(5000)

    small_tree = ExtremelyFastDecisionTreeClassifier(
        max_byte_size=30,
        memory_estimate_period=100,
        grace_period=10,
        leaf_prediction='nba',
    )
    small_tree.partial_fit(features, labels, classes=sea.target_values)
    small_tree.reset()

    # Cover nominal attribute observer
    categorical = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_categories_per_cat_feature=4,
        n_num_features=1,
        max_tree_depth=30,
        min_leaf_depth=10,
        fraction_leaves_per_level=0.45,
    )
    categorical.prepare_for_use()
    features, labels = categorical.next_sample(5000)

    nominal_tree = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=list(range(1, 9)))
    nominal_tree.partial_fit(features, labels, classes=categorical.target_values)
def test_knn_adwin():
    """Check KNNAdwin on a drifting SEA stream, incrementally and after ``fit``."""
    stream = ConceptDriftStream(
        stream=SEAGenerator(random_state=1),
        drift_stream=SEAGenerator(random_state=2, classification_function=2),
        random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=200)

    max_samples = 1000
    wait_samples = 20
    predictions = array('i')
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    # Compare with whitespace collapsed so the check does not depend on how
    # the info string is wrapped across lines.
    expected_info = 'KNNAdwin(leaf_size=40, max_window_size=200, n_neighbors=8, ' \
                    'nominal_attributes=None)'
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_ensemble_size():
    """Check that MDEMethod builds one ensemble member per complete chunk.

    A member is initialised each time the number of seen instances reaches the
    chunk size, so the ensemble should hold ``n_samples // chunk_size`` members.
    """
    chunk_size = 100
    n_samples = 1050
    expected_members = n_samples // chunk_size

    stream = SEAGenerator(balance_classes=True)
    method = MDEMethod(NaiveBayes(), chunk_size, KNORAE(), alpha=0.0)

    features, labels = stream.next_sample(n_samples)
    method.partial_fit(features, labels)

    assert len(method.ensemble) == expected_members
def test_ensemble_size():
    """Check that DYNSEMethod builds one ensemble member per complete chunk.

    A member is initialised each time the number of seen instances reaches the
    chunk size, so the ensemble should hold ``n_samples // chunk_size`` members.
    """
    chunk_size = 100
    n_samples = 1050
    expected_members = n_samples // chunk_size

    stream = SEAGenerator()
    method = DYNSEMethod(NaiveBayes(), chunk_size, ModifiedRank())

    features, labels = stream.next_sample(n_samples)
    method.partial_fit(features, labels)

    assert len(method.ensemble) == expected_members
def test_accuracy():
    """Check that DESDD over an Adaptive Random Forest scores above 80% on SEA data.

    The method is trained on 200 noise-free instances and scored on the next
    200 instances from the same generator.
    """
    # NOTE(review): neither the generator nor the forest is given a
    # random_state here; confirm the threshold holds across runs.
    train_count = 200
    test_count = 200

    stream = SEAGenerator(noise_percentage=0.0)
    forest = AdaptiveRandomForest()
    method = DESDDMethod(forest)

    X_train, y_train = stream.next_sample(train_count)
    X_test, y_test = stream.next_sample(test_count)
    method.partial_fit(X_train, y_train)

    score = method.score(X_test, y_test)
    assert score > 0.80
def test_perceptron(test_path):
    """Check PerceptronMask predictions, probabilities and info string on a SEA stream.

    Probabilities are compared with ``data_perceptron_proba.npy`` under
    ``test_path``. The final section re-trains the reset learner in batch
    mode for coverage.
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = PerceptronMask(random_state=1)

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'PerceptronMask: - penalty: None - alpha: 0.0001 - fit_intercept: True - max_iter: 1000 ' \
                    '- tol: 0.001 - shuffle: True - eta0: 1.0 - warm_start: False - class_weight: None - n_jobs: 1'
    assert learner.get_info() == expected_info

    # Coverage tests
    learner.reset()
    learner.fit(X=X_batch[:4500], y=y_batch[:4500])
    y_pred = learner.predict(X=X_batch[4501:])
    # The score is computed for coverage only. Comparing it with the reference
    # value 0.8897795591182365 was removed due to a non-replicable error in
    # the Travis build.
    accuracy_score(y_true=y_batch[4501:], y_pred=y_pred)

    assert 'estimator' == learner.get_class_type()

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_equality_multiflow():
    """Check that the dyn2sel and multiflow prequential evaluators train identical models.

    The first returned model of each evaluation is compared through its
    instance ``__dict__``, ignoring the ``_attribute_observers`` entry.
    """
    stream = SEAGenerator(random_state=42)
    stream.prepare_for_use()

    reference_evaluator = mtflowPrequential(
        max_samples=1000, pretrain_size=0, restart_stream=True
    )
    candidate_evaluator = dyn2selPrequential(max_samples=1000, pretrain_size=0)

    reference_state = reference_evaluator.evaluate(stream, NaiveBayes())[0].__dict__
    candidate_state = candidate_evaluator.evaluate(stream, NaiveBayes())[0].__dict__

    # Drop the attribute observers from both sides before comparing.
    for state in (reference_state, candidate_state):
        del state["_attribute_observers"]

    assert reference_state == candidate_state
def test_naive_bayes(test_path):
    """Check NaiveBayes predictions, probabilities, info string and batch score on SEA data.

    Probabilities are compared with ``data_naive_bayes_proba.npy`` under
    ``test_path``. After a reset the learner is fitted on the first 4500
    collected samples and scored on the samples from index 4501 onwards.
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    expected_score = 0.9378757515030061
    observed_score = learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:]))
    assert np.isclose(expected_score, observed_score)

    assert 'estimator' == learner.get_class_type()

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_accuracy():
    """Check that MDE over Naive Bayes scores above 85% on SEA data.

    The method is trained on 1050 noise-free instances in chunks of 100 and
    scored on the next 200 instances from the same generator.
    """
    # NOTE(review): the generator is created without a random_state; confirm
    # the threshold holds across runs.
    chunk_size = 100
    train_count = 1050
    test_count = 200

    stream = SEAGenerator(noise_percentage=0.0)
    base_learner = NaiveBayes()
    method = MDEMethod(base_learner, chunk_size, KNORAU())

    X_train, y_train = stream.next_sample(train_count)
    X_test, y_test = stream.next_sample(test_count)
    method.partial_fit(X_train, y_train)

    score = method.score(X_test, y_test)
    assert score > 0.85
def test_online_csb2():
    """Check OnlineCSB2Classifier predictions, accuracy and info string on a SEA stream."""
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineCSB2Classifier(base_estimator=nb, n_estimators=3, cost_positive=1,
                                   cost_negative=0.9, random_state=112)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if cnt == 0:
            # The class labels are supplied on the first call only.
            learner.partial_fit(X, y, classes=stream.target_values)
        else:
            learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1
    ]
    expected_correct_predictions = 43
    expected_performance = 0.8775510204081632

    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    expected_info = "OnlineCSB2Classifier(base_estimator=NaiveBayes(nominal_attributes=None), cost_negative=0.9, " \
                    "cost_positive=1, drift_detection=True, n_estimators=3, random_state=112)"
    # Collapse any line wrapping in the info string before comparing.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_online_rus_1():
    """Check OnlineRUSBoost (algorithm 1) predictions, accuracy and info string on a SEA stream."""
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineRUSBoost(base_estimator=nb, n_estimators=3, sampling_rate=5,
                             algorithm=1, random_state=112)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if cnt == 0:
            # The class labels are supplied on the first call only.
            learner.partial_fit(X, y, classes=stream.target_values)
        else:
            learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1
    ]
    expected_correct_predictions = 33
    expected_performance = 0.673469387755102

    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Compare with whitespace collapsed so the check does not depend on how
    # the info string is wrapped across lines.
    expected_info = "OnlineRUSBoost(algorithm=1, base_estimator=NaiveBayes(nominal_attributes=None), " \
                    "drift_detection=True, n_estimators=3, random_state=112, " \
                    "sampling_rate=5)"
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_leverage_bagging():
    """Check LeverageBaggingClassifier over KNN: predictions, accuracy and info string."""
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = LeverageBaggingClassifier(base_estimator=knn, n_estimators=3, random_state=112)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if cnt == 0:
            # The class labels are supplied on the first call only.
            learner.partial_fit(X, y, classes=stream.target_values)
        else:
            learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1
    ]
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)

    expected_performance = 0.8571428571428571
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 42
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    expected_info = "LeverageBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, max_window_size=2000, " \
                    "n_neighbors=8, nominal_attributes=None), delta=0.002, enable_code_matrix=False, " \
                    "leverage_algorithm='leveraging_bag', n_estimators=3, random_state=112, w=6)"
    # Collapse any line wrapping in the info string before comparing.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def demo():
    """Demonstrate LeverageBaggingClassifier on samples drawn from a SEA generator.

    The classifier is pre-trained on ``train_size`` samples, then run for
    ``max_samples`` (2000) predict-then-train iterations. Progress is logged
    in 5% steps, and the number of iterations, the fraction of correct
    predictions and the classifier's info string are printed at the end.
    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    base_knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=30)
    clf = LeverageBaggingClassifier(base_estimator=base_knn, n_estimators=1, random_state=1)

    max_samples = 2000
    train_size = 200
    progress_step = max_samples / 20
    correctly_classified = 0

    # The class labels are only passed on the first partial_fit call.
    needs_classes = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        needs_classes = False

    logging.info('%s%%', 0.0)
    for sample_count in range(max_samples):
        if (sample_count + 1) % progress_step == 0:
            percent_done = (sample_count // progress_step + 1) * 5
            logging.info('%s%%', str(percent_done))

        # Two samples are drawn per iteration; both are used for training,
        # but only the first one is scored below.
        X, y = stream.next_sample(2)
        my_pred = clf.predict(X)

        if needs_classes:
            clf.partial_fit(X, y, classes=stream.target_values)
            needs_classes = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None and y[0] == my_pred[0]:
            correctly_classified += 1

    sample_count = max_samples

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
    print(clf.get_info())
def test_pretrain_size():
    """Run prequential evaluation with a pretrain size for two DYNSE selectors.

    Nothing is asserted; the test passes when both evaluations complete.
    """
    stream = SEAGenerator(random_state=42)
    stream.prepare_for_use()

    # A fresh evaluator and method are built for each selector; the stream
    # object is shared between the two runs.
    for selector_cls in (Rank, Oracle):
        evaluator = dyn2selPrequential(
            n_wait=100, max_samples=1000, pretrain_size=150
        )
        method = DYNSEMethod(NaiveBayes(), 100, selector_cls())
        evaluator.evaluate(stream, method)
def test_hat_nb(test_path):
    """Check HAT with naive-Bayes leaves on a drifting SEA stream.

    Predictions and probabilities are sampled every ``wait_samples`` samples
    and compared with stored references; the info string and return types are
    checked as well.
    """
    stream = ConceptDriftStream(
        stream=SEAGenerator(random_state=1, noise_percentage=0.05),
        drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                  noise_percentage=0.05),
        random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nb')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1
    ])
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    # Compare with whitespace collapsed so the check does not depend on how
    # the info string is wrapped across lines.
    expected_info = "HAT(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', " \
                    "stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_oza_bagging():
    """Check OzaBaggingClassifier over KNN: predictions, accuracy and info string."""
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingClassifier(base_estimator=knn, n_estimators=3, random_state=112)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if cnt == 0:
            # The class labels are supplied on the first call only.
            learner.partial_fit(X, y, classes=stream.target_values)
        else:
            learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1
    ]
    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    expected_info = "OzaBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, " \
                    "max_window_size=2000, metric='euclidean', n_neighbors=8), " \
                    "n_estimators=3, random_state=112)"
    # Collapse any line wrapping in the info string before comparing.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_additive_expert_ensemble_weakest():
    """Check AdditiveExpertEnsemble with ``pruning='weakest'`` on a SEA stream."""
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    learner = AdditiveExpertEnsemble(3, NaiveBayes(), beta=0.5, gamma=0.1, pruning='weakest')

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if cnt == 0:
            # The class labels are supplied on the first call only.
            learner.partial_fit(X, y, classes=stream.target_values)
        else:
            learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1
    ]
    expected_correct_predictions = 45
    expected_performance = 0.9183673469387755

    # np.all replaces np.alltrue, which no longer exists in NumPy 2.x.
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)

    # Compare with whitespace collapsed so the check does not depend on how
    # the info string is wrapped across lines.
    expected_info = "AdditiveExpertEnsemble(base_estimator=NaiveBayes(nominal_attributes=None), " \
                    "beta=0.5, gamma=0.1, n_estimators=3, pruning='weakest')"
    info = " ".join(learner.get_info().split())
    assert info == expected_info