def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequentially evaluate a Hoeffding tree and compare the written summary file.

    The evaluator writes its summary into ``tmpdir``; that file is compared
    against the reference ``prequential_summary.csv`` stored under ``test_path``.
    """
    # Stream
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=4,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['kappa', 'kappa_t', 'performance'],
        output_file=output_file,
    )

    # Run the evaluation; the first returned entry is the trained model
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]
    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    # The generated summary must match the stored reference
    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)
def test_evaluate_classification_coverage(tmpdir):
    """Coverage test: run a prequential evaluation with every classification metric.

    Only the current accuracy is asserted here; tests for the individual
    metrics are placed in the corresponding test module.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Evaluator requesting the full set of metrics
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    metrics = [
        'accuracy', 'kappa', 'kappa_t', 'kappa_m',
        'f1', 'precision', 'recall', 'gmean',
        'true_vs_predicted',
    ]
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=metrics,
        output_file=output_file,
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    _, current_performance = evaluator.get_measurements(model_idx=0)

    expected_current_accuracy = 0.685
    measured_accuracy = current_performance.accuracy_score()
    assert np.isclose(measured_accuracy, expected_current_accuracy)
def test_kdd_tree_mixed():
    """Build a KDTree with the 'mixed' metric on categorical data and query it.

    1000 samples are indexed and the 4 nearest neighbours of 10 further
    samples are compared (indices and distances) against stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
                                 n_num_features=0)
    stream.prepare_for_use()

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree: the first 25 columns are declared categorical
    cat_features = list(range(25))
    kdtree = KDTree(X, metric='mixed', return_distance=True,
                    categorical_list=cat_features)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[123, 234, 707, 654],
                    [688, 429, 216, 627],
                    [463, 970, 566, 399],
                    [18, 895, 640, 996],
                    [396, 612, 897, 232],
                    [328, 54, 138, 569],
                    [253, 501, 82, 273],
                    [38, 146, 752, 923],
                    [946, 808, 271, 363],
                    [951, 111, 708, 5]]
    expected_dist = [[2, 2, 2, 2],
                     [2, 2, 2, 2],
                     [2, 2, 2, 2],
                     [2, 2, 2, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 2],
                     [2, 2, 0, 0],
                     [2, 2, 2, 0],
                     [2, 2, 2, 2]]

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(idx == expected_idx)
    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree: - leaf_size: 40 - metric: mixed - return_distance: True'
    assert kdtree.get_info() == expected_info

    assert kdtree.get_class_type() == 'data_structure'
def test_extremely_fast_decision_tree_nba(test_path):
    """Regression test for ExtremelyFastDecisionTreeClassifier with default settings.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded), and
    compares predictions, probabilities, ``get_info()`` and the model
    description with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4, n_num_features=1,
                                 max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = ExtremelyFastDecisionTreeClassifier(
        nominal_attributes=list(range(1, 9)))

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0
    ])
    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)

    # Only the first 49 rows of the stored probabilities are compared
    test_file = os.path.join(test_path, 'test_hoeffding_anytime_tree.npy')
    expected_proba = np.load(test_file)[:49, :]
    assert np.allclose(proba_predictions, expected_proba)

    expected_info = (
        "ExtremelyFastDecisionTreeClassifier(binary_split=False, grace_period=200, "
        "leaf_prediction='nba', max_byte_size=33554432, memory_estimate_period=1000000, "
        "min_samples_reevaluate=20, nb_threshold=0, nominal_attributes=[1, 2, 3, 4, 5, 6, 7, 8], "
        "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, "
        "tie_threshold=0.05)"
    )
    # Normalise all whitespace runs to single spaces before comparing
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model = 'ifAttribute1=0.0:ifAttribute3=0.0:Leaf=Class1|{0:260.0,1:287.0}' \
                     'ifAttribute3=1.0:Leaf=Class0|{0:163.0,1:117.0}ifAttribute1=1.0:Leaf=Class0|{0:718.0,1:495.0}'
    # Compare the descriptions with newlines and spaces removed
    description = learner.get_model_description().replace("\n", " ").replace(" ", "")
    assert description == expected_model.replace(" ", "")

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_HAT(test_path):
    """Regression test for the HAT learner.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded), and
    compares predictions, probabilities, ``get_info()`` and the model
    description with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HAT(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('d')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        2.0, 1.0, 1.0, 1.0, 0.0, 3.0, 0.0, 1.0, 1.0, 2.0,
        0.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 0.0, 1.0, 1.0,
        1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0, 2.0,
        1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0,
        0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree.npy')
    data = np.load(test_file)

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0' \
                    ' - nominal_attributes: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    # The textual description varies in key order and float formatting,
    # so any one of the stored variants is accepted.
    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1367.3628584299263, 1.0: 1702.2738590243584,' \
                       ' 2.0: 952.1668539501372, 3.0: 822.1964285955778}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1702.2738590243584, 2.0: 952.1668539501372,' \
                       ' 0.0: 1367.3628584299263, 3.0: 822.1964285955778}\n'
    expected_model_3 = 'Leaf = Class 1.0 | {1.0: 1702.2738590243584, 2.0: 952.16685395013724, ' \
                       '0.0: 1367.3628584299263, 3.0: 822.1964285955778}\n'  # Python 3.6
    expected_model_4 = 'Leaf = Class 1.0 | {0.0: 1367.3628584299263, 1.0: 1702.2738590243584,' \
                       ' 2.0: 952.16685395013724, 3.0: 822.1964285955778}\n'  # Python 3.4
    accepted_models = (expected_model_1, expected_model_2,
                       expected_model_3, expected_model_4)
    assert learner.get_model_description() in accepted_models
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequentially evaluate a Hoeffding tree and verify file, metrics and info.

    Checks the summary file written into ``tmpdir`` against the reference under
    ``test_path``, then the mean/current accuracy, kappa and kappa_t values,
    and finally the evaluator's ``get_info()`` text.
    """
    # Stream
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=4,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['accuracy', 'kappa', 'kappa_t'],
        output_file=output_file,
    )

    # Run the evaluation; the first returned entry is the trained model
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]
    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Reference values
    expected_mean_accuracy = 0.436250
    expected_mean_kappa = 0.231791
    expected_mean_kappa_t = 0.236887
    expected_current_accuracy = 0.430000
    expected_current_kappa = 0.223909
    expected_current_kappa_t = 0.240000

    # Metrics over the whole run
    assert np.isclose(mean_performance.get_accuracy(), expected_mean_accuracy)
    assert np.isclose(mean_performance.get_kappa(), expected_mean_kappa)
    assert np.isclose(mean_performance.get_kappa_t(), expected_mean_kappa_t)

    # Metrics for the current window
    assert np.isclose(current_performance.get_accuracy(), expected_current_accuracy)
    assert np.isclose(current_performance.get_kappa(), expected_current_kappa)
    assert np.isclose(current_performance.get_kappa_t(), expected_current_kappa_t)

    # NOTE(review): the leading whitespace of the continuation lines inside this
    # literal looks collapsed; confirm against the actual get_info() output.
    expected_info = (
        "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n"
        " max_samples=1000, max_time=inf,\n"
        " metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n"
        " output_file='prequential_summary.csv',\n"
        " pretrain_size=200, restart_stream=True, show_plot=False)"
    )
    assert evaluator.get_info() == expected_info
def test_hoeffding_tree(test_path):
    """Regression test for HoeffdingTree with default settings.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded), and
    compares predictions, probabilities, ``get_info()`` and the model
    description with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [5, 6, 7,' \
                    ' 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    # Either key ordering of the class distribution is accepted
    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1745.0, 2.0: 978.0, 0.0: 1423.0, 3.0: 854.0}\n'
    assert learner.get_model_description() in (expected_model_1, expected_model_2)

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    """Holdout-evaluate a Hoeffding tree and verify the summary file and metrics.

    Checks the summary file written into ``tmpdir`` against the reference under
    ``test_path``, then the mean/current accuracy, kappa and kappa_t values.
    """
    # Stream
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=4,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Evaluator
    output_file = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(
        n_wait=200,
        max_samples=1000,
        test_size=50,
        metrics=['accuracy', 'kappa', 'kappa_t'],
        output_file=output_file,
    )

    # Run the evaluation; the first returned entry is the trained model
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]
    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'holdout_summary.csv')
    compare_files(output_file, expected_file)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Metrics over the whole run
    assert np.isclose(mean_performance.get_accuracy(), 0.344000)
    assert np.isclose(mean_performance.get_kappa(), 0.135021)
    assert np.isclose(mean_performance.get_kappa_t(), 0.180000)

    # Metrics for the current window
    assert np.isclose(current_performance.get_accuracy(), 0.360000)
    assert np.isclose(current_performance.get_kappa(), 0.152542)
    assert np.isclose(current_performance.get_kappa_t(), 0.200000)
def test_adaptive_random_forests_nb():
    """Regression test for AdaptiveRandomForest with ``leaf_prediction='nb'``.

    Pre-trains on 150 samples, then trains sample by sample on 5000 more,
    recording a prediction before training on every 100th sample (the very
    first excluded). Predictions and ``get_info()`` are compared with stored
    values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112,
                                   leaf_prediction='nb')

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 0, 0, 1
    ]

    # Performance below does not need to be guaranteed. This check is set up so
    # that any change to the predictions is caught by the unit test, which
    # helps prevent accidental changes.
    assert isinstance(learner.predict(X), np.ndarray)
    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == last_version_predictions)

    # NOTE(review): the leading whitespace of the continuation lines inside this
    # literal looks collapsed; confirm against the actual get_info() output.
    expected_info = "AdaptiveRandomForest(binary_split=False, disable_weighted_vote=False,\n" \
                    " drift_detection_method=ADWIN(delta=0.001), grace_period=50,\n" \
                    " lambda_value=6, leaf_prediction='nb',\n" \
                    " max_byte_size=33554432, max_features=5,\n" \
                    " memory_estimate_period=2000000, n_estimators=3,\n" \
                    " nb_threshold=0, no_preprune=False, nominal_attributes=None,\n" \
                    " performance_metric='acc', random_state=112,\n" \
                    " remove_poor_atts=False, split_confidence=0.01,\n" \
                    " split_criterion='info_gain', stop_mem_management=False,\n" \
                    " tie_threshold=0.05,\n" \
                    " warning_detection_method=ADWIN(delta=0.01))"
    assert learner.get_info() == expected_info
def test_hoeffding_anytime_tree(test_path):
    """Regression test for the HATT learner.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded), and
    compares predictions, probabilities, ``get_info()`` and the model
    description with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4, n_num_features=1,
                                 max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = HATT(nominal_attributes=list(range(1, 9)))

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0
    ])
    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)

    # Only the first 49 rows of the stored probabilities are compared
    test_file = os.path.join(test_path, 'test_hoeffding_anytime_tree.npy')
    expected_proba = np.load(test_file)[:49, :]
    assert np.allclose(proba_predictions, expected_proba)

    expected_info = 'HATT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 - ' \
                    'min_samples_reevaluate: 20 - split_criterion: info_gain - split_confidence: 1e-07 - ' \
                    'tie_threshold: 0.05 - binary_split: False - stop_mem_management: False - leaf_prediction: ' \
                    'nba - nb_threshold: 0 - nominal_attributes: [1, 2, 3, 4, 5, 6, 7, 8] - '
    assert learner.get_info() == expected_info

    expected_model = 'ifAttribute1=0:ifAttribute3=0:Leaf=Class1|{0:260.0,1:287.0}' \
                     'ifAttribute3=1:Leaf=Class0|{0:163.0,1:117.0}ifAttribute1=1:Leaf=Class0|{0:718.0,1:495.0}'
    # Compare the descriptions with newlines and spaces removed
    description = learner.get_model_description().replace("\n", " ").replace(" ", "")
    assert description == expected_model.replace(" ", "")

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def __init__(self, concept_id=0, seed=None, noise=0, desc=None):
    """Create a concept backed by a two-class RandomTreeGenerator stream.

    Parameters
    ----------
    concept_id
        Stored on the instance as ``cf``.
    seed
        Used for both the tree and the sample random state of the generator.
    noise
        Accepted but not used by this constructor.
    desc
        Optional object exposing a ``difficulty`` attribute. When omitted the
        difficulty defaults to 0.
    """
    self.cf = concept_id
    self.seed = seed
    # Identity check: a custom __eq__ on desc must not affect the None test
    self.difficulty = 0 if desc is None else desc.difficulty

    # Tree depth bounds are derived from the difficulty
    stream = RandomTreeGenerator(tree_random_state=seed,
                                 sample_random_state=seed,
                                 max_tree_depth=self.difficulty + 2,
                                 min_leaf_depth=self.difficulty,
                                 n_classes=2)
    stream.prepare_for_use()
    super().__init__(stream)
def test_batch_incremental():
    """Regression test for BatchIncremental wrapping a DecisionTreeClassifier.

    Pre-trains on 150 samples, then trains sample by sample on 5000 more,
    recording a prediction before training on every 100th sample (the very
    first excluded). Predictions, hit count, accuracy and the whitespace-
    normalised ``get_info()`` text are compared with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()

    estimator = DecisionTreeClassifier(random_state=112)
    learner = BatchIncremental(base_estimator=estimator, n_estimators=10)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1
        learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)

    expected_predictions = [1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,
                            0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,
                            0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
    expected_correct_predictions = 31
    expected_performance = 0.6326530612244898

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)

    expected_info = "BatchIncremental(base_estimator=DecisionTreeClassifier(class_weight=None, " \
                    "criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, " \
                    "min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, " \
                    "min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=112, " \
                    "splitter='best'), n_estimators=10, window_size=100)"
    # Normalise all whitespace runs to single spaces before comparing
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_learn_pp():
    """Regression test for LearnPPClassifier wrapping a DecisionTreeClassifier.

    Pre-trains on one batch of 200 samples, then runs 10 test-then-train rounds
    of 200 samples each. The first prediction of each round, the number of
    correct predictions, the accuracy and the whitespace-normalised
    ``get_info()`` text are compared with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=2212, sample_random_state=2212)
    stream.prepare_for_use()

    estimator = DecisionTreeClassifier(random_state=2212)
    classifier = LearnPPClassifier(base_estimator=estimator, n_estimators=5,
                                   n_ensembles=5, random_state=2212)

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre training the classifier with 200 samples
    X, y = stream.next_sample(m)
    classifier.partial_fit(X, y, classes=stream.target_values)

    predictions = []
    for _ in range(10):
        X, y = stream.next_sample(200)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        # Every round counts towards the total, whether or not it was scored
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1138
    expected_acc = 0.569
    expected_predictions = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0]

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    assert isinstance(classifier.predict(X), np.ndarray)

    expected_info = "LearnPPClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, " \
                    "criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, " \
                    "min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, " \
                    "min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, " \
                    "random_state=2212, splitter='best'), error_threshold=0.5, n_ensembles=5, " \
                    "n_estimators=5, random_state=2212, window_size=100)"
    # Normalise all whitespace runs to single spaces before comparing
    info = " ".join(classifier.get_info().split())
    assert info == expected_info
def test_hoeffding_tree_nb(test_path):
    """Regression test for HoeffdingTree with ``leaf_prediction='nb'``.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded).
    Predictions and ``get_info()`` are compared with stored values;
    ``predict_proba`` is exercised but its output is not asserted.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx,
                            leaf_prediction='nb')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2
    ])
    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)

    # NOTE(review): the leading whitespace of the continuation lines inside this
    # literal looks collapsed; confirm against the actual get_info() output.
    expected_info = "HoeffdingTree(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    " max_byte_size=33554432, memory_estimate_period=1000000,\n" \
                    " nb_threshold=0, no_preprune=False,\n" \
                    " nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14],\n" \
                    " remove_poor_atts=False, split_confidence=1e-07,\n" \
                    " split_criterion='info_gain', stop_mem_management=False,\n" \
                    " tie_threshold=0.05)"
    assert learner.get_info() == expected_info
def test_extremely_fast_decision_tree_nb_gini(test_path):
    """Regression test for ExtremelyFastDecisionTreeClassifier with 'nb' leaves and gini splits.

    Trains sample by sample on 5000 generated samples, recording a prediction
    before training on every 100th sample (the very first sample excluded).
    Predictions and the whitespace-normalised ``get_info()`` text are compared
    with stored values; ``predict_proba`` is exercised but not asserted.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4, n_num_features=1,
                                 max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = ExtremelyFastDecisionTreeClassifier(
        nominal_attributes=list(range(1, 9)),
        leaf_prediction='nb',
        split_criterion='gini')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0
    ])
    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)

    expected_info = "ExtremelyFastDecisionTreeClassifier(binary_split=False, grace_period=200, " \
                    "leaf_prediction='nb', max_byte_size=33554432, memory_estimate_period=1000000, " \
                    "min_samples_reevaluate=20, nb_threshold=0, nominal_attributes=[1, 2, 3, 4, 5, 6, 7, 8], " \
                    "split_confidence=1e-07, split_criterion='gini', stop_mem_management=False, tie_threshold=0.05)"
    # Normalise all whitespace runs to single spaces before comparing
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_batch_incremental():
    """Regression test for BatchIncremental used through a single-step Pipeline.

    Pre-trains on 150 samples, then trains sample by sample on 5000 more,
    recording a prediction before training on every 100th sample (the very
    first excluded). Predictions, hit count and accuracy are compared with
    stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()

    estimator = DecisionTreeClassifier(random_state=112)
    classifier = BatchIncremental(base_estimator=estimator, n_estimators=10)
    learner = Pipeline([('classifier', classifier)])

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1
        learner.partial_fit(X, y)

    performance = correct_predictions / len(predictions)

    expected_predictions = [1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,
                            0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,
                            0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
    expected_correct_predictions = 31
    expected_performance = 0.6326530612244898

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
def test_adaptive_random_forests_labels_given():
    """Regression test for AdaptiveRandomForest when class labels are passed up front.

    Pre-trains on 150 samples with ``classes=[0, 1]``, then trains sample by
    sample on 5000 more, recording ``predict_proba`` before training on every
    100th sample (the very first excluded). Checks that each probability row
    sums to 1, that 49 two-class rows were collected, and that the arg-max
    labels match stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all([
        np.isclose(y_proba.sum(), 1) for y_proba in predictions
    ]), "Probabilities should sum to 1."

    class_probabilities = np.asarray(predictions).squeeze()
    assert class_probabilities.shape == (49, 2)

    predictions = class_probabilities.argmax(axis=1)
    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 0, 0, 0
    ]
    assert np.all(predictions == last_version_predictions)
def test_evaluate_classification_metrics():
    """Prequentially evaluate a Hoeffding tree and verify f1, precision, recall and g-mean.

    Both the mean and the current values of each metric are printed and then
    compared with stored values.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Evaluator
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['f1', 'precision', 'recall', 'gmean'],
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Reference values
    expected_current_f1_score = 0.7096774193548387
    expected_current_precision = 0.6814159292035398
    expected_current_recall = 0.7403846153846154
    expected_current_g_mean = 0.6802502367624613
    expected_mean_f1_score = 0.7009803921568628
    expected_mean_precision = 0.7185929648241206
    expected_mean_recall = 0.6842105263157895
    expected_mean_g_mean = 0.6954166367760247

    # Print the measured values: mean first, then current
    for performance in (mean_performance, current_performance):
        print(performance.get_g_mean())
        print(performance.get_recall())
        print(performance.get_precision())
        print(performance.get_f1_score())

    # Current-window metrics
    assert np.isclose(current_performance.get_f1_score(), expected_current_f1_score)
    assert np.isclose(current_performance.get_precision(), expected_current_precision)
    assert np.isclose(current_performance.get_recall(), expected_current_recall)
    assert np.isclose(current_performance.get_g_mean(), expected_current_g_mean)

    # Whole-run metrics
    assert np.isclose(mean_performance.get_f1_score(), expected_mean_f1_score)
    assert np.isclose(mean_performance.get_precision(), expected_mean_precision)
    assert np.isclose(mean_performance.get_recall(), expected_mean_recall)
    assert np.isclose(mean_performance.get_g_mean(), expected_mean_g_mean)
def test_adaptive_random_forests_batch_predict_proba():
    """Regression test for AdaptiveRandomForest.predict_proba on batches of 5 samples.

    Pre-trains on 150 samples with ``classes=[0, 1]``, then runs 500 rounds of
    5 samples each, recording ``predict_proba`` before training on every 100th
    round (the very first excluded). Checks the shape of each batch, that every
    row sums to 1, and that the arg-max labels match stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 500
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample(5)
        # Test every n rounds (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            p = learner.predict_proba(X)
            assert p.shape == (5, 2)
            predictions.append(p)
        learner.partial_fit(X, y)

    all_predictions = np.concatenate(predictions)

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all([
        np.isclose(y_proba.sum(), 1) for y_proba in all_predictions
    ]), "Probabilities should sum to 1."
    assert all_predictions.shape == (4 * 5, 2)

    last_version_predictions = [
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 1, 1, 1, 0, 0, 1, 1
    ]
    assert isinstance(learner.predict_proba(X), np.ndarray)
    assert np.all(all_predictions.argmax(axis=1) == last_version_predictions)
def test_adaptive_random_forests():
    """Regression test for AdaptiveRandomForest with default leaf prediction.

    Pre-trains on 150 samples, then trains sample by sample on 5000 more,
    recording a prediction before training on every 100th sample (the very
    first excluded). On Python 3.6+ the predictions are compared with stored
    values.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
                                0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
                                0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
                                1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
                                0, 0, 1, 0, 1, 1, 0, 0, 0]

    # Performance below does not need to be guaranteed. This check is set up so
    # that any change to the predictions is caught by the unit test, which
    # helps prevent accidental changes.
    # If these tests fail, make sure that what is worked on *should* change
    # the predictions of ARF.
    if sys.version_info.major == 3 and sys.version_info.minor >= 6:
        # Temporarily disabled on older versions: pre-3.6 gives different
        # predictions than 3.6+.
        # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
        assert np.all(predictions == last_version_predictions)

    assert isinstance(learner.predict(X), np.ndarray)
def test_learn_pp():
    """Regression test for LearnPP wrapping a DecisionTreeClassifier.

    Pre-trains on one batch of 200 samples, then runs 10 test-then-train rounds
    of 200 samples each. The first prediction of each round, the number of
    correct predictions and the accuracy are compared with stored values.
    """
    stream = RandomTreeGenerator(tree_random_state=2212, sample_random_state=2212)
    stream.prepare_for_use()

    estimator = DecisionTreeClassifier(random_state=2212)
    classifier = LearnPP(base_estimator=estimator, n_estimators=5,
                         n_ensembles=5, random_state=2212)

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre training the classifier with 200 samples
    X, y = stream.next_sample(m)
    classifier.partial_fit(X, y, classes=stream.target_values)

    predictions = []
    for _ in range(10):
        X, y = stream.next_sample(200)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        # Every round counts towards the total, whether or not it was scored
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1138
    expected_acc = 0.569
    expected_predictions = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0]

    # np.all replaces np.alltrue, which is no longer available in NumPy 2.0
    assert np.all(predictions == expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    assert isinstance(classifier.predict(X), np.ndarray)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    """Exercise EvaluateHoldout with a HoeffdingTree on seeded random-tree streams.

    Part one uses a 4-class stream: it checks the model returned by
    ``evaluate``, compares the written summary file with the reference file
    under ``test_path``, checks accuracy/kappa/kappa_t values and the
    ``get_info()`` text. Part two repeats the evaluation on a 2-class stream
    and checks f1/precision/recall/gmean values.
    """

    def build_stream_and_tree(n_classes):
        # Same generator settings for both parts; only the class count differs.
        generator = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                        n_classes=n_classes, n_cat_features=2, n_num_features=5,
                                        n_categories_per_cat_feature=5, max_tree_depth=6,
                                        min_leaf_depth=3, fraction_leaves_per_level=0.15)
        generator.prepare_for_use()
        nominal_indices = list(range(15, len(generator.feature_names)))
        return generator, HoeffdingTree(nominal_attributes=nominal_indices)

    # ---- Part 1: 4 classes, accuracy / kappa / kappa_t, summary written to file ----
    stream, learner = build_stream_and_tree(n_classes=4)

    summary_path = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(n_wait=200,
                                max_samples=1000,
                                test_size=50,
                                metrics=['accuracy', 'kappa', 'kappa_t'],
                                output_file=summary_path)

    # Evaluate
    evaluated_models = evaluator.evaluate(stream=stream, model=learner)
    evaluated_tree = evaluated_models[0]

    assert isinstance(evaluated_tree, HoeffdingTree)
    assert learner.get_model_measurements == evaluated_tree.get_model_measurements

    reference_path = os.path.join(test_path, 'holdout_summary.csv')
    compare_files(summary_path, reference_path)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # (measurement object, getter name, expected value), checked in this order.
    first_run_checks = [
        (mean_performance, 'get_accuracy', 0.344000),
        (mean_performance, 'get_kappa', 0.135021),
        (mean_performance, 'get_kappa_t', 0.180000),
        (current_performance, 'get_accuracy', 0.360000),
        (current_performance, 'get_kappa', 0.152542),
        (current_performance, 'get_kappa_t', 0.200000),
    ]
    for measurements, getter_name, expected_value in first_run_checks:
        assert np.isclose(getattr(measurements, getter_name)(), expected_value)

    # NOTE(review): whitespace inside these literals is copied exactly as it appears in the
    # reviewed source; confirm it matches the layout produced by get_info().
    expected_info = (
        "EvaluateHoldout(batch_size=1, dynamic_test_set=False, max_samples=1000,\n"
        " max_time=inf, metrics=['accuracy', 'kappa', 'kappa_t'],\n"
        " n_wait=200,\n"
        " output_file='holdout_summary.csv',\n"
        " restart_stream=True, show_plot=False, test_size=50)"
    )
    assert evaluator.get_info() == expected_info

    # ---- Part 2: 2 classes, f1 / precision / recall / gmean, no output file ----
    stream, learner = build_stream_and_tree(n_classes=2)

    evaluator = EvaluateHoldout(n_wait=200,
                                max_samples=1000,
                                test_size=50,
                                metrics=['f1', 'precision', 'recall', 'gmean'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    second_run_checks = [
        (current_performance, 'get_f1_score', 0.6818181818181818),
        (current_performance, 'get_precision', 0.625),
        (current_performance, 'get_recall', 0.75),
        (current_performance, 'get_g_mean', 0.7245688373094719),
        (mean_performance, 'get_f1_score', 0.6431718061674009),
        (mean_performance, 'get_precision', 0.5748031496062992),
        (mean_performance, 'get_recall', 0.73),
        (mean_performance, 'get_g_mean', 0.6835202996254025),
    ]
    for measurements, getter_name, expected_value in second_run_checks:
        assert np.isclose(getattr(measurements, getter_name)(), expected_value)
def test_hoeffding_tree_nba(test_path):
    """Regression test for HoeffdingTree with its default settings on a seeded stream.

    The tree is trained one sample at a time for 5000 samples; every 100th
    sample (except the first) is predicted before being learned. Class
    predictions are compared with recorded values and probability
    predictions with the array stored in ``test_hoeffding_tree.npy`` under
    ``test_path``. The test then checks ``get_info()``, the model
    description and the prediction return types, and finally trains on
    20000 more samples with the 'hellinger' split criterion and checks the
    resulting rules description.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
                                       0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
                                       1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
                                       1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
                                       0, 2, 0, 0, 0, 0, 1, 3, 2])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # Both sides are array('i'), so == already yields a single bool; no need for
    # np.alltrue, which was removed in NumPy 2.0.
    assert predictions == expected_predictions
    assert np.allclose(proba_predictions, data)

    # NOTE(review): whitespace inside these literals is copied exactly as it appears in the
    # reviewed source; confirm it matches the layout produced by get_info().
    expected_info = (
        "HoeffdingTree(binary_split=False, grace_period=200, leaf_prediction='nba',\n"
        " max_byte_size=33554432, memory_estimate_period=1000000,\n"
        " nb_threshold=0, no_preprune=False,\n"
        " nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14],\n"
        " remove_poor_atts=False, split_confidence=1e-07,\n"
        " split_criterion='info_gain', stop_mem_management=False,\n"
        " tie_threshold=0.05)"
    )
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    assert learner.get_model_description() == expected_model_1

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Continue training with a different split criterion and check the induced rules.
    X, y = stream.next_sample(20000)
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = (
        'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n'
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    )
    assert expected_rules == learner.get_rules_description()
def test_kdd_tree_euclidean():
    """Regression test for KDTree queries with the euclidean metric.

    A tree is built from 1000 samples of a seeded RandomTreeGenerator stream
    and queried with the next 10 samples for 4 results each. The returned
    indices and distances, as well as ``get_info()`` and
    ``get_class_type()``, are compared with recorded values.
    """
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1)
    stream.prepare_for_use()

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree
    kdtree = KDTree(X, metric='euclidean', return_distance=True)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[855, 466, 348, 996],
                    [829, 654, 92, 333],
                    [227, 364, 183, 325],
                    [439, 482, 817, 501],
                    [886, 173, 279, 470],
                    [98, 30, 34, 580],
                    [959, 773, 374, 819],
                    [819, 685, 59, 992],
                    [624, 665, 209, 239],
                    [524, 807, 506, 191]]

    expected_dist = [[1.6366216258724973, 1.631437068636607, 1.5408182139320563, 1.4836054196064452],
                     [1.7839579422032452, 1.7694587302438618, 1.5339920309706585, 1.5228981881653287],
                     [1.6512443805072872, 1.637456923425164, 1.61736766513639, 1.5776532815820448],
                     [1.5843121606184263, 1.571918014408251, 1.5038147281265382, 0.7058569455034059],
                     [2.052148026638031, 2.0157953468214007, 1.8012794130725434, 1.6572756455115591],
                     [1.5844032729792423, 1.5688736638121885, 1.55893121879858, 1.4609657517960262],
                     [1.6819916227667229, 1.6186557774269037, 1.5815309744477162, 1.5720184136312232],
                     [1.7302164693989817, 1.5964713159009083, 1.4897849225874815, 1.1629448414734906],
                     [1.6511813695220574, 1.6454651930288255, 1.5926685577827064, 1.4973008307362947],
                     [1.5982346741983797, 1.5875900895982191, 1.4702209684850878, 1.4676217546305874]]

    # np.array_equal replaces np.alltrue (removed in NumPy 2.0) and also fails cleanly,
    # rather than erroring or broadcasting, if the result shape ever differs.
    assert np.array_equal(idx, expected_idx)
    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree: - leaf_size: 40 - metric: euclidean - return_distance: True'
    assert kdtree.get_info() == expected_info

    assert kdtree.get_class_type() == 'data_structure'