def test_half_space_trees(test_path):
    stream = SEAGenerator(classification_function=0,
                          noise_percentage=0.1,
                          random_state=1)

    learner = HalfSpaceTrees(n_estimators=13,
                             size_limit=75,
                             anomaly_threshold=0.90,
                             depth=10,
                             random_state=5)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    y_proba = []
    wait_samples = 500

    while cnt < max_samples:
        X, y = stream.next_sample()
        # SEA features lie in [0, 10], so dividing by 10 scales inputs to [0, 1]
        X = X / 10
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X)
        cnt += 1

    expected_predictions = array('i', [1, 0, 0, 0, 1, 0, 0, 1, 0])
    assert np.alltrue(y_pred == expected_predictions)
    test_file = os.path.join(test_path, 'test_half_space_trees.npy')
    expected_proba = np.load(test_file)
    assert np.allclose(y_proba, expected_proba)
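
Most of the examples on this page repeat the same interleaved test-then-train
(prequential) loop. A minimal, self-contained sketch of that pattern, assuming
only scikit-multiflow and numpy are installed (the stream, model, and sample
counts here are illustrative, not taken from any one test above):

import numpy as np
from skmultiflow.data import SEAGenerator
from skmultiflow.bayes import NaiveBayes

stream = SEAGenerator(random_state=1)
learner = NaiveBayes()

n_seen, correct = 0, 0
while n_seen < 1000:
    X, y = stream.next_sample()
    if n_seen > 0:  # test first (skip the still-untrained model) ...
        if learner.predict(X)[0] == y[0]:
            correct += 1
    learner.partial_fit(X, y, classes=stream.target_values)  # ... then train
    n_seen += 1

print('Prequential accuracy:', correct / (n_seen - 1))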

Example #2

def test_evaluate_stream_gen_speed():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()
    stream_name = stream.name

    evaluator = EvaluateStreamGenerationSpeed(n_samples=100000,
                                              max_time=float("inf"),
                                              output_file=None,
                                              batch_size=5)
    stream = evaluator.evaluate(stream)
    assert stream_name == stream.name

    expected_info = 'EvaluateStreamGenerationSpeed: ' \
                    'n_samples: 100000 - max_time: inf - output_file: None - batch_size: 5'
    assert evaluator.get_info() == expected_info

    evaluator.set_params({
        'n_samples': 500000,
        'max_time': 0.05,
        'output_file': None,
        'batch_size': 1
    })
    expected_info = 'EvaluateStreamGenerationSpeed: ' \
                    'n_samples: 500000 - max_time: 0.05 - output_file: None - batch_size: 1'
    assert evaluator.get_info() == expected_info

    # Stop evaluation by reaching max_time
    stream = evaluator.evaluate(stream)
    assert stream_name == stream.name

    assert evaluator.get_class_type() == 'evaluator'

Example #3

    # Note: run() is a method of a threading.Thread subclass; only the method
    # body is shown here, which is why it is indented and uses self.stop_event.
    def run(self):
        producer = KafkaProducer(bootstrap_servers='localhost:9092')
        stream = SEAGenerator()

        # Send Signal to model server to extract persist model from pickle in local file
        producer.send(
            topic='testTopicAdmin',
            value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
            key=b'extract')
        time.sleep(10)

        while not self.stop_event.is_set():
            dummy_data, dummy_label = stream.next_sample()
            print("Dummy Event Generated:", str(dummy_data))
            dummy_data_and_label = np.concatenate((dummy_data, dummy_label),
                                                  axis=None)

            producer.send(topic='testTopic',
                          value=dummy_data_and_label.tobytes(),
                          key=b'labeled')

            time.sleep(1)

        # Send signal to model server to persist model
        producer.send(
            topic='testTopicAdmin',
            value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
            key=b'flush')

        producer.close()
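
        # Hypothetical usage sketch (names assumed here, not shown in this
        # example): run() would belong to a threading.Thread subclass that sets
        # self.stop_event = threading.Event() in __init__, e.g.
        #     t = ProducerThread(); t.start(); ...; t.stop_event.set(); t.join()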

Example #4

def test_hoeffding_adaptive_tree_mc(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1,
                                                    noise_percentage=0.05),
                                drift_stream=SEAGenerator(
                                    random_state=2,
                                    classification_function=2,
                                    noise_percentage=0.05),
                                random_state=1,
                                position=250,
                                width=10)

    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='mc')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, " \
                    "leaf_prediction='mc', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, remove_poor_atts=False, split_confidence=1e-07, " \
                    "split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 398.0, 1.0: 1000.0}\n'

    assert (learner.get_model_description() == expected_model_1)

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    stream.restart()
    X, y = stream.next_sample(5000)

    learner = HoeffdingAdaptiveTreeClassifier(max_byte_size=30,
                                              leaf_prediction='mc',
                                              grace_period=10)
    learner.partial_fit(X, y)

Example #5

def test_oracle_better():
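    # Oracle is, by construction, an upper-bound selector: it picks a correct
    # ensemble member whenever one exists, so it should outscore the Rank
    # heuristic evaluated below.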
    gen = SEAGenerator(random_state=42)
    gen.prepare_for_use()
    evaluator_dyn2sel = dyn2selHoldout(n_wait=100,
                                       max_samples=1000,
                                       test_size=100)
    dynse_rank = DYNSEMethod(NaiveBayes(), 100, Rank())
    f = StringIO()
    with redirect_stdout(f):
        evaluator_dyn2sel.evaluate(gen, dynse_rank)
    out = f.getvalue()
    f.close()
    acc_rank = out[out.find("Accuracy"):]
    acc_rank = acc_rank[acc_rank.find(":") + 2:]
    acc_rank = acc_rank[:acc_rank.find("\n")]
    acc_rank = float(acc_rank)

    evaluator_dyn2sel = dyn2selHoldout(n_wait=100,
                                       max_samples=1000,
                                       test_size=100)
    dynse_oracle = DYNSEMethod(NaiveBayes(), 100, Oracle())
    f = StringIO()
    with redirect_stdout(f):
        evaluator_dyn2sel.evaluate(gen, dynse_oracle)
    out = f.getvalue()
    f.close()
    acc_oracle = out[out.find("Accuracy"):]
    acc_oracle = acc_oracle[acc_oracle.find(":") + 2:]
    acc_oracle = acc_oracle[:acc_oracle.find("\n")]
    acc_oracle = float(acc_oracle)

    assert acc_oracle > acc_rank

Example #6

def test_hoeffding_tree_model_information():
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.get_model_measurements
    for k in expected_info:
        assert k in observed_info
        assert expected_info[k] == observed_info[k]

    expected_description = "if Attribute 0 <= 4.549969620513424:\n" \
                            "  if Attribute 1 <= 5.440182925299016:\n" \
                            "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n" \
                            "  if Attribute 1 > 5.440182925299016:\n" \
                            "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n" \
                            "if Attribute 0 > 4.549969620513424:\n" \
                            "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n" \

    assert expected_description == learner.get_model_description()

Example #7

def test_evaluate_delayed_coverage(tmpdir):
    from skmultiflow.data import SEAGenerator
    from skmultiflow.bayes import NaiveBayes

    max_samples = 1000

    # Stream
    data = SEAGenerator(random_state=1)
    # Get X and y
    X, y = data.next_sample(max_samples)
    time = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream
    stream = TemporalDataStream(X, y, time, ordered=False)

    # Learner
    nb = NaiveBayes()

    output_file = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    metrics = ['running_time', 'model_size']
    evaluator = EvaluatePrequentialDelayed(max_samples=max_samples,
                                           metrics=metrics,
                                           data_points_for_classification=True,
                                           output_file=output_file)

    evaluator.evaluate(stream=stream, model=nb, model_names=['NB'])

Example #8

def test_clone():
    stream = SEAGenerator(random_state=1)

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    cloned = clone(learner)

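    # sklearn's clone() copies hyperparameters but not fitted state, hence the
    # original has a populated class distribution while the clone's is empty: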
    assert learner._observed_class_distribution != {} and cloned._observed_class_distribution == {}

Example #9

def test_knn():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    expected_info = 'KNN(leaf_size=40, max_window_size=2000, n_neighbors=8, nominal_attributes=None)'
    assert learner.get_info() == expected_info

    learner.reset()
    assert learner.get_info() == expected_info

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    learner.fit(X_batch[:4500], y_batch[:4500], classes=[0, 1])
    predictions = learner.predict(X_batch[4501:4550])

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        0
    ])
    assert np.alltrue(predictions == expected_predictions)

    correct_predictions = sum(predictions == y_batch[4501:4550])
    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #10

def test_knn_adwin():
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1),
                                drift_stream=SEAGenerator(
                                    random_state=2, classification_function=2),
                                random_state=1,
                                position=250,
                                width=10)

    learner = KNNADWINClassifier(n_neighbors=8,
                                 leaf_size=40,
                                 max_window_size=200)

    cnt = 0
    max_samples = 1000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.data_window.size == 0

    expected_info = "KNNADWINClassifier(leaf_size=40, max_window_size=200, " \
                    "metric='euclidean', n_neighbors=8)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #11

def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)

    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = "PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, " \
                    "eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5, " \
                    "n_jobs=None, penalty=None, random_state=1, shuffle=True, tol=0.001, " \
                    "validation_fraction=0.1, verbose=0, warm_start=False)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    # Coverage tests
    learner.reset()
    if not sklearn_version.startswith("0.21"):
        learner.fit(X=np.asarray(X_batch[:4500]),
                    y=np.asarray(y_batch[:4500], dtype=int))
    else:
        # Root cause of failure (TypeError: an integer is required) is in the fit() method
        # in sklearn 0.21.0. This is a workaround until a fix is made available in sklearn
        learner.partial_fit(X=np.asarray(X_batch[:4500]),
                            y=np.asarray(y_batch[:4500]),
                            classes=stream.target_values)
    learner.predict(X=X_batch[4501:])  # Run for coverage

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #12

def test_standardize():
    stream = SEAGenerator(random_state=1)

    learner = WeightedKNNClassifier(n_neighbors=8,
                                    max_window_size=2000,
                                    leaf_size=40,
                                    standardize=True)

    # Test the moving-average and moving-SD statistics used for standardization
    example_features = np.array(
        [np.array([[1, 2]]),
         np.array([[2, 4]]),
         np.array([[3, 9]])])

    example_targets = np.array([[1], [1], [1]])
    for i in range(len(example_features)):
        learner.partial_fit(example_features[i], example_targets[i])

    moving_average = learner.get_mean
    assert type(moving_average) is np.ndarray
    assert np.alltrue(moving_average == np.array([[2, 5]]))
    moving_sd = learner.get_sd
    assert type(moving_sd) is np.ndarray
    assert np.alltrue(moving_sd.astype(int) == np.array([[0, 2]]))
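    # Sanity check of the numbers above: mean([1, 2, 3]) = 2 and
    # mean([2, 4, 9]) = 5; the population SDs are ~0.82 and ~2.94, which
    # truncate to [0, 2] after the astype(int) cast.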
    stream = SEAGenerator(random_state=1)
    learner = WeightedKNNClassifier(n_neighbors=8,
                                    max_window_size=2000,
                                    leaf_size=40,
                                    standardize=True)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)

Example #13

def test_hoeffding_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = HoeffdingTree(max_byte_size=30,
                            memory_estimate_period=100,
                            grace_period=10,
                            leaf_prediction='mc')

    learner.partial_fit(X, y, classes=stream.target_values)

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=1,
                                 sample_random_state=1,
                                 n_num_features=0,
                                 n_categories_per_cat_feature=2)
    stream.prepare_for_use()
    X, y = stream.next_sample(1000)
    learner = HoeffdingTree(leaf_prediction='mc',
                            nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)

Example #14

def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = ExtremelyFastDecisionTreeClassifier(max_byte_size=30,
                                                  memory_estimate_period=100,
                                                  grace_period=10,
                                                  leaf_prediction='nba')

    learner.partial_fit(X, y, classes=stream.target_values)

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1,
                                 max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=stream.target_values)

Example #15

def test_knn_adwin():
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1),
                                drift_stream=SEAGenerator(
                                    random_state=2, classification_function=2),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()
    learner = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=200)

    cnt = 0
    max_samples = 1000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    expected_info = 'KNNAdwin(leaf_size=40, max_window_size=200, n_neighbors=8,\n' \
                    '         nominal_attributes=None)'
    assert learner.get_info() == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #16

def test_ensemble_size():
    # Since a new ensemble member is created whenever the number of seen
    # instances reaches the chunk size, the ensemble size should be
    # n_samples // chunk_size.
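    # With chunk_size = 100 and n_samples = 1050 below, that is 1050 // 100 = 10 members.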
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator(balance_classes=True)
    # gen.prepare_for_use()
    mde = MDEMethod(NaiveBayes(), chunk_size, KNORAE(), alpha=0.0)
    X, y = gen.next_sample(n_samples)
    mde.partial_fit(X, y)
    assert len(mde.ensemble) == n_samples // chunk_size

Example #17

def test_ensemble_size():
    # Since a new ensemble member is created whenever the number of seen
    # instances reaches the chunk size, the ensemble size should be
    # n_samples // chunk_size.
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator()
    # gen.prepare_for_use()
    dynse = DYNSEMethod(NaiveBayes(), chunk_size, ModifiedRank())
    X, y = gen.next_sample(n_samples)
    dynse.partial_fit(X, y)
    assert len(dynse.ensemble) == n_samples // chunk_size

Example #18

def test_accuracy():
    # An ensemble built by DESDDMethod on AdaptiveRandomForest should reach at
    # least 80% accuracy on 200 SEAGenerator test instances.
    n_samples_train = 200
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    arf = AdaptiveRandomForest()
    desdd = DESDDMethod(arf)
    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    desdd.partial_fit(X_train, y_train)
    assert desdd.score(X_test, y_test) > 0.80

Example #19

def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'PerceptronMask: - penalty: None - alpha: 0.0001 - fit_intercept: True - max_iter: 1000 ' \
                    '- tol: 0.001 - shuffle: True - eta0: 1.0 - warm_start: False - class_weight: None - n_jobs: 1'

    assert learner.get_info() == expected_info

    # Coverage tests
    learner.reset()
    learner.fit(X=X_batch[:4500], y=y_batch[:4500])
    y_pred = learner.predict(X=X_batch[4501:])
    accuracy = accuracy_score(y_true=y_batch[4501:], y_pred=y_pred)
    expected_accuracy = 0.8897795591182365
    # assert np.isclose(expected_accuracy, accuracy)  # Removed due to non-replicable error in Travis build

    assert 'estimator' == learner.get_class_type()

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #20

def test_equality_multiflow():
    gen = SEAGenerator(random_state=42)
    gen.prepare_for_use()
    evaluator_mtflow = mtflowPrequential(
        max_samples=1000, pretrain_size=0, restart_stream=True
    )
    evaluator_dyn2sel = dyn2selPrequential(max_samples=1000, pretrain_size=0)
    nb_mtflow = evaluator_mtflow.evaluate(gen, NaiveBayes())[0].__dict__
    nb_dyn2sel = evaluator_dyn2sel.evaluate(gen, NaiveBayes())[0].__dict__
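    # The attribute observers hold nested objects without value-based equality,
    # so (presumably for that reason) they are dropped before comparing the dicts: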
    del nb_mtflow["_attribute_observers"]
    del nb_dyn2sel["_attribute_observers"]
    assert nb_mtflow == nb_dyn2sel

Example #21

def test_naive_bayes(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
        1
    ])

    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #22

def test_accuracy():
    # An ensemble of Naive Bayes members built by MDEMethod should reach at
    # least 85% accuracy on 200 SEAGenerator test instances.
    chunk_size = 100
    n_samples_train = 1050
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    nb = NaiveBayes()
    mde = MDEMethod(nb, chunk_size, KNORAU())
    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    mde.partial_fit(X_train, y_train)
    assert mde.score(X_test, y_test) > 0.85

Example #23

def test_online_csb2():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineCSB2Classifier(base_estimator=nb,
                                   n_estimators=3,
                                   cost_positive=1,
                                   cost_negative=0.9,
                                   random_state=112)
    first = True
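    # The first partial_fit call must receive the complete list of classes;
    # subsequent calls can omit it (this is why the loop tracks `first`).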

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]

    expected_correct_predictions = 43
    expected_performance = 0.8775510204081632

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineCSB2Classifier(base_estimator=NaiveBayes(nominal_attributes=None), cost_negative=0.9, " \
                    "cost_positive=1, drift_detection=True, n_estimators=3, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

Example #24

def test_online_rus_1():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineRUSBoost(base_estimator=nb,
                             n_estimators=3,
                             sampling_rate=5,
                             algorithm=1,
                             random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        1
    ]

    expected_correct_predictions = 33
    expected_performance = 0.673469387755102

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineRUSBoost(algorithm=1, base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "               drift_detection=True, n_estimators=3, random_state=112,\n" \
                    "               sampling_rate=5)"
    assert learner.get_info() == expected_info

Example #25

def test_leverage_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = LeverageBaggingClassifier(base_estimator=knn,
                                        n_estimators=3,
                                        random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8571428571428571
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 42
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "LeverageBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, max_window_size=2000, " \
                    "n_neighbors=8, nominal_attributes=None), delta=0.002, enable_code_matrix=False, " \
                    "leverage_algorithm='leveraging_bag', n_estimators=3, random_state=112, w=6)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

Example #26

def demo():
    """ _test_leverage_bagging

    This demo tests the LeverageBaggingClassifier on a file stream, which gives
    instances coming from a SEA generator. 

    The test computes the performance of the LeverageBaggingClassifier as well
    as the time to create the structure and classify max_samples (2000 by default) 
    instances.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)

    clf = LeverageBaggingClassifier(base_estimator=KNNClassifier(
        n_neighbors=8, max_window_size=2000, leaf_size=30),
                                    n_estimators=1,
                                    random_state=1)
    sample_count = 0
    correctly_classified = 0
    max_samples = 2000
    train_size = 200
    first = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        first = False

    logging.info('%s%%', 0.0)
    while sample_count < max_samples:
        if (sample_count + 1) % (max_samples / 20) == 0:
            logging.info('%s%%',
                         str(((sample_count // (max_samples / 20) + 1) * 5)))
        X, y = stream.next_sample(2)  # two samples are drawn, but only the first is scored below
        my_pred = clf.predict(X)
        if first:
            clf.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None:
            if y[0] == my_pred[0]:
                correctly_classified += 1

        sample_count += 1

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
    print(clf.get_info())
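
# To run this demo standalone (assuming the module-level imports of logging,
# warnings, SEAGenerator, KNNClassifier and LeverageBaggingClassifier used above):
if __name__ == '__main__':
    demo()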

Example #27

def test_pretrain_size():
    gen = SEAGenerator(random_state=42)
    gen.prepare_for_use()
    evaluator_dyn2sel = dyn2selPrequential(
        n_wait=100, max_samples=1000, pretrain_size=150
    )
    dynse_rank = DYNSEMethod(NaiveBayes(), 100, Rank())
    evaluator_dyn2sel.evaluate(gen, dynse_rank)

    evaluator_dyn2sel = dyn2selPrequential(
        n_wait=100, max_samples=1000, pretrain_size=150
    )
    dynse_oracle = DYNSEMethod(NaiveBayes(), 100, Oracle())
    evaluator_dyn2sel.evaluate(gen, dynse_oracle)

Example #28

def test_hat_nb(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1,
                                                    noise_percentage=0.05),
                                drift_stream=SEAGenerator(
                                    random_state=2,
                                    classification_function=2,
                                    noise_percentage=0.05),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nb')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HAT(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    "    max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n" \
                    "    no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n" \
                    "    split_confidence=1e-07, split_criterion='info_gain',\n" \
                    "    stop_mem_management=False, tie_threshold=0.05)"

    assert learner.get_info() == expected_info
    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

Example #29

def test_oza_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingClassifier(base_estimator=knn,
                                   n_estimators=3,
                                   random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OzaBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, " \
                    "max_window_size=2000, metric='euclidean', n_neighbors=8), " \
                    "n_estimators=3, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

Example #30

def test_additive_expert_ensemble_weakest():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()

    learner = AdditiveExpertEnsemble(3,
                                     NaiveBayes(),
                                     beta=0.5,
                                     gamma=0.1,
                                     pruning='weakest')
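    # With pruning='weakest', the lowest-weight expert is discarded whenever
    # the ensemble would grow beyond n_estimators (3 here).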

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0
    first = True

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)

    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    expected_correct_predictions = 45
    expected_performance = 0.9183673469387755

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = "AdditiveExpertEnsemble(base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "                       beta=0.5, gamma=0.1, n_estimators=3, pruning='weakest')"
    assert learner.get_info() == expected_info