Example #1
def test_hoeffding_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = HoeffdingTree(max_byte_size=30,
                            memory_estimate_period=100,
                            grace_period=10,
                            leaf_prediction='mc')

    learner.partial_fit(X, y, classes=stream.target_values)

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=1,
                                 sample_random_state=1,
                                 n_num_features=0,
                                 n_categories_per_cat_feature=2)
    stream.prepare_for_use()
    X, y = stream.next_sample(1000)
    learner = HoeffdingTree(leaf_prediction='mc',
                            nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)
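
# The snippets in these listings omit their imports. A minimal preamble for this
# example, assuming the pre-0.5 scikit-multiflow layout implied by the
# prepare_for_use() calls (version 0.5.0 renamed the class HoeffdingTreeClassifier):
from skmultiflow.data import SEAGenerator, RandomTreeGenerator
from skmultiflow.trees import HoeffdingTree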
Example #2
def demo():
    """ _test_streams
    
    This demo tests if the streams are correctly generating samples.
    
    :return: 
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")

    # Auxiliary drift generator (instantiated here but not sampled below)
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50,
                                        model_random_state=32523423, sample_random_state=5435,
                                        n_classes=2, n_features=10, num_drift_centroids=50)

    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
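
# A hedged import preamble for the demo above, assuming the post-0.5
# scikit-multiflow layout (FileStream is used here without prepare_for_use()):
from skmultiflow.data import FileStream, RandomRBFGeneratorDrift, SEAGenerator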
Example #3
def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = ExtremelyFastDecisionTreeClassifier(max_byte_size=30,
                                                  memory_estimate_period=100,
                                                  grace_period=10,
                                                  leaf_prediction='nba')

    learner.partial_fit(X, y, classes=stream.target_values)

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1,
                                 max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=stream.target_values)
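
# Imports assumed by the test above; ExtremelyFastDecisionTreeClassifier ships in
# skmultiflow.trees as of scikit-multiflow 0.5.0:
from skmultiflow.data import SEAGenerator, RandomTreeGenerator
from skmultiflow.trees import ExtremelyFastDecisionTreeClassifier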
Example #4
def test_accuracy():
    # an ensemble of Adaptive Random Forests should reach at least 80% accuracy on 200 test instances from SEAGenerator
    n_samples_train = 200
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    arf = AdaptiveRandomForest()
    desdd = DESDDMethod(arf)
    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    desdd.partial_fit(X_train, y_train)
    assert desdd.score(X_test, y_test) > 0.80
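
# DESDDMethod is project-local (a dynamic-ensemble-selection wrapper, not part of
# scikit-multiflow); the skmultiflow imports this test relies on would be:
from skmultiflow.data import SEAGenerator
from skmultiflow.meta import AdaptiveRandomForest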
Example #5
def demo():
    """ _test_leverage_bagging

    This demo tests the LeverageBagging classifier on a file stream, which gives 
    instances coming from a SEA generator. 

    The test computes the performance of the LeverageBagging classifier as well 
    as the time to create the structure and classify max_samples (2000 by default) 
    instances.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    stream.prepare_for_use()
    clf = LeverageBagging(base_estimator=KNN(n_neighbors=8,
                                             max_window_size=2000,
                                             leaf_size=30),
                          n_estimators=1,
                          random_state=1)
    sample_count = 0
    correctly_classified = 0
    max_samples = 2000
    train_size = 200
    first = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        first = False

    logging.info('%s%%', 0)
    while sample_count < max_samples:
        if (sample_count + 1) % (max_samples // 20) == 0:
            logging.info('%s%%',
                         str((sample_count // (max_samples // 20) + 1) * 5))
        X, y = stream.next_sample(2)
        my_pred = clf.predict(X)
        if first:
            clf.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None:
            if y[0] == my_pred[0]:
                correctly_classified += 1

        sample_count += 1

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
    print(clf.get_info())
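
# Imports for the demo above, assuming the pre-0.5 class names (later renamed
# KNNClassifier and LeverageBaggingClassifier):
import logging
import warnings
from skmultiflow.data import SEAGenerator
from skmultiflow.lazy import KNN
from skmultiflow.meta import LeverageBagging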
Example #6
def test_accuracy():
    # an ensemble of Naive Bayes should reach at least 85% accuracy on 200 test instances from SEAGenerator
    chunk_size = 100
    n_samples_train = 1050
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    nb = NaiveBayes()
    mde = MDEMethod(nb, chunk_size, KNORAU())
    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    mde.partial_fit(X_train, y_train)
    assert mde.score(X_test, y_test) > 0.85
Example #7
def demo():
    """ _test_oza_bagging_adwin

    This demo tests the OzaBaggingADWINClassifier using KNNADWINClassifier as base estimator
    on samples given by a SEAGenerator. 

    The test computes the performance of the OzaBaggingADWINClassifier as well
    as the time to create the structure and classify max_samples (20000 by 
    default) instances.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)

    clf = OzaBaggingADWINClassifier(base_estimator=KNNADWINClassifier(
        n_neighbors=8, max_window_size=2000, leaf_size=30),
                                    n_estimators=2,
                                    random_state=1)
    sample_count = 0
    correctly_classified = 0
    max_samples = 20000
    train_size = 10
    first = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        first = False

    while sample_count < max_samples:
        if sample_count % (max_samples // 20) == 0:
            logging.info('%s%%', str(sample_count // (max_samples // 20) * 5))
        X, y = stream.next_sample()
        my_pred = clf.predict(X)

        if first:
            clf.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None:
            if y[0] == my_pred[0]:
                correctly_classified += 1

        sample_count += 1

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
Example #8
def test_clone():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    cloned = clone(learner)

    assert learner._observed_class_distribution != {} and cloned._observed_class_distribution == {}
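
# Imports assumed by test_clone; clone comes from scikit-learn:
from array import array
from sklearn.base import clone
from skmultiflow.bayes import NaiveBayes
from skmultiflow.data import SEAGenerator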
Example #9
    def run(self):
        producer = KafkaProducer(bootstrap_servers='localhost:9092')
        stream = SEAGenerator()

        # Signal the model server to load the persisted model from a pickle file on disk
        producer.send(
            topic='testTopicAdmin',
            value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
            key=b'extract')
        time.sleep(10)

        while not self.stop_event.is_set():
            dummy_data, dummy_label = stream.next_sample()
            print("Dummy Event Generated:", str(dummy_data))
            dummy_data_and_label = np.concatenate((dummy_data, dummy_label),
                                                  axis=None)

            producer.send(topic='testTopic',
                          value=dummy_data_and_label.tobytes(),
                          key=b'labeled')

            time.sleep(1)

        # Signal the model server to persist the model
        producer.send(
            topic='testTopicAdmin',
            value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
            key=b'flush')

        producer.close()
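
# run() above belongs to a producer thread. A minimal enclosing class, assuming
# kafka-python and a threading.Event stop flag (the wrapper name is hypothetical):
import threading
import time
import numpy as np
from kafka import KafkaProducer
from skmultiflow.data import SEAGenerator

class StreamProducerThread(threading.Thread):
    def __init__(self):
        super().__init__()
        self.stop_event = threading.Event()  # set() this to end the loop in run()

    def stop(self):
        self.stop_event.set()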
Example #10
def create_sea_drift_dataset(n_samples_per_concept=200, concepts=(0, 1, 2, 3)):
    X_stream = []
    Y_stream = []
    concept_drifts = []

    t = 0
    gen = SEAGenerator()
    gen.prepare_for_use()
    for _ in concepts:
        if t != 0:
            concept_drifts.append(t)

        X, y = gen.next_sample(batch_size=n_samples_per_concept)
        X_stream.append(X)
        Y_stream.append(y)

        gen.generate_drift()

        t += n_samples_per_concept

    return {
        "data": (np.concatenate(X_stream, axis=0),
                 np.concatenate(Y_stream, axis=0).reshape(-1, 1)),
        "drifts": np.array(concept_drifts)
    }
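
# Example usage under the defaults: SEA emits 3 numeric features, so 4 concepts
# of 200 samples each yield the shapes below (drift positions are cumulative):
dataset = create_sea_drift_dataset()
X, y = dataset["data"]
print(X.shape, y.shape)     # (800, 3) (800, 1)
print(dataset["drifts"])    # [200 400 600]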
Example #11
def test_evaluate_delayed_coverage(tmpdir):
    from skmultiflow.data import SEAGenerator
    from skmultiflow.bayes import NaiveBayes

    max_samples = 1000

    # Stream
    data = SEAGenerator(random_state=1)
    # Get X and y
    X, y = data.next_sample(max_samples)
    time = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream
    stream = TemporalDataStream(X, y, time, ordered=False)

    # Learner
    nb = NaiveBayes()

    output_file = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    metrics = ['running_time', 'model_size']
    evaluator = EvaluatePrequentialDelayed(max_samples=max_samples,
                                           metrics=metrics,
                                           data_points_for_classification=True,
                                           output_file=output_file)

    evaluator.evaluate(stream=stream, model=nb, model_names=['NB'])
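
# Remaining imports for the test above; generate_random_dates is a helper from
# the surrounding test suite (its definition is not shown here):
import os
from skmultiflow.data import TemporalDataStream
from skmultiflow.evaluation import EvaluatePrequentialDelayed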
Example #12
def test_half_space_trees(test_path):
    stream = SEAGenerator(classification_function=0,
                          noise_percentage=0.1,
                          random_state=1)

    learner = HalfSpaceTrees(n_estimators=13,
                             size_limit=75,
                             anomaly_threshold=0.90,
                             depth=10,
                             random_state=5)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    y_proba = []
    wait_samples = 500

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Scale inputs between 0 and 1
        X = X / 10
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 0, 0, 1, 0, 0, 1, 0])
    assert np.alltrue(y_pred == expected_predictions)
    test_file = os.path.join(test_path, 'test_half_space_trees.npy')
    expected_proba = np.load(test_file)
    assert np.allclose(y_proba, expected_proba)
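
# Imports assumed by the half-space trees test; the expected-value file lives in
# the test suite's data directory (passed in via test_path):
import os
import numpy as np
from array import array
from skmultiflow.anomaly_detection import HalfSpaceTrees
from skmultiflow.data import SEAGenerator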
Example #13
def test_hoeffding_tree_model_information():
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.get_model_measurements
    for k in expected_info:
        assert k in observed_info
        assert expected_info[k] == observed_info[k]

    expected_description = "if Attribute 0 <= 4.549969620513424:\n" \
                            "  if Attribute 1 <= 5.440182925299016:\n" \
                            "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n" \
                            "  if Attribute 1 > 5.440182925299016:\n" \
                            "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n" \
                            "if Attribute 0 > 4.549969620513424:\n" \
                            "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n" \

    assert expected_description == learner.get_model_description()
Example #14
def test_knn():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    expected_info = 'KNN(leaf_size=40, max_window_size=2000, n_neighbors=8, nominal_attributes=None)'
    assert learner.get_info() == expected_info

    learner.reset()
    assert learner.get_info() == expected_info

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    learner.fit(X_batch[:4500], y_batch[:4500], classes=[0, 1])
    predictions = learner.predict(X_batch[4501:4550])

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        0
    ])
    assert np.alltrue(predictions == expected_predictions)

    correct_predictions = sum(predictions == y_batch[4501:4550])
    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
Example #15
def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)

    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = "PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, " \
                    "eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5, " \
                    "n_jobs=None, penalty=None, random_state=1, shuffle=True, tol=0.001, " \
                    "validation_fraction=0.1, verbose=0, warm_start=False)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    # Coverage tests
    learner.reset()
    if not sklearn_version.startswith("0.21"):
        learner.fit(X=np.asarray(X_batch[:4500]),
                    y=np.asarray(y_batch[:4500], dtype=int))
    else:
        # Root cause of failure (TypeError: an integer is required) is in the fit() method
        # in sklearn 0.21.0. This is a workaround until a fix is made available in sklearn
        learner.partial_fit(X=np.asarray(X_batch[:4500]),
                            y=np.asarray(y_batch[:4500]),
                            classes=stream.target_values)
    learner.predict(X=X_batch[4501:])  # Run for coverage

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
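
# Imports assumed by the test above; sklearn_version gates the workaround for the
# sklearn 0.21.0 fit() bug mentioned in the comments:
import os
import numpy as np
from array import array
from sklearn import __version__ as sklearn_version
from skmultiflow.data import SEAGenerator
from skmultiflow.neural_networks import PerceptronMask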
Example #16
def test_standardize():
    stream = SEAGenerator(random_state=1)

    learner = WeightedKNNClassifier(n_neighbors=8,
                                    max_window_size=2000,
                                    leaf_size=40,
                                    standardize=True)

    # Test the implementation of the moving average
    example_features = np.array(
        [np.array([[1, 2]]),
         np.array([[2, 4]]),
         np.array([[3, 9]])])

    example_targets = np.array([[1], [1], [1]])
    for i in range(len(example_features)):
        learner.partial_fit(example_features[i], example_targets[i])

    # Mean of [1, 2, 3] is 2 and mean of [2, 4, 9] is 5
    moving_average = learner.get_mean
    assert type(moving_average) is np.ndarray
    assert np.alltrue(moving_average == np.array([[2, 5]]))
    # Population std of [1, 2, 3] is ~0.82 and of [2, 4, 9] is ~2.94,
    # so the integer parts are 0 and 2
    moving_sd = learner.get_sd
    assert type(moving_sd) is np.ndarray
    assert np.alltrue(moving_sd.astype(int) == np.array([[0, 2]]))
    stream = SEAGenerator(random_state=1)
    learner = WeightedKNNClassifier(n_neighbors=8,
                                    max_window_size=2000,
                                    leaf_size=40,
                                    standardize=True)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)
Example #17
def test_ensemble_size():
    # each ensemble member is initialized once the instance count reaches the chunk size, so the ensemble
    # size should be n_samples // chunk_size (here 1050 // 100 = 10; the last 50 samples never fill a chunk)
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator()
    # gen.prepare_for_use()
    dynse = DYNSEMethod(NaiveBayes(), chunk_size, ModifiedRank())
    X, y = gen.next_sample(n_samples)
    dynse.partial_fit(X, y)
    assert len(dynse.ensemble) == n_samples // chunk_size
Example #18
def test_ensemble_size():
    # each ensemble member is initialized once the instance count reaches the chunk size, so the ensemble
    # size should be n_samples // chunk_size (here 1050 // 100 = 10; the last 50 samples never fill a chunk)
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator(balance_classes=True)
    # gen.prepare_for_use()
    mde = MDEMethod(NaiveBayes(), chunk_size, KNORAE(), alpha=0.0)
    X, y = gen.next_sample(n_samples)
    mde.partial_fit(X, y)
    assert len(mde.ensemble) == n_samples // chunk_size
Example #19
def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'PerceptronMask: - penalty: None - alpha: 0.0001 - fit_intercept: True - max_iter: 1000 ' \
                    '- tol: 0.001 - shuffle: True - eta0: 1.0 - warm_start: False - class_weight: None - n_jobs: 1'

    assert learner.get_info() == expected_info

    # Coverage tests
    learner.reset()
    learner.fit(X=X_batch[:4500], y=y_batch[:4500])
    y_pred = learner.predict(X=X_batch[4501:])
    accuracy = accuracy_score(y_true=y_batch[4501:], y_pred=y_pred)
    expected_accuracy = 0.8897795591182365
    # assert np.isclose(expected_accuracy, accuracy)  # Removed due to a non-replicable error in the Travis build

    assert 'estimator' == learner.get_class_type()

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
Example #20
def test_naive_bayes(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
        1
    ])

    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
Example #21
def test_online_csb2():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineCSB2Classifier(base_estimator=nb,
                                   n_estimators=3,
                                   cost_positive=1,
                                   cost_negative=0.9,
                                   random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]

    expected_correct_predictions = 43
    expected_performance = 0.8775510204081632

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineCSB2Classifier(base_estimator=NaiveBayes(nominal_attributes=None), cost_negative=0.9, " \
                    "cost_positive=1, drift_detection=True, n_estimators=3, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
Example #22
def test_online_rus_1():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineRUSBoost(base_estimator=nb,
                             n_estimators=3,
                             sampling_rate=5,
                             algorithm=1,
                             random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        1
    ]

    expected_correct_predictions = 33
    expected_performance = 0.673469387755102

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineRUSBoost(algorithm=1, base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "               drift_detection=True, n_estimators=3, random_state=112,\n" \
                    "               sampling_rate=5)"
    assert learner.get_info() == expected_info
Example #23
def test_leverage_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = LeverageBaggingClassifier(base_estimator=knn,
                                        n_estimators=3,
                                        random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8571428571428571
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 42
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "LeverageBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, max_window_size=2000, " \
                    "n_neighbors=8, nominal_attributes=None), delta=0.002, enable_code_matrix=False, " \
                    "leverage_algorithm='leveraging_bag', n_estimators=3, random_state=112, w=6)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
Example #24
def demo():
    """ _test_streams
    
    This demo tests if the streams are correctly generating samples.
    
    :return: 
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    stream.prepare_for_use()
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00,
                                        n_centroids=50,
                                        model_seed=32523423,
                                        instance_seed=5435,
                                        n_classes=2,
                                        n_features=10,
                                        num_drift_centroids=50)
    rbf_drift.prepare_for_use()

    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
Example #25
def test_additive_expert_ensemble_weakest():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()

    learner = AdditiveExpertEnsemble(3,
                                     NaiveBayes(),
                                     beta=0.5,
                                     gamma=0.1,
                                     pruning='weakest')

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0
    first = True

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)

    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    expected_correct_predictions = 45
    expected_performance = 0.9183673469387755

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = "AdditiveExpertEnsemble(base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "                       beta=0.5, gamma=0.1, n_estimators=3, pruning='weakest')"
    assert learner.get_info() == expected_info
Example #26
def test_oza_bagging_adwin():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNN(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingAdwin(base_estimator=knn,
                              n_estimators=3,
                              random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OzaBaggingAdwin(base_estimator=KNN(leaf_size=40, max_window_size=2000,\n" \
                    "                                   n_neighbors=8, nominal_attributes=None),\n" \
                    "                n_estimators=3, random_state=112)"
    assert learner.get_info() == expected_info
Example #27
def test_oza_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingClassifier(base_estimator=knn,
                                   n_estimators=3,
                                   random_state=112)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OzaBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, " \
                    "max_window_size=2000, metric='euclidean', n_neighbors=8), " \
                    "n_estimators=3, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
Example #28
def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    max_size_kb = 20
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    # Left unconstrained, the model grows beyond 50 kB
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='mc', memory_estimate_period=200, max_byte_size=max_size_kb*2**10,
        min_samples_reevaluate=2500
    )

    learner.partial_fit(X, y, classes=stream.target_values)
    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4, n_num_features=1, max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    X, y = stream.next_sample(5000)
    learner = ExtremelyFastDecisionTreeClassifier(leaf_prediction='nba', nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=stream.target_values)
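
# calculate_object_size is a scikit-multiflow utility; with unit 'kB' it reports
# the recursively measured footprint used in the memory-bound assertion above:
from skmultiflow.utils import calculate_object_size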
Example #29
def test_dynamic_weighted_majority():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)

    learner = DynamicWeightedMajorityClassifier(3,
                                                NaiveBayes(),
                                                beta=0.5,
                                                theta=0.01)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0
    first = True

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    expected_correct_predictions = 44
    expected_performance = 0.8979591836734694

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = 'DynamicWeightedMajorityClassifier(base_estimator=NaiveBayes(nominal_attributes=None),\n' \
                    '                                  beta=0.5, n_estimators=3, period=50,\n' \
                    '                                  theta=0.01)'
    assert learner.get_info() == expected_info
Example #30
def test_online_adac2():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineAdaC2(base_estimator=nb,
                          n_estimators=3,
                          random_state=112,
                          cost_positive=1,
                          cost_negative=1)
    first = True

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
        1
    ]
    expected_correct_predictions = 44
    expected_performance = 0.8979591836734694

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray