示例#1
0
def test_knn_adwin():
    """Regression test for KNNADWINClassifier on a SEA stream with drift.

    The learner is trained one sample at a time on 1000 instances. Every
    20th sample (the very first one excluded) is predicted before being
    learned, and both the predictions and the number of hits are compared
    with recorded reference values. Afterwards ``reset``, ``get_info`` and
    the return types of ``predict`` / ``predict_proba`` are checked.
    """
    stream = ConceptDriftStreamGenerator(stream=SEAGenerator(random_state=1),
                                         drift_stream=SEAGenerator(
                                             random_state=2,
                                             classification_function=2),
                                         random_state=1,
                                         position=250,
                                         width=10)

    learner = KNNADWINClassifier(n_neighbors=8,
                                 leaf_size=40,
                                 max_window_size=200)

    cnt = 0
    max_samples = 1000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        1
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.data_window.size == 0

    expected_info = "KNNADWINClassifier(leaf_size=40, max_window_size=200, " \
                    "metric='euclidean', n_neighbors=8)"
    # Collapse every run of whitespace in the info string to a single space.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
示例#2
0
def OzaBagging(base_estimator=KNNADWINClassifier(), n_estimators=10,
               random_state=None):  # pragma: no cover
    """Deprecated alias that builds an ``OzaBaggingClassifier``.

    Emits a ``FutureWarning`` about the rename and forwards all arguments
    unchanged to ``OzaBaggingClassifier``, whose instance is returned.
    """
    # stacklevel=2 makes the warning point at the caller's line instead of
    # at this wrapper.
    warnings.warn("'OzaBagging' has been renamed to 'OzaBaggingClassifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0", category=FutureWarning,
                  stacklevel=2)
    return OzaBaggingClassifier(base_estimator=base_estimator,
                                n_estimators=n_estimators,
                                random_state=random_state)
    def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10,
                 sampling_rate=3, algorithm=1, drift_detection=True,
                 random_state=None):
        """Record the constructor arguments and create the empty state.

        Every argument is stored unchanged on the instance under its own
        name. All remaining attributes start out as ``None`` and
        ``__configure`` is called last (its body is not part of this
        method).
        """
        super().__init__()

        # Constructor arguments, kept exactly as given.
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.sampling_rate = sampling_rate
        self.algorithm = algorithm
        self.drift_detection = drift_detection

        # Internal state: nothing is allocated yet.
        self.ensemble = self.actual_n_estimators = None
        self.classes = self._random_state = None
        self.adwin_ensemble = None
        self.lam_sc = self.lam_pos = self.lam_neg = self.lam_sw = None
        self.epsilon = None

        self.__configure()
示例#4
0
 def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10,
              cost_positive=1, cost_negative=0.1, drift_detection=True,
              random_state=None):
     """Record the constructor arguments and create the empty state.

     Every argument is stored unchanged on the instance under its own
     name; all other attributes set here start out as ``None``.
     """
     super().__init__()

     # Internal state: nothing is allocated yet.
     self.ensemble = self.actual_n_estimators = None
     self.classes = self._random_state = None

     # Constructor arguments, kept exactly as given.
     self.base_estimator = base_estimator
     self.n_estimators = n_estimators
     self.cost_positive = cost_positive
     self.cost_negative = cost_negative
     self.drift_detection = drift_detection
     self.random_state = random_state

     # Remaining bookkeeping attributes, also unset for now.
     self.adwin_ensemble = None
     self.lam_fn = self.lam_fp = self.lam_sum = self.lam_sw = None
     self.werr = self.epsilon = None
 def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10,
              random_state=None):
     """Delegate the arguments to the parent class, then configure.

     The three arguments are passed positionally, in this order, to the
     parent constructor. ``adwin_ensemble`` starts as ``None`` and
     ``__configure`` is called last (its body is not part of this method).
     """
     super().__init__(
         base_estimator,
         n_estimators,
         random_state,
     )
     # No per-estimator state has been created at this point.
     self.adwin_ensemble = None
     self.__configure()
def OnlineBoosting(base_estimator=KNNADWINClassifier(), n_estimators=10, drift_detection=True,
                   random_state=None):     # pragma: no cover
    """Deprecated alias that builds an ``OnlineBoostingClassifier``.

    Emits a ``FutureWarning`` about the rename and forwards all arguments
    unchanged to ``OnlineBoostingClassifier``, whose instance is returned.
    """
    # stacklevel=2 makes the warning point at the caller's line instead of
    # at this wrapper.
    warnings.warn("'OnlineBoosting' has been renamed to 'OnlineBoostingClassifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0", category=FutureWarning,
                  stacklevel=2)
    return OnlineBoostingClassifier(base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    drift_detection=drift_detection,
                                    random_state=random_state)
def OnlineAdaC2(base_estimator=KNNADWINClassifier(), n_estimators=10, cost_positive=1, cost_negative=0.1,
                drift_detection=True, random_state=None):     # pragma: no cover
    """Deprecated alias that builds an ``OnlineAdaC2Classifier``.

    Emits a ``FutureWarning`` about the rename and forwards all arguments
    unchanged to ``OnlineAdaC2Classifier``, whose instance is returned.
    """
    # stacklevel=2 makes the warning point at the caller's line instead of
    # at this wrapper.
    warnings.warn("'OnlineAdaC2' has been renamed to 'OnlineAdaC2Classifier' in v0.5.0.\n"
                  "The old name will be removed in v0.7.0", category=FutureWarning,
                  stacklevel=2)
    return OnlineAdaC2Classifier(base_estimator=base_estimator,
                                 n_estimators=n_estimators,
                                 cost_positive=cost_positive,
                                 cost_negative=cost_negative,
                                 drift_detection=drift_detection,
                                 random_state=random_state)
示例#8
0
 def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None):
     """Record the constructor arguments and create the empty state.

     The arguments are stored unchanged under their own names. The other
     attributes start out as ``None`` and ``__configure`` is called last
     (its body is not part of this method).
     """
     super().__init__()

     # Internal state: nothing is allocated yet.
     self.ensemble = self.actual_n_estimators = self.classes = None
     # This is the actual random_state object used internally
     self._random_state = None

     # Constructor arguments, kept exactly as given.
     self.base_estimator = base_estimator
     self.n_estimators = n_estimators
     self.random_state = random_state

     self.__configure()
 def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, sampling_rate=1, drift_detection=True,
              random_state=None):
     """Record the constructor arguments and create the empty state.

     All arguments are stored unchanged under their own names, except
     ``random_state``, which is kept as ``_init_random_state``. The other
     attributes start out as ``None``.
     """
     super().__init__()

     # Constructor arguments, kept exactly as given.
     self.base_estimator = base_estimator
     self.n_estimators = n_estimators
     self._init_random_state = random_state
     self.sampling_rate = sampling_rate
     self.drift_detection = drift_detection

     # Internal state: nothing is allocated yet.
     self.ensemble = self.actual_n_estimators = self.classes = None
     self._random_state = self.adwin_ensemble = None
示例#10
0
def demo():
    """ _test_oza_bagging_adwin

    Exercise an OzaBaggingADWINClassifier that uses KNNADWINClassifier as
    base estimator on samples drawn from a SEAGenerator.

    The classifier is first fitted on a small batch, then every following
    sample is predicted before it is learned, for max_samples (20000)
    instances. Progress is logged along the way; the number of samples and
    the fraction of correct predictions are printed at the end.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    base_knn = KNNADWINClassifier(n_neighbors=8, max_window_size=2000,
                                  leaf_size=30)
    clf = OzaBaggingADWINClassifier(base_estimator=base_knn, n_estimators=2,
                                    random_state=1)

    max_samples = 20000
    # True division on purpose: the logged progress value stays a float.
    progress_step = max_samples / 20
    train_size = 10

    # The class labels are only handed over on the very first partial_fit.
    needs_classes = True
    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        needs_classes = False

    sample_count = 0
    correctly_classified = 0
    while sample_count < max_samples:
        if sample_count % progress_step == 0:
            logging.info('%s%%', sample_count // progress_step * 5)

        # Predict first, learn afterwards.
        X, y = stream.next_sample()
        my_pred = clf.predict(X)

        if needs_classes:
            clf.partial_fit(X, y, classes=stream.target_values)
            needs_classes = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None and y[0] == my_pred[0]:
            correctly_classified += 1

        sample_count += 1

    print('{} samples analyzed.'.format(sample_count))
    print('My performance: {}'.format(correctly_classified / sample_count))
示例#11
0
def test_pipeline(test_path):
    """Check a OneHotToCategorical + KNNADWINClassifier pipeline end to end.

    The pipeline is evaluated prequentially on the one-hot encoded test
    data found in ``test_path``; accuracy, kappa and the text returned by
    ``get_info`` are compared with recorded reference values.

    Parameters
    ----------
    test_path: str
        Directory that contains ``data-one-hot.npz``.
    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24, so the builtin is used directly.
    stream = DataStream(data=X, y=y.astype(int))

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2,
                                    max_window_size=50,
                                    leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer),
                     ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    # Collapse every run of whitespace in the info string to a single space.
    info = " ".join(pipe.get_info().split())
    assert info == expected_info
def test_pipeline(test_path):
    """Check the pipeline through the event-driven evaluation components.

    Each record of ``data-one-hot.npz`` is wrapped in a dictionary and fed
    through an ``ArrayDataSource`` to an ``EvaluationEventObserver`` that
    drives a OneHotToCategorical + KNNADWINClassifier pipeline. The
    accuracy and kappa collected by the reporter are compared with
    recorded reference values.

    Parameters
    ----------
    test_path: str
        Directory that contains ``data-one-hot.npz``.
    """
    n_categories = 5
    # One-hot width of a record: n_categories attributes with
    # n_categories values each (25 columns).
    n_features = n_categories * n_categories

    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)

    # An npz archive decodes an array again on every item access, so both
    # arrays are fetched once instead of once per record.
    X, y = data['X'], data['y']
    data_as_dict = [
        {'X': features.reshape(1, n_features),
         'y': np.array(label).reshape(1, 1)}
        for features, label in zip(X, y)
    ]

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_features, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2,
                                    max_window_size=50,
                                    leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer),
                     ('KNNADWINClassifier', classifier)])

    train_eval_trigger = PrequentialTrigger(10)
    reporter = BufferedMetricsReporter(retrieve_metrics)
    results_observer = MetricsResultObserver(ClassificationMeasurements(),
                                             reporter)
    evaluation_event_observer = EvaluationEventObserver(
        pipe, train_eval_trigger, [results_observer], [0, 1])

    data_source = ArrayDataSource(record_to_dictionary,
                                  [evaluation_event_observer], data_as_dict)

    data_source.listen_for_events()
    # NOTE(review): presumably the events are processed in the background
    # and this fixed pause gives them time to finish - confirm.
    time.sleep(3)

    expected_accuracy = 0.5555555555555556
    expected_kappa = 0.11111111111111116

    assert np.isclose(expected_accuracy, reporter.get_buffer()['accuracy'])
    assert np.isclose(expected_kappa, reporter.get_buffer()['kappa'])
# Build a seeded synthetic SEA data stream (classification function 2,
# balanced classes, noise_percentage=0.3).
dstream = SEAGenerator(classification_function=2,
                       balance_classes=True,
                       noise_percentage=0.3,
                       random_state=333)

# Retrieve five samples (the returned value is not stored here)
dstream.next_sample(5)
# Output:
# (array([[3.68721825, 0.48303666, 1.04530188],
#         [2.45403315, 8.73489354, 0.51611639],
#         [2.38740114, 2.03699194, 1.74533621],
#         [9.41738118, 4.66915281, 9.59978205],
#         [1.05404748, 0.42265956, 2.44130999]]), array([1, 0, 0, 1, 1]))

# Instantiate the KNN ADWIN classifier
adwin_knn_class = KNNADWINClassifier(n_neighbors=10, max_window_size=1000)

# Prequential evaluation: 1000 pre-training samples, at most 10000 samples,
# tracking accuracy only and without a plot
evaluate1 = EvaluatePrequential(show_plot=False,
                                pretrain_size=1000,
                                max_samples=10000,
                                metrics=['accuracy'])
# Run the evaluation
evaluate1.evaluate(stream=dstream, model=adwin_knn_class)

###################################################

# Applying SAM-KNN Classifier on the synthetic data stream
from skmultiflow.lazy import SAMKNNClassifier
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.data.sea_generator import SEAGenerator
def demo(instances=2000):
    """ _test_comparison_prequential

    Run a prequential evaluation with two models at once, which makes it a
    comparison task: a plain SGDClassifier against a pipeline made of a
    OneHotToCategorical transformer followed by a KNNADWINClassifier. The
    samples come from the covtype CSV file read through a FileStream.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    covtype_stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/covtype.csv")

    # First contender: SGD with default settings
    sgd = SGDClassifier()

    # Second contender: KNN with ADWIN behind a one-hot decoding step
    knn_adwin = KNNADWINClassifier(n_neighbors=8, max_window_size=1000,
                                   leaf_size=30)
    # Two groups of column indices: 10..13 and 14..53
    one_hot_groups = [list(range(10, 14)), list(range(14, 54))]
    to_categorical = OneHotToCategorical(one_hot_groups)
    knn_pipeline = Pipeline([('one_hot_to_categorical', to_categorical),
                             ('KNNClassifier', knn_adwin)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='test_comparison_prequential.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True)

    # Evaluate both models on the same stream
    evaluator.evaluate(stream=covtype_stream, model=[sgd, knn_pipeline])
示例#15
0
# The previous help text was copied from a detector option and described the
# wrong argument. type=float makes a value given on the command line a number,
# like the default, instead of a string.
parser.add_argument('-s',
                    '--label_size',
                    required=False,
                    type=float,
                    default=0.25,
                    help="Label size as a number (default: 0.25)")

args = parser.parse_args()

# Build the dataset path once and use it for both the message and the stream.
test_dataset = args.dataset
dataset_path = "datasets/" + test_dataset + '.csv'
print("dataset:" + dataset_path)
stream = FileStream(dataset_path)
# print(stream.get_target_values())

# Candidate classifiers
onlineBoosting = OnlineBoostingClassifier()
knn_adwin = KNNADWINClassifier(n_neighbors=8,
                               leaf_size=40,
                               max_window_size=1000)
SAMKNN = SAMKNNClassifier(n_neighbors=10,
                          weighting='distance',
                          max_window_size=500,
                          stm_size_option='maxACCApprox',
                          use_ltm=False)
learn_pp_nse = LearnPPNSEClassifier()
SGD = SGDClassifier()
rslvq = RobustSoftLearningVectorQuantization()
#CMMM2 = CMGMMClassifier(classes=stream.get_target_values(), prune_component=True, drift_detector=None)
#CMMM.train(train_dataset, 'label', 'mfcc')
#

eval = WeakEvaluatePrequential(
    show_plot=False,
示例#16
0
def test_knn_adwin():
    """Regression test for KNNADWINClassifier on a SEA stream with drift.

    Part one trains the learner one sample at a time on 1000 instances,
    predicts every 20th sample (the very first one excluded) before it is
    learned and compares predictions and hit count with recorded reference
    values. After checking ``reset`` and ``get_info``, part two restarts
    the stream, fits on one slice of a 1000-sample batch, predicts another
    slice and checks the hit count and the return types of ``predict`` /
    ``predict_proba``.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1),
                                drift_stream=SEAGenerator(
                                    random_state=2, classification_function=2),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()
    learner = KNNADWINClassifier(n_neighbors=8,
                                 leaf_size=40,
                                 max_window_size=200)

    cnt = 0
    max_samples = 1000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        1
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    expected_info = 'KNNADWINClassifier(leaf_size=40, max_window_size=200, n_neighbors=8, nominal_attributes=None)'
    # Collapse every run of whitespace in the info string to a single space.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    # NOTE(review): sample 950 is used neither for fitting nor for
    # prediction. The expected count below was recorded with exactly this
    # split, so the slices are left untouched.
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
示例#17
0
def demo():
    """ _test_knn_adwin

    This demo runs the KNNADWINClassifier on a file stream (sea_big.csv)
    next to a KNeighborsClassifier used as reference.

    Both models are first fitted on 200 samples. Afterwards each of the
    max_samples (10000) following instances is predicted by both models,
    and only the KNNADWINClassifier keeps learning from it; the reference
    model is not updated again. The elapsed time, the number of samples
    and the fraction of correct predictions of each model are printed at
    the end.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/sea_big.csv", -1, 1)

    # The two OneHotToCategorical transformers that used to be built here
    # were never used (their pipelines were commented out) and are gone.
    knn = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)

    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')

    # The class labels are only handed over on the very first partial_fit.
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False

    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples / 20) == 0:
            logging.info('%s%%', str((n_samples // (max_samples / 20) * 5)))
        X, y = stream.next_sample()

        # Predict first, learn afterwards.
        my_pred = knn.predict(X)
        if first:
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            knn.partial_fit(X, y)

        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))