Example #1
def demo():
    """ _test_filters
    
    This demo tests the MissingValuesCleaner filter. The transform is set 
    to clean any value equal to -47, replacing it with the median of the 
    last 10 samples (or of however many samples are available, if fewer 
    than 10 have been seen). 
    
    The output is the 10 instances used in the transform. The first 9 
    are left untouched, as none of their feature values equal -47. The 
    last sample has its first feature value equal to -47, so that value 
    is replaced by the median of the first 9 samples.
    
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")

    cleaner = MissingValuesCleaner(-47, 'median', 10)

    X, y = stream.next_sample(10)

    X[9, 0] = -47

    for i in range(10):
        temp = cleaner.partial_fit_transform([X[i].tolist()])
        print(temp)
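Since partial_fit_transform accepts a 2-D batch, the row-by-row loop above can be collapsed into a single call. A minimal sketch (run in place of the loop, not after it), assuming the incremental median window behaves the same when the rows arrive as one batch:

# Clean all 10 rows in one call instead of row by row
cleaned = cleaner.partial_fit_transform(X.tolist())
print(cleaned)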
Example #2
    def get_file_stream2(self, path):
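        """Open the CSV at `path` three times and return three independent FileStream objects."""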
        from skmultiflow.data.file_stream import FileStream
        stream = FileStream(path, n_targets=1, target_idx=-1)
        stream2 = FileStream(path, n_targets=1, target_idx=-1)
        stream3 = FileStream(path, n_targets=1, target_idx=-1)

        return stream, stream2, stream3
Example #3
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. In this case we use a file stream 
    which gets its samples from the movingSquares.csv file, inside the datasets 
    folder.

    Then we need to set up a classifier, which in this case is an instance 
    of scikit-multiflow's SAMKNN. Then, optionally, we create a 
    pipeline structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    opt = FileOption("FILE", "OPT_NAME", "../datasets/movingSquares.csv",
                     "CSV", False)
    stream = FileStream(opt, -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = SAMKNN(n_neighbors=5,
                        knnWeights='distance',
                        maxSize=1000,
                        STMSizeAdaption='maxACCApprox',
                        useLTM=False)
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    #pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=0,
                                    max_instances=instances,
                                    batch_size=1,
                                    n_wait=100,
                                    max_time=1000,
                                    output_file=output_file,
                                    task_type='classification',
                                    show_plot=True,
                                    plot_options=['performance'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=classifier)
Example #4
def demo():
    """ _test_filters
    
    This demo tests the MissingValuesCleaner filter. The transform is set 
    to clean any value equal to -47, replacing it with the median of the 
    last 10 samples (or of however many samples are available, if fewer 
    than 10 have been seen). 
    
    The output is the 10 instances used in the transform. The first 9 
    are left untouched, as none of their feature values equal -47. The 
    last sample has its first feature value equal to -47, so that value 
    is replaced by the median of the first 9 samples.
    
    """
    opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv', False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    cleaner = MissingValuesCleaner(-47, 'median', 10)

    X, y = stream.next_instance(10)

    X[9, 0] = -47

    for i in range(10):
        temp = cleaner.partial_fit_transform([X[i].tolist()])
        print(temp)
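This is the same MissingValuesCleaner demo as Example #1, written against the older API: the stream is built from a FileOption and needs prepare_for_use(), and next_instance() was later renamed next_sample().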
Example #5
def test_sam_knn(package_path):

    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_big.csv')

    stream = FileStream(test_file)
    stream.prepare_for_use()

    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance', 'STMSizeAdaption': 'maxACCApprox',
                   'useLTM': False}

    learner = SAMKNN(n_neighbors=hyperParams['nNeighbours'], max_window_size=hyperParams['maxSize'],
                     weighting=hyperParams['knnWeights'],
                     stm_size_option=hyperParams['STMSizeAdaption'], use_ltm=hyperParams['useLTM'])

    cnt = 0
    max_samples = 5000
    predictions = array('d')

    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0,
                                       0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0,
                                       1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0,
                                       0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0,
                                       0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0])

    assert np.alltrue(predictions == expected_predictions)
Example #6
def test_KNN_adwin(test_path, package_path):
    test_file = os.path.join(package_path,
                             'src/skmultiflow/data/datasets/sea_big.csv')
    stream = FileStream(test_file, -1, 1)
    stream.prepare_for_use()
    learner = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)

    cnt = 0
    max_samples = 5000
    predictions = []
    correct_predictions = 0

    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1
    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
        1
    ]
    expected_correct_predictions = 40
    expected_performance = 0.8163265306122449

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions
Example #7
def train_tree(csv_path, tree):

    print("Training the tree")

    stream = FileStream(csv_path)

    accuracy = 0
    n_samples = 0
    correct_cnt = 0

    t0 = time.time()

    while stream.has_more_samples():
        X, y = stream.next_sample()
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1

    t1 = time.time()
    total = t1 - t0

    accuracy = 100.0 * correct_cnt / n_samples

    print("Training data instances: ", n_samples)
    print("Tree trained on ", n_samples, " instances & has ", accuracy,
          "% accuracy.")
    print("Training tree completed in ", total, " (s)")
Example #8
def demo(instances=2000):
    """ _test_comparison_prequential
    
    This demo runs a prequential evaluation with more than one learner, 
    which makes it a comparison task.
    
    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
     
    """
    # Stream setup
    stream = FileStream("../datasets/covtype.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    stream.prepare_for_use()
    # Setup the classifier
    clf = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    clf_one = KNNAdwin(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(h=KNN(), ensemble_length=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                     34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                                     44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                                 ]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                        [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='teste.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True,
                                    metrics=['performance', 'kappa_t'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example #9
def demo_parameterized(h, dset="sea_stream.csv", show_plot=True):
    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/"+dset, "CSV", False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    # Set up and run the prequential evaluator
    T_init = 100
    evaluator = EvaluatePrequential(pretrain_size=T_init,
                                    output_file='output.csv',
                                    max_instances=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    task_type='classification',
                                    show_plot=show_plot,
                                    plot_options=['performance'])
    evaluator.eval(stream=stream, classifier=h)
Example #10
def demo(output_file=None, instances=40000):
    """ _test_prequential
    
    This demo shows how to produce a prequential evaluation.
    
    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the sea_big.csv file, inside the datasets 
    folder.
    
    Then we need to set up a classifier, which in this case is an instance 
    of sklearn's PassiveAggressiveClassifier. Then, optionally, we create a 
    pipeline structure, initialized on that classifier.
    
    The evaluation is then run.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    stream = FileStream("../data/datasets/sea_big.csv", -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(n_neighbors=8, max_window_size=2000,leaf_size=40, nominal_attributes=None)
    # classifier = OzaBaggingAdwin(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=200,
        max_samples=instances,
        batch_size=1,
        n_wait=100,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
Example #11
def demo_parameterized(h, filename="covtype.csv", show_plot=True):
    # Setup Stream
    stream = FileStream("../datasets/" + filename, -1, 1)
    stream.prepare_for_use()

    # Set up and run the prequential evaluator
    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='output.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=show_plot,
                                    metrics=['performance'])
    evaluator.evaluate(stream=stream, model=h)
Example #12
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    stream = FileStream("../datasets/sea_stream.csv", -1, 1)
    stream.prepare_for_use()

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='output.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True,
                                    metrics=['performance'])
    evaluator.evaluate(stream=stream, model=h)
Example #13
def test_sam_knn_coverage(package_path):

    test_file = os.path.join(package_path,
                             'src/skmultiflow/data/datasets/sea_big.csv')

    stream = FileStream(test_file)
    stream.prepare_for_use()

    hyperParams = {
        'maxSize': 50,
        'n_neighbors': 3,
        'weighting': 'uniform',
        'stm_size_option': 'maxACC',
        'min_stm_size': 10,
        'useLTM': True
    }

    learner = SAMKNN(n_neighbors=hyperParams['n_neighbors'],
                     max_window_size=hyperParams['maxSize'],
                     weighting=hyperParams['weighting'],
                     stm_size_option=hyperParams['stm_size_option'],
                     min_stm_size=hyperParams['min_stm_size'],
                     use_ltm=hyperParams['useLTM'])

    cnt = 0
    max_samples = 1000
    predictions = array('i')

    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
        1
    ])
    assert np.alltrue(predictions == expected_predictions)
Example #14
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV",
                     False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    T_init = 100
    evaluator = EvaluatePrequential(pretrain_size=T_init,
                                    output_file='output.csv',
                                    max_instances=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    task_type='classification',
                                    show_plot=True,
                                    plot_options=['performance'])
    evaluator.eval(stream=stream, classifier=h)
Example #15
def test_tree(csv_path, tree):

    print("Testing the tree")
    
    stream = FileStream(csv_path)

    n_samples = 0
    correct_cnt = 0
    
    t2 = time.time()
    
    y_true_all = list()
    y_pred_all = list()
    while stream.has_more_samples():
        X, y = stream.next_sample()
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1
        
        y_true_all.append(y[0])
        y_pred_all.append(y_pred[0])

    t3 = time.time()
    total = t3 - t2
    
    accuracy = 100.0 * correct_cnt / n_samples
    fscore = f1_score(y_true_all, y_pred_all, average='binary')
    gm = geometric_mean_score(y_true_all, y_pred_all, average='binary')
    
    print("Test data instances: ", n_samples)
    print("Tree tested on ", n_samples, " instances & has ", accuracy, "% accuracy.")
    print("Tree has F-score: %.3f" % fscore)
    print("Tree has GM: %.3f" % gm)
    print("Testing tree completed in ", total, " (s)")
    
    return round(fscore, 3), round(gm, 3)
Example #17
def demo():
    """ _test_stream_speed
    
    This demo tests the sample generation speed of the file stream.
    
    """
    # Setup the stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    stream = RandomRBFGeneratorDrift()
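    # NOTE: the FileStream set up above is immediately replaced; the speed
    # test below actually runs on the RandomRBFGeneratorDrift instance.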
    stream.prepare_for_use()

    # Test with RandomTreeGenerator
    #opt_list = [['-c', '2'], ['-o', '0'], ['-u', '5'], ['-v', '4']]
    #stream = RandomTreeGenerator(opt_list)
    #stream.prepare_for_use()

    # Setup the evaluator
    evaluator = EvaluateStreamGenerationSpeed(100000, float("inf"), None, 5)

    # Evaluate
    evaluator.eval(stream)
Example #18
def demo():
    """ _test_mol

    This demo tests the MOL learner on a file stream, which reads from 
    the music.csv file.

    The test computes the performance of the MOL learner as well as 
    the time to create the structure and classify all the samples in 
    the file.

    """
    # Setup logging
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Setup the file stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False)
    stream = FileStream(opt, 0, 6)
    stream.prepare_for_use()

    # Setup the classifier, by default it uses Logistic Regression
    #classifier = MultiOutputLearner()
    #classifier = MultiOutputLearner(h=SGDClassifier(n_iter=100))
    classifier = MultiOutputLearner(h=Perceptron())

    # Setup the pipeline
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pre training on %s samples', str(pretrain_size))
    X, y = stream.next_instance(pretrain_size)
    #classifier.fit(X, y)
    pipe.partial_fit(X, y, classes=stream.get_classes())
    count = 0
    true_labels = []
    predicts = []
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_instances():
        X, y = stream.next_instance()
        #p = classifier.predict(X)
        p = pipe.predict(X)
        predicts.extend(p)
        true_labels.extend(y)
        count += 1
    perf = hamming_score(true_labels, predicts)
    logging.info('Evaluation time: %s s', str(timer() - init_time))
    logging.info('Total samples analyzed: %s', str(count))
    logging.info('The classifier\'s static Hamming score    : %0.3f' % perf)
Example #19
def demo():
    """ _test_knn
    
    This demo tests the KNN classifier on a file stream whose instances 
    come from a SEA generator. 
    
    The test computes the performance of the KNN classifier as well as 
    the time to create the structure and classify max_samples (5000 by 
    default) instances.
    
    """
    opt = FileOption('FILE', 'OPT_NAME', '../datasets/sea_big.csv', 'csv',
                     False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()
    train = 200
    X, y = stream.next_instance(train)
    #t = OneHotToCategorical([[10, 11, 12, 13],
    #                         [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                          36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])
    #t2 = OneHotToCategorical([[10, 11, 12, 13],
    #                         [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                          36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])
    start = timer()
    knn = KNN(k=8, max_window_size=2000, leaf_size=40)
    #pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    #compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')

    #pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])

    #pipe.fit(X, y)
    #pipe2.fit(X, y)
    knn.partial_fit(X, y)
    #compare.fit(X, y)

    n_samples = 0
    max_samples = 5000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        X, y = stream.next_instance()
        #my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        #compare_pred = pipe2.predict(X)
        #compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        #if y[0] == compare_pred[0]:
        #    compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
Example #20
def test_KNN(test_path, package_path):
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_big.csv')
    stream = FileStream(test_file, -1, 1)
    stream.prepare_for_use()

    learner = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
                            0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
                            1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]

    assert np.alltrue(predictions == expected_predictions)
Example #21
def demo():
    """ _test_streams
    
    This demo tests if the streams are correctly generating samples.
    
    :return: 
    """
    opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv',
                     False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00,
                                        num_centroids=50,
                                        model_seed=32523423,
                                        instance_seed=5435,
                                        num_classes=2,
                                        num_att=10,
                                        num_drift_centroids=50)
    rbf_drift.prepare_for_use()

    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_instance()
    print(X)
    print(y)

    X, y = sea.next_instance()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_instance(10)
    print(X)
    print(y)

    X, y = sea.next_instance(10)
    print(X)
    print(y)
Example #22
from skmultiflow.classification.lazy.knn import KNN
from skmultiflow.classification.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.options.file_option import FileOption

from my_classifier import BatchClassifier

dataset = "elec"

# 1. Create a stream
opt = FileOption("FILE", "OPT_NAME", "./data/" + dataset + ".csv", "CSV",
                 False)
stream = FileStream(opt, -1, 1)
# 2. Prepare for use
stream.prepare_for_use()
# 3. Instantiate the classifiers to compare
h = [
    KNN(k=10, max_window_size=100, leaf_size=30),
    HoeffdingTree(),
    BatchClassifier(window_size=100, max_models=10),
]
# 4. Set up the evaluator
evaluator = EvaluatePrequential(pretrain_size=1000,
                                output_file='result_' + dataset + '.csv',
                                max_instances=10000,
                                batch_size=1,
                                n_wait=500,
                                max_time=1000000000,
                                task_type='classification',
                                show_plot=True,
                                plot_options=['performance'])
# 5. Run the evaluation
evaluator.eval(stream=stream, classifier=h)
Example #23
from skmultiflow.data.file_stream import FileStream
from skmultiflow.trees.hoeffding_tree import HoeffdingTreeClassifier
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential

# Create a stream
stream = FileStream("elec.csv")
stream.prepare_for_use()  # Not required for v0.5.0+

# Instantiate the HoeffdingTreeClassifier
ht = HoeffdingTreeClassifier()

# Setup the evaluator
evaluator = EvaluatePrequential(pretrain_size=1000,
                                max_samples=10000,
                                output_file='results.csv')

# Run evaluation
evaluator.evaluate(stream=stream, model=ht)
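The output_file argument makes the evaluator write its measurements to disk. A quick way to inspect that log, assuming the default layout in which metadata lines are prefixed with '#':

# Load the evaluation log written above, skipping the commented metadata header
import pandas as pd

results = pd.read_csv('results.csv', comment='#')
print(results.head())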
Example #24
def test_file_stream(test_path, package_path):
    test_file = os.path.join(package_path,
                             'src/skmultiflow/data/datasets/sea_stream.csv')
    stream = FileStream(test_file)
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'sea_stream.csv - 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]
Example #25
def test_random_rbf_generator(test_path, package_path):
    test_file = os.path.join(package_path,
                             'src/skmultiflow/datasets/sea_stream.csv')
    file_option = FileOption('FILE', 'sea', test_file, 'csv', False)
    stream = FileStream(file_option)
    stream.prepare_for_use()

    assert stream.estimated_remaining_instances() == 40000

    expected_header = ['attrib1', 'attrib2', 'attrib3']
    assert stream.get_attributes_header() == expected_header

    expected_classes = [0, 1]
    assert stream.get_classes() == expected_classes

    assert stream.get_classes_header() == ['class']

    assert stream.get_num_attributes() == 3

    assert stream.get_num_nominal_attributes() == 0

    assert stream.get_num_numerical_attributes() == 3

    assert stream.get_num_targets() == 1

    assert stream.get_num_values_per_nominal_attribute() == 0

    assert stream.get_plot_name() == 'sea_stream.csv - 2 class labels'

    assert stream.has_more_instances() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_instance()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.get_last_instance()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_instance(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)
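Despite its name, this test exercises the same FileStream behaviour as test_file_stream in Example #24, through the older getter API: estimated_remaining_instances(), get_attributes_header(), get_classes(), next_instance() and get_last_instance() correspond to n_remaining_samples(), feature_names, target_values, next_sample() and last_sample() in later releases.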
Example #26
from skmultiflow.classification.lazy.knn import KNN
from skmultiflow.data.file_stream import FileStream
import time
# Optional debugger breakpoint (requires pudb): import pudb; pu.db
f = "/opt/techgig/scikit-multiflow/src/skmultiflow/datasets/sea_big.csv"
stream = FileStream(f, -1, 1)
stream.prepare_for_use()
X, y = stream.next_sample(200)
knn = KNN(k=8, max_window_size=2000, leaf_size=40)
knn.partial_fit(X, y)
n_samples = 0
corrects = 0
while n_samples < 5000:
    X, y = stream.next_sample()
    my_pred = knn.predict(X)
    if y[0] == my_pred[0]:
        corrects += 1
    knn = knn.partial_fit(X, y)
    n_samples += 1
    print("KNN's performance: " + str(corrects/n_samples))
    time.sleep(1)
print('KNN usage example')
print(str(n_samples) + ' samples analyzed.')
print("KNN's performance: " + str(corrects/n_samples))
Example #27
def demo():
    """ _test_kdtree_compare
    
    This demo compares creation and query speed for different kd tree 
    implementations. They are fed with instances from the covtype dataset. 
    
    Three kd tree implementations are compared: SciPy's KDTree, 
    scikit-multiflow's KDTree and scikit-learn's KDTree. For each of them 
    the demo will time the construction of the tree on 1000 instances, and 
    then measure the time to query 100 instances. The results are displayed 
    in the terminal.
    
    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv', False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()
    one_hot = OneHotToCategorical([[10, 11, 12, 13],
                                   [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                    38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                                    50, 51, 52, 53]])

    X, y = stream.next_instance(1000)
    X = one_hot.transform(X)
    #print(X)

    X_find, y = stream.next_instance(100)
    X_find = one_hot.transform(X_find)
    print(X_find[4])
    # SciPy kd tree
    start = timer()
    scipy = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        ind = scipy.query(X_find[i], 8)
        #print(ind)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    del scipy

    # scikit-multiflow kd tree
    start = timer()
    skm_tree = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        ind, dist = skm_tree.query(X_find[i], 8)
        #print(ind)
        #print(dist)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del skm_tree

    # Sklearn kdtree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(100):
        ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True)
        #print(ind)
        #print(dist)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk
Example #28
# output: (10000, 11)

# Store it in csv
data.to_csv('data_stream.csv', index=False)

# Applying Hoeffding Tree on the synthetic data stream

# Import the relevant libraries
from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.data.file_stream import FileStream
import pandas as pd
import numpy as np

# Load the synthetic data stream
dstream = FileStream('data_stream.csv')
dstream.prepare_for_use()

# Create the model instance
ht_class = HoeffdingTreeClassifier()

# perform prequential evaluation
evaluate1 = EvaluatePrequential(show_plot=False,
                                pretrain_size=400,
                                max_samples=10000,
                                metrics=['accuracy'])
evaluate1.evaluate(stream=dstream, model=ht_class)

###################################################

# Hoeffding Adaptive tree
Example #29
from skmultiflow.trees import HoeffdingTree
from hoeffdingOptionTree import HOT
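# HOT is assumed to be a user-supplied Hoeffding Option Tree implementation
# provided by the local hoeffdingOptionTree module (not part of scikit-multiflow).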
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.data.file_stream import FileStream
import matplotlib

matplotlib.interactive(True)

dataset = "elec"

# 1. Create a stream

stream = FileStream(dataset+".csv", n_targets=1, target_idx=-1)
# 2. Prepare for use
stream.prepare_for_use()
# 3. Instantiate the classifiers to compare
h = [
    HoeffdingTree(),
    HOT()
]
# 4. Set up the evaluator

evaluator = EvaluatePrequential(pretrain_size=1000, max_samples=20000, show_plot=True,
                                metrics=['accuracy', 'kappa'], output_file='result_'+dataset+'.csv',
                                batch_size=1)
# 5. Run
evaluator.evaluate(stream=stream, model=h)
Example #30
class ARASActiveLearningTools(object):
    def __init__(self):
        self.id = 'ARAS_active_learning_tools'

        self.logger = Log(self.id)

        self.load_data()
        self.load_models()

    def load_data(self):
        self.logger.log(
            'Loading data files: train, validation, and annotations...')

        self.train = pd.read_csv('data/train.csv', dtype=int)
        print(self.train.shape)
        print(self.train.head())

        self.header = list(self.train.columns.values)

        self.validation = pd.read_csv('data/validation.csv', dtype=int)
        print(self.validation.shape)
        print(self.validation.head())

        self.annotations = pd.read_csv('data/annotations.csv', dtype=int)
        self.annotations.columns = self.header
        print(self.annotations.shape)
        print(self.annotations.head())

    def load_merged(self):
        self.merged = pd.read_csv('data/merged.csv', dtype=int)
        print(self.merged.shape)
        print(self.merged.head())

    def load_models(self):
        self.model_1_train = pickle.load(open('models/train/Model1.p', 'rb'))
        self.model_2_train = pickle.load(open('models/train/Model2.p', 'rb'))
        self.model_3_train = pickle.load(open('models/train/Model3.p', 'rb'))

        self.model_1_with_annotations = pickle.load(
            open('models/with_annotations/Model1.p', 'rb'))
        self.model_2_with_annotations = pickle.load(
            open('models/with_annotations/Model2.p', 'rb'))
        self.model_3_with_annotations = pickle.load(
            open('models/with_annotations/Model3.p', 'rb'))

    def merge(self):
        self.logger.log(
            'Merging the annotated data with the original training set...')

        self.merged = pd.concat([self.train, self.annotations])
        print(self.merged.shape)
        print(self.merged.head())

        self.logger.log('Saving dataframe...')

        self.merged.to_csv('data/merged.csv', index=False)

        self.logger.log_great('Done.')

    # see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics for available metrics
    def validate(self):
        # self.load_merged()
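        # NOTE: `labels` is assumed to be defined elsewhere (e.g. at module
        # level) as the list of class labels passed to sklearn's metrics.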
        y_true = self.validation['R1'].tolist()

        models_train = [
            self.model_1_train, self.model_2_train, self.model_3_train
        ]
        models_with_annotations = [
            self.model_1_with_annotations, self.model_2_with_annotations,
            self.model_3_with_annotations
        ]

        self.logger.log(
            'Predicting with models trained only on the training set...')

        self.stream = FileStream('data/validation.csv')
        num_samples = self.stream.n_remaining_samples()

        for model in models_train:
            self.stream.restart()
            y_pred_train = self.predict(model, self.stream)
            cr = classification_report(y_true, y_pred_train, labels=labels)
            cm = confusion_matrix(y_true, y_pred_train, labels=labels)
            print(cr)
            pprint.pprint(cm)

        self.logger.log(
            'Predicting with models trained with the annotations added...')

        for model in models_with_annotations:
            self.stream.restart()
            y_pred_with_annotations = self.predict(model, self.stream)
            cr = classification_report(y_true,
                                       y_pred_with_annotations,
                                       labels=labels)
            cm = confusion_matrix(y_true,
                                  y_pred_with_annotations,
                                  labels=labels)
            print(cr)
            pprint.pprint(cm)

    def predict(self, model, stream):
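        """Run model.predict() on every sample in the stream and collect the predictions."""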
        y_pred = []
        count = 0
        while stream.has_more_samples():
            X, y = stream.next_sample()
            # predict() returns an array; keep only the single predicted label
            y_pred.append(model.predict(X)[0])
            if (count % 50000) == 0:
                print('Predictions so far:', count)
            count = count + 1

        return y_pred