def demo():
    """ _test_streams

    Smoke-test the sample generation of the streams.

    A file stream over the covtype dataset and a SEA generator are each
    asked for a single sample and then for a batch of ten samples; every
    sample drawn is printed to stdout. A drifting random RBF generator is
    instantiated as well, but no samples are drawn from it.

    :return: None
    """
    covtype_url = ("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                   "master/covtype.csv")
    stream = FileStream(covtype_url)
    # Constructed only; it is never sampled below.
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50,
                                        model_seed=32523423, instance_seed=5435,
                                        n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    sea = SEAGenerator()

    # Each round: the banner to print and the positional arguments handed to
    # next_sample (an empty tuple means "use the default batch size").
    rounds = (
        ('1 instance:\n', ()),
        ('\n\n10 instances:\n', (10,)),
    )
    for banner, batch_args in rounds:
        print(banner)
        for source in (stream, sea):
            X, y = source.next_sample(*batch_args)
            print(X)
            print(y)
def demo():
    """ _test_knn

    Evaluate the KNNClassifier on a file stream whose instances come from a
    SEA generator (sea_big.csv).

    The classifier is first fitted on 200 samples and then asked to predict
    max_samples (5000 by default) further instances, one at a time; it is
    not updated during that phase. The demo prints the elapsed time, which
    covers creating the classifier, the initial fit and the whole prediction
    loop, followed by the number of samples analyzed and the fraction of
    correct predictions.

    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_big.csv")

    pretrain_size = 200
    X, y = stream.next_sample(pretrain_size)

    # The clock starts after the pre-training batch has been read.
    # (A OneHotToCategorical pipeline and a scikit-learn KNeighborsClassifier
    # baseline used to be wired in here; both are currently disabled.)
    start = timer()
    knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn.partial_fit(X, y)

    max_samples = 5000
    n_samples = 0
    my_corrects = 0
    for _ in range(max_samples):
        X, y = stream.next_sample()
        prediction = knn.predict(X)
        if y[0] == prediction[0]:
            my_corrects += 1
        n_samples += 1
    end = timer()

    print(f'Evaluation time: {end - start}')
    print(f'{n_samples} samples analyzed.')
    print(f'My performance: {my_corrects / n_samples}')
def demo():
    """ _test_mol

    Evaluate the MOL learner (MultiOutputLearner) on a file stream that
    reads from the music.csv file.

    The learner wraps a Perceptron and is placed in a single-step pipeline.
    It is pre-trained on 150 samples, then every remaining sample of the
    stream is predicted one at a time. The demo logs the evaluation time,
    the number of samples analyzed and the Hamming score of the predictions.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Stream setup
    stream = FileStream("../data/datasets/music.csv", 0, 6)
    stream.prepare_for_use()

    # Model setup: MultiOutputLearner around a Perceptron base estimator
    # (other base estimators, e.g. an SGDClassifier, can be swapped in here).
    classifier = MultiOutputLearner(base_estimator=Perceptron())
    pipe = Pipeline([('classifier', classifier)])

    # Pre-training
    pretrain_size = 150
    logging.info('Pre training on %s samples', pretrain_size)
    logging.info('Total %s samples', stream.n_samples)
    X, y = stream.next_sample(pretrain_size)
    # Flatten the per-target label lists into one list of distinct labels.
    classes_flat = list({label for target_labels in stream.target_values
                         for label in target_labels})
    pipe.partial_fit(X, y, classes=classes_flat)

    # Evaluation: predict-only pass over the rest of the stream
    predicts = []
    true_labels = []
    count = 0
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        predicts.extend(pipe.predict(X))
        true_labels.extend(y)
        count += 1

    perf = hamming_score(true_labels, predicts)
    elapsed = timer() - init_time
    logging.info('Evaluation time: %s s', elapsed)
    logging.info('Total samples analyzed: %s', count)
    score_message = "The classifier's static Hamming score : %0.3f" % perf
    logging.info(score_message)
def demo():
    """ _test_streams

    Smoke-test the sample generation of the streams.

    A file stream over the covtype dataset and a SEA generator are each
    asked for a single sample and then for a batch of ten samples; every
    sample drawn is printed to stdout. A drifting random RBF generator is
    instantiated and prepared as well, but no samples are drawn from it.

    :return: None
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    stream.prepare_for_use()

    # Constructed and prepared only; it is never sampled below.
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50,
                                        model_seed=32523423, instance_seed=5435,
                                        n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    rbf_drift.prepare_for_use()

    sea = SEAGenerator()
    # The SEA generator is sampled below, so it is prepared like the other
    # streams (previously only the unused RBF generator was prepared).
    sea.prepare_for_use()

    # Each round: the banner to print and the positional arguments handed to
    # next_sample (an empty tuple means "use the default batch size").
    rounds = (
        ('1 instance:\n', ()),
        ('\n\n10 instances:\n', (10,)),
    )
    for banner, batch_args in rounds:
        print(banner)
        for source in (stream, sea):
            X, y = source.next_sample(*batch_args)
            print(X)
            print(y)
def demo():
    """ _test_knn_adwin

    Evaluate the KNNAdwin classifier on a file stream whose instances come
    from a SEA generator (sea_big.csv), next to a KNeighborsClassifier
    baseline.

    Both models are first fitted on 200 samples. Then max_samples (10000 by
    default) instances are processed one at a time: KNNAdwin predicts the
    instance and is afterwards updated with it, while the baseline only
    predicts and is never refitted. Progress is logged every 5%. At the end
    the demo prints the total elapsed time (measured from the very start of
    the function), the number of samples analyzed and the accuracy of both
    models.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    stream.prepare_for_use()

    # Two identical one-hot filters (columns 10-13 and 14-53). They are only
    # constructed; the pipelines that used them are disabled.
    t = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])
    t2 = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])

    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree',
                                   leaf_size=40, metric='euclidean')

    # True until KNNAdwin has been given the list of classes once.
    classes_pending = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        classes_pending = False

    max_samples = 10000
    progress_step = max_samples / 20
    my_corrects = 0
    compare_corrects = 0
    for seen in range(max_samples):
        if seen % progress_step == 0:
            logging.info('%s%%', str(seen // progress_step * 5))

        X, y = stream.next_sample()

        # Test-then-train for KNNAdwin: predict first, then learn the sample.
        my_pred = knn.predict(X)
        fit_kwargs = {'classes': stream.target_values} if classes_pending else {}
        knn.partial_fit(X, y, **fit_kwargs)
        classes_pending = False

        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
    n_samples = max_samples
    end = timer()

    print(f'Evaluation time: {end - start}')
    print(f'{n_samples} samples analyzed.')
    print(f'My performance: {my_corrects / n_samples}')
    print(f'Compare performance: {compare_corrects / n_samples}')
def demo():
    """ _test_kdtree_compare

    Compare construction and query speed of different kd tree
    implementations, fed with instances from the covtype dataset.

    Three implementations are compared: SciPy's KDTree, the KDTree imported
    by this module (scikit-multiflow's, reported as "Optimal") and the
    KDTree reported as "Sklearn". For each of them the demo times the
    construction of the tree on 1000 instances and then the time needed to
    query the 8 nearest neighbours of 100 further instances. The results
    are displayed in the terminal.

    """
    n_build = 1000    # instances used to build every tree
    n_queries = 100   # query points timed against every tree
    n_neighbors = 8

    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    # Prepare the stream before sampling, as the other demos built on this
    # FileStream constructor form do.
    stream.prepare_for_use()
    one_hot_filter = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])

    X, y = stream.next_sample(n_build)
    X = one_hot_filter.transform(X)

    X_find, y = stream.next_sample(n_queries)
    X_find = one_hot_filter.transform(X_find)
    print(X_find[4])

    # SciPy kd tree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end - start))

    start = timer()
    # Same number of queries as for the other trees, so that the reported
    # times are comparable (this loop used to run only 10 queries).
    for i in range(n_queries):
        scipy_tree.query(X_find[i], n_neighbors)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    # Drop each tree before building the next one.
    del scipy_tree

    # "Optimal" kd tree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(n_queries):
        ind, dist = opt.query(X_find[i], n_neighbors)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # Sklearn kd tree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(n_queries):
        # Each query point is reshaped to a single-row 2-D array first.
        ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), n_neighbors,
                             return_distance=True)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk
for i in range(r):
    # NOTE(review): this loop is the tail of a function whose header and
    # setup (r, classifier, predicted_labels, true_labels) lie above this
    # excerpt -- confirm nesting against the full file.
    # Test-then-train: predict sample i, record the outcome, then let the
    # classifier learn from that same sample.
    pred = classifier.predict(np.asarray([X[i]]))
    predicted_labels.append(pred[0])
    true_labels.append(y[i])
    classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
    # Log progress roughly every 5%. The step is clamped to 1 so that fewer
    # than 20 samples no longer trigger a modulo-by-zero.
    if (i % max(1, r // 20)) == 0:
        logging.info(str((i // (r / 20))*5) + "%")
accuracy = accuracy_score(true_labels, predicted_labels)
logging.info('error rate %.2f%%' % (100-100*accuracy))


if __name__ == '__main__':
    # Script entry point: load a whole dataset into memory and hand it to
    # run() together with the hyper-parameters below.
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'useLTM': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance', 'STMSizeAdaption': None,
    #                'useLTM': False}

    logging.info('loading dataset')
    # Alternative datasets:
    # stream = FileStream("../data/datasets/weather.csv")
    # stream = FileStream("../data/datasets/moving_squares.csv")
    # covtype is the dataset that was effectively used; it is now referenced
    # through the same relative datasets directory as the alternatives above
    # instead of an absolute path into one developer's home directory.
    stream = FileStream("../data/datasets/covtype.csv")
    stream.prepare_for_use()
    X, y = stream.next_sample(stream.n_samples)
    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])
    run(X[:], y[:], hyperParams)
# Stream learners and the prequential evaluator.
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()
evaluator = EvaluatePrequential(
    max_samples=10000,
    max_time=1000,
    show_plot=True,
    pretrain_size=3000,
    metrics=['accuracy'],
)

# Single batch classification
# stream = FileStream('file.csv')
stream = FileStream("RBF Dataset.csv")
stream10 = FileStream("RBF Dataset 10.csv")
stream70 = FileStream("RBF Dataset 70.csv")

# Pull one batch of 10000 samples out of every stream.
batch_size = 10000
X1, y1 = stream.next_sample(batch_size)
X2, y2 = stream10.next_sample(batch_size)
X3, y3 = stream70.next_sample(batch_size)

# Same 60/40 train/test split, with the same seed, for every dataset.
split_options = {'test_size': 0.4, 'random_state': 109}
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **split_options)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, **split_options)

# Classifiers (nb and ht are re-created here, replacing the instances
# built at the top of this section).
MLP = MLPClassifier(hidden_layer_sizes=(200, 4), random_state=1, max_iter=300)
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()
# ht2_trained = ht.fit(X3_train,y3_train)