def demo():
    """ _test_streams

    This demo tests if the streams are correctly generating samples.

    :return:
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_seed=32523423,
                                        instance_seed=5435, n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')

    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
def demo(instances=2000):
    """ _test_comparison_prequential

    This demo will test a prequential evaluation when more than one learner is
    passed, which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
    """
    # Stream setup
    stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    stream.prepare_for_use()

    # Setup the classifiers
    clf = SGDClassifier()
    # classifier = KNNAdwin(n_neighbors=8, max_window_size=2000, leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30,
    #                                                 categorical_list=None))
    clf_one = KNNAdwin(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(base_estimator=KNN(), n_estimators=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                                  31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                                  48, 49, 50, 51, 52, 53]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                              [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
    #                               31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    #                               48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000, output_file='test_comparison_prequential.csv',
                                    max_samples=instances, batch_size=1, n_wait=200, max_time=1000,
                                    show_plot=True, metrics=['performance', 'kappa_t'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
def demo():
    """ _test_knn

    This demo tests the KNNClassifier on a file stream, which gives
    instances coming from a SEA generator.

    The test computes the performance of the KNNClassifier as well as
    the time to create the structure and classify max_samples
    (5000 by default) instances.
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_big.csv")

    train = 200
    X, y = stream.next_sample(train)
    # t = OneHotToCategorical([[10, 11, 12, 13],
    #                          [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    #                           32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    #                           50, 51, 52, 53]])
    # t2 = OneHotToCategorical([[10, 11, 12, 13],
    #                           [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    #                            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    #                            50, 51, 52, 53]])

    start = timer()
    knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNNClassifier', knn)])

    # compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNNClassifier', compare)])

    # pipe.fit(X, y)
    # pipe2.fit(X, y)
    knn.partial_fit(X, y)
    # compare.fit(X, y)

    n_samples = 0
    max_samples = 5000
    my_corrects = 0
    # compare_corrects = 0

    while n_samples < max_samples:
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # compare_pred = pipe2.predict(X)
        # compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        # if y[0] == compare_pred[0]:
        #     compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
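# For comparison: the bookkeeping in the loop above can be delegated to
# skmultiflow's EvaluatePrequential, as the other demos in this collection do.
# A minimal sketch, assuming the `stream` and `knn` objects from the demo are
# reused. Note the setups are not identical: EvaluatePrequential keeps updating
# the model after each prediction, while the loop above trains only once on the
# first 200 samples.
from skmultiflow.evaluation import EvaluatePrequential

evaluator = EvaluatePrequential(pretrain_size=200,    # matches `train` above
                                max_samples=5000,     # matches `max_samples` above
                                metrics=['accuracy'])
evaluator.evaluate(stream=stream, model=knn)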
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream
    which gets its samples from the movingSquares.csv file, inside the
    datasets folder.

    Then we need to setup a classifier, which in this case is an instance
    of scikit-multiflow's SAMKNN. Then, optionally we create a pipeline
    structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances
    """
    # Setup the File Stream
    stream = FileStream("../data/datasets/movingSquares.csv", -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(n_neighbors=8, max_window_size=2000, leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30,
    #                                                 categorical_list=None))
    classifier = SAMKNN(n_neighbors=5, weighting='distance', max_window_size=1000,
                        stm_size_option='maxACCApprox', use_ltm=False)
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=0, max_samples=instances, batch_size=1, n_wait=100,
                                    max_time=1000, output_file=output_file, show_plot=True,
                                    metrics=['performance'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    # Setup Stream
    stream = FileStream("../data/datasets/" + filename)
    stream.prepare_for_use()

    # For each classifier, e...
    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_parametrized.csv',
                                    max_samples=10000, batch_size=1, n_wait=500, show_plot=show_plot)
    evaluator.evaluate(stream=stream, model=h, model_names=model_names)
def demo():
    """ _test_mol

    This demo tests the MOL learner on a file stream, which reads from
    the music.csv file.

    The test computes the performance of the MOL learner as well as the
    time to create the structure and classify all the samples in the file.
    """
    # Setup logging
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Setup the file stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/music.csv", 0, 6)

    # Setup the classifier, by default it uses Logistic Regression
    # classifier = MultiOutputLearner()
    # classifier = MultiOutputLearner(base_estimator=SGDClassifier(n_iter=100))
    classifier = MultiOutputLearner(base_estimator=Perceptron())

    # Setup the pipeline
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pre training on %s samples', str(pretrain_size))
    logging.info('Total %s samples', str(stream.n_samples))
    X, y = stream.next_sample(pretrain_size)
    # classifier.fit(X, y)
    classes = stream.target_values
    classes_flat = list(set([item for sublist in classes for item in sublist]))
    pipe.partial_fit(X, y, classes=classes_flat)

    count = 0
    true_labels = []
    predicts = []
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # p = classifier.predict(X)
        p = pipe.predict(X)
        predicts.extend(p)
        true_labels.extend(y)
        count += 1

    perf = hamming_score(true_labels, predicts)
    logging.info('Evaluation time: %s s', str(timer() - init_time))
    logging.info('Total samples analyzed: %s', str(count))
    logging.info('The classifier\'s static Hamming score: %0.3f' % perf)
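# For reference, the Hamming score computed above is the per-label accuracy
# averaged over all samples (i.e. one minus the Hamming loss). The demo uses
# skmultiflow's own hamming_score; below is a minimal equivalent sketch,
# assuming the labels are dense 0/1 arrays of equal shape.
import numpy as np

def hamming_score_sketch(y_true, y_pred):
    # Fraction of label positions predicted correctly, averaged over samples.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean(y_true == y_pred))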
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream
    which gets its samples from the moving_squares.csv file, inside the
    datasets folder.

    Then we need to setup a classifier, which in this case is an instance
    of scikit-multiflow's SAMKNNClassifier. Then, optionally we create a
    pipeline structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances
    """
    # Setup the File Stream
    stream = FileStream("../data/datasets/moving_squares.csv")
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = SAMKNNClassifier(n_neighbors=5, weighting='distance', max_window_size=1000,
                                  stm_size_option='maxACCApprox', use_ltm=False)

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=0, max_samples=instances, batch_size=1, n_wait=100,
                                    max_time=1000, output_file=output_file, show_plot=True)

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
def demo():
    # The classifier we will use (other options: SAMKNNClassifier, LeverageBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")
    stream.prepare_for_use()

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_filestream.csv',
                                    max_samples=10000, batch_size=1, n_wait=1000, show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    # Setup Stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/" + filename)

    # For each classifier, e...
    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_parametrized.csv',
                                    max_samples=10000, batch_size=1, n_wait=500, show_plot=show_plot)
    evaluator.evaluate(stream=stream, model=h, model_names=model_names)
def demo():
    """ _test_streams

    This demo tests if the streams are correctly generating samples.

    :return:
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_seed=32523423,
                                        instance_seed=5435, n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')

    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
def demo():
    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_stream.csv")

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_filestream.csv',
                                    max_samples=10000, batch_size=1, n_wait=1000, show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
import sys
sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/clef_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=5, weight_inv=0.01)
sknn = SKNNClassifier(k=200, sample_size=500, sample_recent=True,
                      similarity='cosine', sliding_window=True)
popular = PopularClassifier(sliding_window=True)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn])

evaluator = EvaluatePrequential(session_column_index=0,
                                time_column_index=1,
                                rec_size=10,
                                allow_reminders=True,
stream2 = RandomRBFGeneratorDrift(model_random_state=99, sample_random_state=50, n_classes=2,
                                  n_features=10, n_centroids=10000, change_speed=10)
X, y = stream2.next_sample(10000)
stream2.restart()
df2 = pd.DataFrame(np.hstack((X, np.array([y]).T)))
# index=False so the row index is not written as an extra feature column
df2.to_csv("RBF Dataset 10.csv", index=False)

stream3 = RandomRBFGeneratorDrift(model_random_state=99, sample_random_state=50, n_classes=2,
                                  n_features=10, n_centroids=10000, change_speed=70)
X, y = stream3.next_sample(10000)
stream3.restart()
df3 = pd.DataFrame(np.hstack((X, np.array([y]).T)))
df3.to_csv("RBF Dataset 70.csv", index=False)

# Single Online Classifiers
stream10 = FileStream("./RBF Dataset 10.csv")
stream70 = FileStream("./RBF Dataset 70.csv")
stream = FileStream("./RBF Dataset.csv")

MLP = MLPClassifier(hidden_layer_sizes=(200, 200, 200, 200), random_state=1, max_iter=500)
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()

evaluator = EvaluatePrequential(max_samples=10000, max_time=1000, show_plot=True,
                                pretrain_size=3000, metrics=['accuracy'])

# Ensemble Online
stream10 = FileStream("./RBF Dataset 10.csv")
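# The fragment ends before the evaluations are launched. Presumably each
# stream/model combination is then run through the evaluator, following the
# pattern used by the other demos in this collection; the invocation below is
# a hypothetical sketch using the objects defined above.
evaluator.evaluate(stream=stream10, model=[MLP, nb, ht],
                   model_names=['MLP', 'NB', 'HT'])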
from skmultiflow.data import FileStream
from skmultiflow.lazy.knn import KNN
from skmultiflow.evaluation import EvaluatePrequential

n_neighbors = 8
max_window_size = 2000
leaf_size = 30
n_estimators = 30

show_plot = True
pretrain_size = 100
max_samples = 7000
metrics = ['accuracy']

stream = FileStream('data/stream1.csv')
stream.prepare_for_use()

mdl = KNN(n_neighbors=n_neighbors, max_window_size=max_window_size, leaf_size=leaf_size)

evaluator = EvaluatePrequential(show_plot=show_plot, pretrain_size=pretrain_size,
                                max_samples=max_samples, metrics=metrics)
evaluator.evaluate(stream=stream, model=mdl)
def demo(instances=2000):
    """ _test_comparison_prequential

    This demo will test a prequential evaluation when more than one learner is
    passed, which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
    """
    # Stream setup
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)

    # Setup the classifiers
    clf = SGDClassifier()
    # classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
    #                                 nominal_attributes=None)
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                                     leaf_size=30))
    clf_one = KNNADWINClassifier(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNNClassifier(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeveragingBaggingClassifier(base_estimator=KNNClassifier(), n_estimators=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                                  31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                                  48, 49, 50, 51, 52, 53]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                              [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
    #                               31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    #                               48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNNClassifier', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNNClassifier', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000, output_file='test_comparison_prequential.csv',
                                    max_samples=instances, batch_size=1, n_wait=200, max_time=1000,
                                    show_plot=True)

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
import sys
sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/yoochoose_clicks_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=3, weight_inv=0.9)
sknn = SKNNClassifier(k=300, sample_size=1500, sample_recent=True,
                      similarity='cosine', sliding_window=True)
popular = PopularClassifier(sliding_window=False)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn], boundaries=[0.5])

evaluator = EvaluatePrequential(session_column_index=0,
                                time_column_index=1,
                                rec_size=10,
                                allow_reminders=True,
def transform_vector(self, X):
    r, c = get_dimensions(X)
    for i in range(r):
        row = np.copy([X[i][:]])
        for j in range(c):
            value = X[i][j]
            mean = self.calculate_mean(j)
            standard_deviation = self.calculate_stddev(j)
            standardized = (value - mean) / standard_deviation
            X[i][j] = standardized
        # Add the raw (un-standardized) row to the window after scaling,
        # so each sample is standardized with statistics from earlier rows only.
        self.window.add_element(row)
    return X


# Read the stream
stream = FileStream("C:/Users/jeffr/OneDrive/Desktop/Data Stream/Assignment_One/dataset/data_n30000.csv")
stream.prepare_for_use()
# stream.next_sample(10)
# stream.n_remaining_samples()
# X, y = stream.next_sample(5000)

metrics = ['accuracy', 'kappa', 'kappa_m', 'kappa_t', 'running_time', 'model_size']
evaluator = EvaluatePrequential(max_samples=30000, n_wait=100, show_plot=True, metrics=metrics)
my_knn = MyKNNClassifier(standardize=True, weighted_vote=False)
evaluator.evaluate(stream=stream, model=[my_knn], model_names=['My_KNN'])

cm = evaluator.get_mean_measurements(0).confusion_matrix

print("Recall per class")
for i in range(cm.n_classes):
    recall = cm.data[(i, i)] / cm.sum_col[i] \
        if cm.sum_col[i] != 0 else 'Ill-defined'
    print("Class {}: {}".format(i, recall))
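# Precision per class can be read off the same confusion matrix by dividing
# by the complementary marginal. This is a sketch under the assumption that
# the matrix also exposes row sums (`sum_row`) symmetric to the `sum_col`
# used for recall above.
print("Precision per class")
# Precision = True Positive / (True Positive + False Positive)
for i in range(cm.n_classes):
    precision = cm.data[(i, i)] / cm.sum_row[i] \
        if cm.sum_row[i] != 0 else 'Ill-defined'
    print("Class {}: {}".format(i, precision))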
                                show_plot=False, metrics=metrics)
evaluator.evaluate(stream=stream, model=[knn], model_names=['KNN'])

cm = evaluator.get_mean_measurements(0).confusion_matrix

print("Recall per class")
# Recall = True Positive / (True Positive + False Negative)
for i in range(cm.n_classes):
    recall = cm.data[(i, i)] / cm.sum_col[i] \
        if cm.sum_col[i] != 0 else 'Ill-defined'
    print("Class {}: {}".format(i, recall))
'''

# ----------------------------------- Experiment 3 -----------------------------------
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.meta import LeveragingBaggingClassifier

# Read in stream
stream = FileStream(r"C:\Users\luyj0\OneDrive\Desktop\COMPX523-Data Stream Mining\covtype_numeric.csv")

# Set up different classifiers
knn = MyKNNClassifier()
ht = HoeffdingTreeClassifier()
nb = NaiveBayes()
wv_knn = MyKNNClassifier(weighted_vote=True)
s_knn = MyKNNClassifier(standardize=True)

# Set up two ensemble algorithms
arf = AdaptiveRandomForestClassifier()
lb = LeveragingBaggingClassifier()

metrics = ['accuracy', 'kappa', 'kappa_m', 'kappa_t', 'running_time', 'model_size']
# use a test-then-train evaluation approach
evaluator = EvaluatePrequential(max_samples=30000, n_wait=100, show_plot=False, metrics=metrics)
import sys
sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/trivago_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=5, weight_inv=0.90)
sknn = SKNNClassifier(k=100, sample_size=1000, sample_recent=True,
                      similarity='cosine', sliding_window=True)
popular = PopularClassifier(sliding_window=True)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn])
def demo():
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives
    instances coming from a SEA generator.

    The test computes the performance of the KNNAdwin classifier as well as
    the time to create the structure and classify max_samples (10000 by
    default) instances.
    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423,
    #                                  sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50)
    stream.prepare_for_use()

    t = OneHotToCategorical([[10, 11, 12, 13],
                             [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                              32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                              50, 51, 52, 53]])
    t2 = OneHotToCategorical([[10, 11, 12, 13],
                              [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                               32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                               50, 51, 52, 53]])

    # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])

    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe2.fit(X, y)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False

    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples / 20) == 0:
            logging.info('%s%%', str((n_samples // (max_samples / 20) * 5)))
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # my_pred = [1]
        if first:
            # pipe.partial_fit(X, y, classes=stream.target_values)
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            # pipe.partial_fit(X, y)
            knn.partial_fit(X, y)
        # compare_pred = pipe2.predict(X)
        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
def make_stream(path, classifier):
    # Build a FileStream from the csv at `path`, run it through the
    # (project-specific) flow_detection_classifier, and return the
    # evaluated stream's target values.
    stream = FileStream(path)
    evaluator = flow_detection_classifier(classifier, stream)
    stream = evaluator.stream.y
    return stream
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20)) * 5) + "%")

    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100 - 100 * accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'use_ltm': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance',
    #                'STMSizeAdaption': None, 'use_ltm': False}
    logging.info('loading dataset')
    # stream = FileStream("../data/datasets/weather.csv")
    stream = FileStream("../data/datasets/moving_squares.csv")
    stream.prepare_for_use()
    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])

    run(X[:], y[:], hyperParams)
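# run() itself is not shown in this excerpt. Below is a hypothetical sketch of
# how it might build the classifier, assuming the hyperParams keys map onto the
# SAMKNN constructor arguments used in the other demos in this collection.
def run_sketch(X, y, hyperParams):
    classifier = SAMKNN(n_neighbors=hyperParams['nNeighbours'],
                        weighting=hyperParams['knnWeights'],
                        max_window_size=hyperParams['maxSize'],
                        stm_size_option=hyperParams['STMSizeAdaption'],
                        use_ltm=hyperParams['use_ltm'])
    # ... followed by a test-then-train loop over (X, y) as in the fragment above ...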
    true_labels = []
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20)) * 5) + "%")

    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100 - 100 * accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'use_ltm': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance',
    #                'STMSizeAdaption': None, 'use_ltm': False}
    logging.info('loading dataset')
    # stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
    #                     "master/weather.csv")
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/moving_squares.csv")
    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])

    run(X[:], y[:], hyperParams)
# Example 1: popularity recommender
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier

# Create stream
stream = FileStream("your-dataset.csv")
stream.prepare_for_use()

# Instantiate recommender
popular = PopularClassifier(sliding_window=True)

# Configure evaluator
evaluator = EvaluatePrequential(session_column_index=0,
                                rec_size=10,
                                pretrain_size=0,
                                n_wait=200,        # evaluation window
                                n_keep=20000,      # observation window
                                max_samples=100000,
                                metrics=['recall', 'mrr', 'running_time'])

# Run evaluation
evaluator.evaluate(stream=stream, model=[popular], model_names=['POP'])
from skmultiflow.data import FileStream
from skmultiflow.trees import HoeffdingTreeRegressor
from skmultiflow.evaluation import EvaluatePrequential

from streaming_random_patches_regressor import StreamingRandomPatchesRegressor

###############################################################################
# Options                                                                     #
###############################################################################
SEED = 123456
n_estimators = 3
aggregation_method = 'median'  # 'median', 'mean'
drift_detection_criteria = 'prediction'  # 'error', 'prediction'
subspace_mode = "randompatches"  # "randomsubspaces", "resampling", "randompatches"
###############################################################################

stream = FileStream('datasets/cal_housing.csv')

SRPR = StreamingRandomPatchesRegressor(n_estimators=n_estimators,
                                       aggregation_method=aggregation_method,
                                       random_state=SEED)
HTR = HoeffdingTreeRegressor(random_state=SEED)  # , leaf_prediction='mean')

evaluator = EvaluatePrequential(pretrain_size=0,
                                show_plot=True,
                                metrics=['mean_square_error', 'mean_absolute_error',
                                         'true_vs_predicted'])

evaluator.evaluate(stream=stream, model=[SRPR, HTR], model_names=['SRP-Reg', 'HT-Reg'])
parser.add_argument('-d', '--dataset', required=False, default="sea_gen",
                    help="Name of the dataset (csv file under datasets/)")
parser.add_argument('-s', '--label_size', required=False, default=0.25,
                    help="Size of the labeled portion of the stream")
args = parser.parse_args()

test_dataset = args.dataset
print("dataset: " + "datasets/" + test_dataset + '.csv')
stream = FileStream("datasets/" + test_dataset + '.csv')
# print(stream.get_target_values())

onlineBoosting = OnlineBoostingClassifier()
knn_adwin = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=1000)
SAMKNN = SAMKNNClassifier(n_neighbors=10, weighting='distance', max_window_size=500,
                          stm_size_option='maxACCApprox', use_ltm=False)
learn_pp_nse = LearnPPNSEClassifier()
SGD = SGDClassifier()
rslvq = RobustSoftLearningVectorQuantization()
# CMMM2 = CMGMMClassifier(classes=stream.get_target_values(), prune_component=True, drift_detector=None)
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20)) * 5) + "%")

    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100 - 100 * accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'useLTM': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance',
    #                'STMSizeAdaption': None, 'useLTM': False}
    logging.info('loading dataset')
    # stream = FileStream("../data/datasets/weather.csv")
    # stream = FileStream("../data/datasets/moving_squares.csv")
    stream = FileStream("/Users/jing/local/scikit-multiflow/src/skmultiflow/data/datasets/covtype.csv")
    stream.prepare_for_use()
    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])

    run(X[:], y[:], hyperParams)
def test_evaluate_and_adapt_trees():
    expected_accuracies = [0.86, 0.876, 0.914, 0.858, 0.77, 0.894,
                           0.876, 0.91, 0.898, 0.884, 0.804, 0.808]
    expected_trees = [30, 60, 30, 60, 30]

    # Load the meta-model
    dictMeta = {0.0: 60, 0.1: 30, 0.2: 30, 0.3: 30, 0.4: 60, 0.5: 70,
                0.6: 60, 0.7: 30, 0.8: 30, 0.9: 30}  # dict = {'pourc redund feat': best nb tree}

    n_trees = 10
    n_samples_max = 6000
    n_samples_meas = 500

    stream = FileStream('./recurrent-data/real-world/elec.csv')
    stream.prepare_for_use()

    # Evaluate model (with adaptation or not)
    arf = AdaptiveRandomForest(n_estimators=n_trees,
                               lambda_value=6,
                               grace_period=10,
                               split_confidence=0.1,
                               tie_threshold=0.005,
                               warning_detection_method=ADWIN(delta=0.01),
                               drift_detection_method=ADWIN(delta=0.001),
                               random_state=0)

    modelsList = [arf]
    modelsNames = ['ARF']

    evaluator = EvaluatePrequentialAndAdaptTreesARF(metrics=['accuracy', 'kappa', 'running_time', 'ram_hours'],
                                                    show_plot=False,
                                                    n_wait=n_samples_meas,
                                                    pretrain_size=200,
                                                    max_samples=n_samples_max,
                                                    output_file=None,
                                                    metaKB=dictMeta)

    # Run evaluation
    model, acc, n_trees = evaluator.evaluate(stream=stream, model=modelsList, model_names=modelsNames)

    assert np.alltrue(acc[0] == expected_accuracies)
    assert np.alltrue(n_trees[0] == expected_trees)
from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.bayes import NaiveBayes
from skmultiflow.meta import OzaBagging
from sklearn.datasets import make_classification
# DPDESMethod is assumed to be provided by the surrounding project;
# KNORAU is the dynamic-selection method of the same name (available in deslib).

with open("dataset_imb.csv", "w") as f:
    X, y = make_classification(n_features=10, n_informative=10, n_redundant=0,
                               n_samples=10000, weights=[0.5])
    # FileStream treats the first csv row as a header, so write one explicitly
    f.write(",".join("att{}".format(j) for j in range(X.shape[1])) + ",target\n")
    for i in range(X.shape[0]):
        for att in X[i]:
            f.write(str(att) + ",")
        f.write(str(y[i]) + "\n")

generator = FileStream("dataset_imb.csv")

dpdes = DPDESMethod(NaiveBayes(), 200, 10, KNORAU())
ozabag = OzaBagging(NaiveBayes(), n_estimators=10)

evaluator = EvaluatePrequential(max_samples=10000,
                                n_wait=200,
                                batch_size=200,
                                pretrain_size=0,
                                metrics=["precision"])
evaluator.evaluate(generator, [dpdes, ozabag], ["DPDES", "Ozabag"])
def demo():
    """ _test_kdtree_compare

    This demo compares creation and query speed for different kd-tree
    implementations. They are fed with instances from the covtype dataset.

    Three kd-tree implementations are compared: SciPy's KDTree,
    scikit-multiflow's KDTree and scikit-learn's KDTree. For each of them
    the demo will time the construction of the tree on 1000 instances, and
    then measure the time to query 100 instances.

    The results are displayed in the terminal.
    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    filter = OneHotToCategorical([[10, 11, 12, 13],
                                  [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                                   31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                                   48, 49, 50, 51, 52, 53]])

    X, y = stream.next_sample(1000)
    X = filter.transform(X)
    # print(X)

    X_find, y = stream.next_sample(100)
    X_find = filter.transform(X_find)
    print(X_find[4])

    # SciPy kd-tree
    start = timer()
    scipy = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):  # query all 100 instances, as for the other trees
        ind = scipy.query(X_find[i], 8)
        # print(ind)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    del scipy

    # scikit-multiflow kd-tree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        ind, dist = opt.query(X_find[i], 8)
        # print(ind)
        # print(dist)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # scikit-learn kd-tree (query returns distances first, then indices)
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        dist, ind = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True)
        # print(ind)
        # print(dist)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk