def train_tree(csv_path, tree):
    """Train ``tree`` on the stream read from ``csv_path``, test-then-train.

    Every sample is first predicted with the current tree and only then
    passed to ``partial_fit``, so the printed accuracy is measured on
    instances the tree has not yet been trained on.

    Parameters
    ----------
    csv_path
        Path handed to ``FileStream``.
    tree
        Object exposing ``predict(X)`` and ``partial_fit(X, y)``.

    Returns
    -------
    None
        Results are only printed. The local ``tree`` name is rebound to the
        value returned by ``partial_fit``; the caller sees the training only
        if ``partial_fit`` updates the estimator in place
        (NOTE(review): presumably it returns ``self`` -- confirm).
    """
    print("Training the tree")
    stream = FileStream(csv_path)

    n_samples = 0
    correct_cnt = 0
    t0 = time.time()
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Predict before training so the sample counts as unseen data.
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1
    t1 = time.time()
    total = t1 - t0

    # An empty stream used to raise ZeroDivisionError here; report 0.0 instead.
    accuracy = 100.0 * correct_cnt / n_samples if n_samples else 0.0

    print("Training data instances: ", n_samples)
    print("Tree trained on ", n_samples, " instances & has ", accuracy, "% accuracy.")
    print("Training tree completed in ", total, " (s)")
def test_file_stream(test_path, package_path):
    """Check ``FileStream`` metadata and sampling against the SEA dataset.

    Parameters
    ----------
    test_path
        Directory containing the reference file ``sea_stream_file.npz``.
    package_path
        Root directory under which
        ``src/skmultiflow/data/datasets/sea_stream.csv`` is located.

    Notes
    -----
    Array comparisons use ``np.all`` rather than ``np.alltrue``: the latter
    was deprecated in NumPy 1.25 and removed in NumPy 2.0.
    """
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_stream.csv')
    stream = FileStream(test_file)
    stream.prepare_for_use()

    # Static properties of the stream.
    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'sea_stream.csv - 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # A single sample must match the first reference row ...
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # ... and last_sample() must hand back that same sample again.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart, a batch of 10 must reproduce the whole reference data.
    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "FileStream(filename='sea_stream.csv', target_idx=-1, n_targets=1, cat_features=None)"
    assert stream.get_info() == expected_info
def demo():
    """ _test_mol

    Run the MOL (multi-output) learner over a file stream backed by
    music.csv.

    The learner is pre-trained on an initial batch, then every remaining
    sample is predicted one at a time. The Hamming score over those
    predictions is logged, together with the elapsed evaluation time and
    the number of samples processed.
    """
    # Plain-message logging at INFO level.
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # File stream over the music dataset.
    # NOTE(review): the positional 0 and 6 are presumably target_idx and
    # n_targets -- confirm against the FileStream signature.
    stream = FileStream("../datasets/music.csv", 0, 6)
    stream.prepare_for_use()

    # A Perceptron is used as the base estimator of the multi-output learner,
    # which is the only step of the pipeline.
    classifier = MultiOutputLearner(h=Perceptron())
    pipe = Pipeline([('classifier', classifier)])

    # Pre-train on the first batch of samples.
    pretrain_size = 150
    logging.info('Pre training on %s samples', pretrain_size)
    X, y = stream.next_sample(pretrain_size)
    pipe.partial_fit(X, y, classes=stream.get_targets())

    n_evaluated = 0
    y_true = []
    y_hat = []
    started_at = timer()

    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        y_hat.extend(pipe.predict(X))
        y_true.extend(y)
        n_evaluated += 1

    # The score is computed before the time is logged, so the reported
    # evaluation time includes the scoring step.
    score = hamming_score(y_true, y_hat)
    logging.info('Evaluation time: %s s', timer() - started_at)
    logging.info('Total samples analyzed: %s', n_evaluated)
    summary = "The classifier's static Hamming score : %0.3f" % score
    logging.info(summary)
def test_tree(csv_path, tree):
    """Evaluate ``tree`` on the stream read from ``csv_path``.

    Each sample is predicted first and then also passed to ``partial_fit``,
    so the tree keeps learning while it is being evaluated. Accuracy,
    F-score, geometric mean and elapsed time are printed.

    Parameters
    ----------
    csv_path
        Path handed to ``FileStream``.
    tree
        Object exposing ``predict(X)`` and ``partial_fit(X, y)``.

    Returns
    -------
    tuple
        ``(fscore, gm)``, each rounded to 3 decimals. ``(0.0, 0.0)`` is
        returned when the stream yields no samples.
    """
    print("Testing the tree")
    stream = FileStream(csv_path)

    n_samples = 0
    correct_cnt = 0
    y_true_all = []
    y_pred_all = []

    t2 = time.time()
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Predict before updating so the sample counts as unseen data.
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1
        y_true_all.append(y[0])
        y_pred_all.append(y_pred[0])
    t3 = time.time()
    total = t3 - t2

    if n_samples == 0:
        # An empty stream used to raise ZeroDivisionError on the accuracy
        # computation; there is nothing to score, so report zeros instead.
        print("Test data instances: ", n_samples)
        print("Testing tree completed in ", total, " (s)")
        return 0.0, 0.0

    accuracy = 100.0 * correct_cnt / n_samples
    fscore = f1_score(y_true_all, y_pred_all, average='binary')
    gm = geometric_mean_score(y_true_all, y_pred_all, average='binary')

    print("Test data instances: ", n_samples)
    print("Tree tested on ", n_samples, " instances & has ", accuracy, "% accuracy.")
    print("Tree has F-score: %.3f" % fscore)
    print("Tree has GM: %.3f" % gm)
    print("Testing tree completed in ", total, " (s)")

    return round(fscore, 3), round(gm, 3)
# Retrieving 5 samples data_stream.next_sample(5) # Output- #(array([[ 36. , 0. , 7. , 3. , 1. , 118. , 13. , # 18. , 50. , 239.554, 97. , 1. , 1. , 1. , # 1. , 0. , 0. , 98. , 178. , 31. ], # [ 3. , 23. , 7. , 4. , 1. , 179. , 51. , # 18. , 38. , 239.554, 97. , 0. , 1. , 0. , # 1. , 0. , 0. , 89. , 170. , 31. ], # [ 7. , 7. , 7. , 5. , 1. , 279. , 5. , # 14. , 39. , 239.554, 97. , 0. , 1. , 2. , # 1. , 1. , 0. , 68. , 168. , 24. ], # [ 11. , 23. , 7. , 5. , 1. , 289. , 36. , # 13. , 33. , 239.554, 97. , 0. , 1. , 2. , # 1. , 0. , 1. , 90. , 172. , 30. ], # [ 3. , 23. , 7. , 6. , 1. , 179. , 51. , # 18. , 38. , 239.554, 97. , 0. , 1. , 0. , # 1. , 0. , 0. , 89. , 170. , 31. ]]), # array([0, 2, 4, 2, 2])) data_stream.has_more_samples() # Output- # True data_stream.n_remaining_samples() # Output- # 734 #####################################################################################
# Fit the ensemble on an initial chunk of CHUNK_SIZE instances.
X_init, y_init = stream.next_sample(CHUNK_SIZE)
print(X_init)
print(y_init)
goowe.partial_fit(X_init, y_init)

accuracy = 0.0
total = 0.0
true_predictions = 0.0

# Next CHUNK_SIZE instances: predict first, then train, one at a time.
# NOTE(review): this phase prints the raw float counter and the accuracy as a
# fraction, while the loop below prints an int counter and a percentage.
# The formats are left as they were -- confirm whether that is intended.
for _ in range(CHUNK_SIZE):
    # Stop early on a short stream instead of reading past its end.
    if not stream.has_more_samples():
        break
    total += 1
    X, y = stream.next_sample()
    preds = goowe.predict(X)
    true_predictions += np.sum(preds == y)
    accuracy = true_predictions / total
    print('\tData instance: {} - Accuracy: {}'.format(total, accuracy))
    goowe.partial_fit(X, y)

# Now, for the remaining instances, do ITTT (Interleaved Test Then Train).
while stream.has_more_samples():
    total += 1
    X, y = stream.next_sample()
    preds = goowe.predict(X)  # Test
    true_predictions += np.sum(preds == y)
    accuracy = true_predictions / total
    print('\tData instance: {} - Accuracy: {}'.format(int(total), round(accuracy*100.0, 3)))
    goowe.partial_fit(X, y)  # Then train