def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """
    Assemble an ActiveLearningByLearning strategy over ``sent_df``.

    A feature extractor is built from the data frame, its transformed
    output is attached to a TextDataset, and three candidate strategies
    (uncertainty sampling, QUIRE, HintSVM) are handed to the ALBL wrapper.
    """
    # One shared configuration for every SVM created below.
    svm_options = dict(C=100, gamma=3.1, kernel='rbf',
                       decision_function_shape='ovr')

    extractor = SynStateALHeuristic.build_feature_extractor(
        sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)

    # Candidates are created in the same order as they are listed.
    candidates = [
        UncertaintySampling(dataset, model=SVM(**svm_options)),
        QUIRE(dataset),
        HintSVM(dataset, cl=1.0, ch=1.0),
    ]
    reward_model = SVM(**svm_options)

    return ActiveLearningByLearning(
        dataset,
        query_strategies=candidates,
        T=1000,
        uniform_sampler=True,
        model=reward_model)
def test_quire(self):
    """QUIRE seeded with the first five labels yields the pinned query order."""
    seed_count = 5
    unlabeled = [None] * (len(self.y) - seed_count)
    partial_labels = np.concatenate([self.y[:seed_count], unlabeled])

    dataset = Dataset(self.X, partial_labels)
    strategy = QUIRE(dataset)
    observed = run_qs(dataset, strategy, self.y, self.quota)

    expected = np.array(
        [117, 175, 256, 64, 103, 118, 180, 159, 129, 235])
    assert_array_equal(observed, expected)
def test_quire_mykernel(self):
    """QUIRE accepts a user-supplied kernel callable and yields the pinned order."""

    def my_kernel(X, Y):
        # Plain linear kernel: inner products between rows of X and Y.
        return np.dot(X, Y.T)

    np.random.seed(1126)

    seed_count = 5
    unlabeled = [None] * (len(self.y) - seed_count)
    partial_labels = np.concatenate([self.y[:seed_count], unlabeled])

    dataset = Dataset(self.X, partial_labels)
    strategy = QUIRE(dataset, kernel=my_kernel)
    observed = run_qs(dataset, strategy, self.y, self.quota)

    expected = np.array([9, 227, 176, 110, 52, 117, 228, 205, 103, 175])
    assert_array_equal(observed, expected)
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """
    Assemble a QUIRE strategy over ``sent_df``.

    A feature extractor is built from the data frame and its transformed
    output is attached to the TextDataset that QUIRE operates on.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(
        sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)
    return QUIRE(dataset)
def test_ALBLTestCase(self):
    """ALBL over uncertainty/QUIRE/random sampling yields the pinned query order."""
    seed_count = 10
    unlabeled = [None] * (len(self.y) - seed_count)
    dataset = Dataset(
        self.X, np.concatenate([self.y[:seed_count], unlabeled]))

    # Sub-strategies are created in listing order, then the reward model.
    candidates = [
        UncertaintySampling(
            dataset,
            model=SVM(kernel="linear", decision_function_shape="ovr")),
        QUIRE(dataset),
        RandomSampling(dataset),
    ]
    reward_model = SVM(kernel="linear", decision_function_shape="ovr")
    strategy = ActiveLearningByLearning(
        dataset,
        T=self.quota,
        query_strategies=candidates,
        model=reward_model,
        random_state=1126)

    observed = run_qs(dataset, strategy, self.y, self.quota)
    expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
    assert_array_equal(observed, expected)
def main():
    """
    Compare five query strategies on the 'australian' dataset.

    The experiment is repeated 20 times; each repetition runs uncertainty
    sampling, random sampling, QUIRE, HintSVM and ALBL on their own copy
    of the same train split. The per-strategy error curves are averaged
    over the repetitions and plotted against the number of queries.
    """
    # Experiment parameters.
    ds_name = 'australian'  # binary classification dataset next to this file
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, '%s.txt' % ds_name)
    test_size = 0.33  # fraction of samples randomly assigned to the test set
    n_labeled = 10    # number of samples that are initially labeled
    n_strategies = 5

    def linear_svm():
        # Base learner used for evaluation (and inside ALBL).
        return SVM(kernel='linear', decision_function_shape='ovr')

    results = []
    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
            dataset_filepath, test_size, n_labeled)
        # Every strategy labels its own private copy of the train split.
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # (dataset, factory) pairs; each factory is only invoked right
        # before its own run, in the order listed here.
        contenders = [
            (trn_ds, lambda ds: UncertaintySampling(
                ds, model=SVM(decision_function_shape='ovr'))),
            (trn_ds2, RandomSampling),
            (trn_ds3, QUIRE),
            (trn_ds4, lambda ds: HintSVM(ds, cl=1.0, ch=1.0)),
            (trn_ds5, lambda ds: ActiveLearningByLearning(
                ds,
                query_strategies=[
                    UncertaintySampling(ds, model=linear_svm()),
                    QUIRE(ds),
                    HintSVM(ds, cl=1.0, ch=1.0),
                ],
                T=quota,
                uniform_sampler=True,
                model=linear_svm())),
        ]
        for dataset, make_strategy in contenders:
            strategy = make_strategy(dataset)
            learner = linear_svm()
            _, errors = run(dataset, tst_ds, lbr, learner, strategy, quota)
            results.append(errors.tolist())

    # results is interleaved strategy by strategy, so every
    # n_strategies-th entry belongs to the same strategy.
    result = [np.mean(results[offset::n_strategies], axis=0)
              for offset in range(n_strategies)]

    # Learning curves: x is the number of queries, y the error rate.
    query_num = np.arange(1, quota + 1)
    curves = [
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    ]
    for averaged, (fmt, name) in zip(result, curves):
        plt.plot(query_num, averaged, fmt, label=name)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def test_quire(self):
    """QUIRE on the toy example, driven through a labeler, queries 6, 7, 9, 8."""
    dataset = init_toyexample(self.X, self.y)
    strategy = QUIRE(dataset)
    learner = LogisticRegression()
    observed = run_qs(dataset, self.lbr, learner, strategy, self.quota)
    expected = np.array([6, 7, 9, 8])
    assert_array_equal(observed, expected)
def test_quire(self):
    """QUIRE with ten labeled and ten unlabeled samples yields the pinned order."""
    partial_labels = np.concatenate([self.y[:10], [None] * 10])
    dataset = Dataset(self.X, partial_labels)
    strategy = QUIRE(dataset)
    observed = run_qs(dataset, strategy, self.y, self.quota)
    expected = np.array([10, 11, 12, 13, 14, 15, 16, 18, 19, 17])
    assert_array_equal(observed, expected)
def test_quire(self):
    """QUIRE with six labeled and four unlabeled samples queries 6, 7, 9, 8."""
    partial_labels = np.concatenate([self.y[:6], [None] * 4])
    dataset = Dataset(self.X, partial_labels)
    strategy = QUIRE(dataset)
    observed = run_qs(dataset, strategy, self.y, self.quota)
    expected = np.array([6, 7, 9, 8])
    assert_array_equal(observed, expected)
def _count_entries(path):
    """Return how many items iterating ``openfile_txt(path)`` yields."""
    entries = openfile_txt(path)
    try:
        return sum(1 for _ in entries)
    finally:
        # openfile_txt is defined elsewhere; release its result when it is
        # closeable (presumably a file object -- confirm against the helper).
        close = getattr(entries, "close", None)
        if close is not None:
            close()


def _error_rate(model, trn_ds, tst_ds):
    """Retrain ``model`` on ``trn_ds`` and return ``1 - score`` on ``tst_ds``."""
    model.train(trn_ds)
    return 1 - model.score(tst_ds)


def _new_committee():
    """Return a fresh three-member LogisticRegression committee."""
    return [
        LogisticRegression(C=1.0),
        LogisticRegression(C=0.01),
        LogisticRegression(C=100),
    ]


def _strategy_specs():
    """Return (log title, legend label, line color, factory) per strategy.

    Each factory takes the dataset the strategy will query and returns the
    query strategy. The order here is the order used for training, logging,
    plotting and the legend.
    """
    return [
        # Least confident: queries the instance whose posterior probability
        # of being positive is nearest 0.5.
        ("Uncertainty Sampling (Least confident)", 'Least Confident', 'red',
         lambda ds: UncertaintySampling(
             ds, method='lc', model=LogisticRegression(C=.01))),
        # Smallest margin: queries the instance whose gap between the two
        # most probable labels is minimal.
        ("Uncertainty Sampling (Smallest Margin)", 'Smallest Margin', 'blue',
         lambda ds: UncertaintySampling(
             ds, method='sm', model=LogisticRegression(C=.01))),
        # Entropy: equivalent to the two strategies above for binary
        # classification.
        ("Uncertainty Sampling (Entropy)", 'Entropy', 'green',
         lambda ds: UncertaintySampling(
             ds, method='entropy', model=LogisticRegression(C=.01))),
        # Random baseline.
        ("Random Sampling", 'Random Sampling', 'orange',
         lambda ds: RandomSampling(ds, random_state=1126)),
        ("QUIRE", 'QUIRE', 'yellow',
         lambda ds: QUIRE(ds)),
        # Query by committee, disagreement measured by vote entropy.
        ("QueryByCommittee (Vote Entropy)", 'Vote Entropy', 'black',
         lambda ds: QueryByCommittee(
             ds, disagreement='vote', models=_new_committee(),
             random_state=1126)),
        # Query by committee, disagreement measured by KL divergence.
        ("QueryByCommittee (KL Divergence)", 'KL Divergence', 'purple',
         lambda ds: QueryByCommittee(
             ds, disagreement='kl_divergence', models=_new_committee(),
             random_state=1126)),
        # HintSVM is not part of the comparison; it was disabled in the
        # original script.
    ]


def _run_comparison(text_file, dataset_filepath, csv_filepath):
    """Run the query loop for every strategy, logging and plotting as it goes.

    ``text_file`` receives a copy of what is printed to the console. The
    figure is left open so the caller can save it afterwards.
    """
    print("Loading data...")
    text_file.write("Loading data...\n")

    t0 = time.time()
    num_lines = _count_entries(dataset_filepath)
    print("Treating " + str(num_lines) + " entries...")
    text_file.write("Treating : %s entries...\n" % str(num_lines))

    # Number of queries to ask the (simulated) human to label.
    quota = 10

    trn_ds, tst_ds = split_train_test(csv_filepath)
    model = LogisticRegression()

    # One "arm" per strategy. The first strategy queries trn_ds itself and
    # every later one a deep copy taken at that moment; the copy / build /
    # train sequence per arm matches the original script statement order.
    arms = []
    for position, (title, legend, color, build) in enumerate(
            _strategy_specs()):
        dataset = trn_ds if position == 0 else copy.deepcopy(trn_ds)
        strategy = build(dataset)
        arms.append({
            'title': title,
            'legend': legend,
            'color': color,
            'dataset': dataset,
            'strategy': strategy,
            # Error before any query has been answered.
            'errors': [_error_rate(model, dataset, tst_ds)],
            'line': None,
        })

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the style context is assumed to cover figure set-up only.
    with sns.axes_style("darkgrid"):
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        query_num = np.arange(0, 1)
        for arm in arms:
            (arm['line'],) = ax.plot(query_num, arm['errors'], arm['color'])
        plt.legend(tuple(arm['legend'] for arm in arms), loc=1)
        # The plotted series are 1 - score, i.e. error rates; the axis was
        # previously mislabelled 'Accuracy'.
        plt.ylabel('Error')
        plt.xlabel('Number of Queries')
        plt.title('Active Learning - Query choice strategies')
        plt.ylim([0, 1])
        plt.show(block=False)

    for i in range(quota):
        print("\n#################################################")
        print("Query number " + str(i) + " : ")
        print("#################################################\n")
        text_file.write(
            "\n#################################################\n")
        text_file.write("Query number %s : " % str(i))
        text_file.write(
            "\n#################################################\n")

        for arm in arms:
            ask_id = arm['strategy'].make_query()
            # Look the tweet and its label up once so that what is shown,
            # what is logged and what is fed back to the dataset agree.
            tweet = define_tweet_by_id(ask_id)
            answer = simulate_human_decision(ask_id)

            print("\033[4mUsing %s :\033[0m" % arm['title'])
            print("Tweet :" + tweet, end='', flush=True)
            print("Simulating human response : " + str(answer) + " \n")
            text_file.write("Using %s :\n" % arm['title'])
            text_file.write("Tweet : %s \n" % str(tweet))
            text_file.write(
                "Simulating human response : %s \n\n" % str(answer))

            arm['dataset'].update(ask_id, answer)
            arm['errors'].append(
                _error_rate(model, arm['dataset'], tst_ds))

        # Grow the axes and refresh every curve with the new point.
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(arm['errors']) for arm in arms) + 0.2))
        query_num = np.arange(0, i + 2)
        for arm in arms:
            arm['line'].set_xdata(query_num)
            arm['line'].set_ydata(arm['errors'])
        plt.draw()

    time_total = time.time() - t0
    print("\n\n\n#################################################\n")
    print("Execution time : %fs \n\n" % time_total)
    text_file.write(
        "\n\n\n#################################################\n")
    text_file.write("Execution time : %fs \n" % time_total)


def main():
    """Compare seven query strategies on the tweet corpus and plot their error.

    Sets the module-level path and vector globals, writes a timestamped
    ``task_<timestamp>.txt`` log mirroring the console output, updates a
    live plot after each of the 10 simulated queries, and finally saves
    the plot as ``task_<timestamp>.png`` once the user confirms.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list

    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")

    # The context manager closes the log even when the run fails midway.
    with codecs.open("task_" + str(timestr) + ".txt", "w",
                     "utf-8") as text_file:
        _run_comparison(text_file, dataset_filepath, csv_filepath)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')
    print("Done")