def libact_EER(X, y, n_queries):
    """Run ``n_queries`` rounds of libact's EER query strategy.

    The pool starts with exactly three labelled entries: indices 0, 50 and
    100 receive the labels 0, 1 and 2. Every other entry is ``None``
    (unlabelled). The true labels in ``y`` are only used by the ideal
    labeler that answers the queries.

    Args:
        X: Feature matrix; indexed with the ids returned by ``make_query``.
        y: True labels, one per row of ``X``. Needs at least 101 entries
            because index 100 is seeded.
        n_queries: Number of query / label / retrain rounds to perform.

    Returns:
        None. The function only exercises the query loop.
    """
    # Object array of None marks every sample as unlabelled for libact.
    seed_labels = np.full(len(y), None, dtype=object)
    seed_labels[0] = 0
    seed_labels[50] = 1
    seed_labels[100] = 2

    libact_train_dataset = Dataset(X, seed_labels)
    libact_full_dataset = Dataset(X, y)

    # Alternative learner tried in the original: SVM(gamma='auto', probability=True)
    libact_learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1, multi_class='ovr')
    libact_qs = EER(libact_train_dataset, model=libact_learner, loss='01')
    libact_labeler = IdealLabeler(libact_full_dataset)

    libact_learner.train(libact_train_dataset)

    remaining = n_queries
    while remaining > 0:
        picked = libact_qs.make_query()
        answer = libact_labeler.label(X[picked])
        libact_train_dataset.update(picked, answer)
        # Retrain so the next query reflects the newly labelled sample.
        libact_learner.train(libact_train_dataset)
        remaining -= 1
def main():
    """Compare UncertaintySampling with RandomSampling on ``diabetes.txt``.

    Loads the dataset that sits next to this script, runs both query
    strategies with a fresh ``LogisticRegression`` base learner each, and
    plots the in-sample and out-of-sample error of both against the number
    of queries.
    """
    # --- experiment parameters -------------------------------------------
    script_dir = os.path.dirname(os.path.realpath(__file__))
    # path to the binary classification dataset
    dataset_filepath = os.path.join(script_dir, 'diabetes.txt')
    # fraction of samples randomly assigned to the test set
    test_size = 0.33
    # number of samples that are initially labeled
    n_labeled = 10

    # --- data ------------------------------------------------------------
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        dataset_filepath, test_size, n_labeled)
    # Independent copy so the two strategies do not share label updates.
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    # Query every sample that is not labelled at the start.
    quota = len(y_train) - n_labeled

    # --- uncertainty sampling (least confidence) -------------------------
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    # --- random sampling baseline ----------------------------------------
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    # --- learning curves: x = number of queries, y = error rate ----------
    query_num = np.arange(1, quota + 1)
    curves = (
        (E_in_1, 'b', 'qs Ein'),
        (E_in_2, 'r', 'random Ein'),
        (E_out_1, 'g', 'qs Eout'),
        (E_out_2, 'k', 'random Eout'),
    )
    for errors, colour, caption in curves:
        plt.plot(query_num, errors, colour, label=caption)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def active_learning(data, labels, test_size, n_labeled, quota=1000):
    """Run uncertainty and random sampling and tabulate their error curves.

    Both strategies use the same ``SklearnProbaAdapter`` wrapped
    ``GradientBoostingClassifier`` instance as base learner.

    Args:
        data: Passed through to ``split_train_test``.
        labels: Passed through to ``split_train_test``.
        test_size: Passed through to ``split_train_test``.
        n_labeled: Passed through to ``split_train_test``.
        quota: Number of samples to query per strategy (default 1000).

    Returns:
        A ``pandas.DataFrame`` with the columns ``E_in_1``, ``E_in_2``,
        ``E_out_1``, ``E_out_2``, ``E_full_1`` and ``E_full_2`` (suffix 1 is
        uncertainty sampling, suffix 2 is random sampling), one row per
        entry of the sequences returned by ``run``.
    """

    def announce(template):
        # Timestamped progress line, flushed so it shows up in live logs.
        print(template.format(datetime.datetime.now()), flush=True)

    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        data, labels, test_size, n_labeled)
    # Separate pool for the random-sampling run.
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    announce("## Running UncertaintySampling... [{}]")
    clf = SklearnProbaAdapter(
        GradientBoostingClassifier(n_estimators=5, learning_rate=1.0,
                                   max_depth=2, random_state=0))
    qs = UncertaintySampling(trn_ds, method='lc', model=clf)
    E_in_1, E_out_1, E_full_1 = run(trn_ds, tst_ds, lbr, clf, qs, quota,
                                    fully_labeled_trn_ds)

    announce("## Running RandomSampling... [{}]")
    qs2 = RandomSampling(trn_ds2)
    E_in_2, E_out_2, E_full_2 = run(trn_ds2, tst_ds, lbr, clf, qs2, quota,
                                    fully_labeled_trn_ds)

    announce("## Preparing dataframe... [{}]")
    series_names = ["E_in_1", "E_in_2", "E_out_1", "E_out_2",
                    "E_full_1", "E_full_2"]
    series_values = [E_in_1, E_in_2, E_out_1, E_out_2, E_full_1, E_full_2]
    # Built row-wise and then transposed so each metric becomes a column.
    table = pd.DataFrame(data=series_values, index=series_names)
    return table.transpose()
def main():
    """Active learning with an MLP on the supernova dataset, with plots.

    Runs least-confidence uncertainty sampling for a fixed quota, shows the
    error curves, then plots the accuracy curve, saves it to ``vis/nn.png``
    and hands the accuracy values to ``results``.
    """
    # Path to a libsvm_sparse type classification dataset.
    # If the dataset is not in that format, use libsvm to convert it.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(base_dir, 'data/supernova.txt')
    # Fraction of samples randomly assigned to the test set.
    test_size = 0.5
    # Number of samples that are initially labeled.
    n_labeled = 10
    # Number of samples to query.
    quota = 2000

    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)  # not used further in this function
    lbr = IdealLabeler(fully_labeled_trn_ds)

    # The same adapter instance serves as base learner and query model.
    model = SklearnProbaAdapter(
        MLPClassifier(solver='lbfgs', alpha=2, random_state=1))
    qs = UncertaintySampling(trn_ds, method='lc', model=model)
    E_in_1, E_out_1, accuracy = run(trn_ds, tst_ds, lbr, model, qs, quota)

    query_num = np.arange(1, quota + 1)

    # Figure 1: error rate against number of queries.
    for series, colour, caption in ((E_in_1, 'b', 'qs Ein'),
                                    (E_out_1, 'g', 'qs Eout')):
        plt.plot(query_num, series, colour, label=caption)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()

    # Figure 2: accuracy against number of queries, also written to disk.
    plt.plot(query_num, accuracy, 'y', label="accuracy")
    plt.xlabel('Number of Queries')
    plt.ylabel('Accuracy')
    plt.title('NN + Active Learning')
    plt.legend(loc='upper center', bbox_to_anchor=(0.8, -0.5),
               fancybox=True, shadow=True, ncol=5)
    plt.savefig('vis/nn.png')
    plt.show()

    results(accuracy)
def initialDataSetup(trainFeatures, trainClasses, testFeatures, testClasses,
                     SNLabel='0'):
    """Build the partially labelled pool and its ideal labeler.

    Train and test (photometric) samples are stacked into one feature
    matrix. Binary labels are derived by comparing column 2 of the class
    arrays with ``SNLabel``. The test part of the pool is left unlabelled
    (``None``).

    input: trainFeatures, array - train matrix
           trainClasses, array - train labels; column 2 is compared
                                 against SNLabel
           testFeatures, array - test (photometric) matrix
           testClasses, array - test (photometric) labels; column 2 is
                                compared against SNLabel
           SNLabel, str - SN Ia flag

    output: tuple, (train_dataset, fullLabels, labeler)
    """
    # 1 where the sample carries the SN Ia flag, 0 otherwise.
    train_flags = (trainClasses[:, 2] == SNLabel).astype(int)
    test_flags = (testClasses[:, 2] == SNLabel).astype(int)

    # One None per target sample marks it as unlabelled.
    hidden = np.array([None] * testFeatures.shape[0])

    fullFeatures = np.vstack([trainFeatures, testFeatures])
    partialClasses = np.concatenate([train_flags, hidden])
    fullClasses = np.concatenate([train_flags, test_flags])

    # Original (non-binarised) label rows for train followed by test.
    fullLabels = np.concatenate([trainClasses, testClasses])

    train_dataset = Dataset(fullFeatures, partialClasses)
    labeler = IdealLabeler(Dataset(fullFeatures, fullClasses))

    return (train_dataset, fullLabels, labeler)
def main():
    """Plot accuracy of uncertainty vs. random sampling on ``diabetes.txt``.

    Both strategies start from the same 100 labelled samples and are run
    with ``step`` passed to ``run``. The accuracy sequences returned by
    ``run`` are plotted against the iteration number.
    """
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
    test_size = 0.2  # fraction of samples assigned to the test set
    n_labeled = 100  # number of samples that are initially labeled

    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    # Independent pool for the random-sampling baseline.
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query
    step = 40
    print(quota)

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    acc_uncertain = run(trn_ds, tst_ds, lbr, model, qs, quota, step)

    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    acc_random = run(trn_ds2, tst_ds, lbr, model, qs2, quota, step)

    # NOTE(review): the "+ 3" presumably matches the number of points run()
    # returns for this step size - confirm against run().
    query_num = np.arange(1, int(quota / step) + 3)
    print(query_num.shape)
    print(acc_random.shape)

    plt.plot(query_num, acc_uncertain, 'k', mec='b', marker='x',
             label='Uncertain Sampling', lw=1)
    plt.plot(query_num, acc_random, 'k', mec='g', marker='^', mfc='g',
             label='Random Sampling', lw=1)
    plt.xlabel('Iteration')
    # Fixed: the axis label was misspelled 'Acurracy'.
    plt.ylabel('Accuracy')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def main():
    """Compare two recurrent models under batch active learning.

    Trains ``RNN_Probability_Model_LSTM`` and ``RNN_Probability_Model`` on
    identical copies of the same split and runs ``runrnn`` for each. It then
    prints the per-batch results and plots both curves (legend 'LSTM' and
    'GRU'), saving the figure as ``testmerge_gru_lstm_10_10000_0727.png``.
    """
    config = TRNNConfig()
    train_dir = './data/train10_shuf_6000.txt'
    vocab_dir = './data/vocab_train10_shuf_6000.txt'
    batchsize = config.batch_size
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    numclass = config.num_classes
    val_size = 0.15
    test_size = 0.2  # fraction of samples assigned to the test set
    n_labeled = 80  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]

    result = {'E1': [], 'E2': []}
    for _ in range(1):
        (trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al,
         trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn) = \
            split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size,
                                 val_size, n_labeled, wordslength,
                                 categories_class)

        lbr_gru = IdealLabeler(fully_labeled_trn_ds_rnn)

        # The LSTM run gets its own deep copies so that label updates made
        # during one run cannot leak into the other.
        trn_ds_lstm = copy.deepcopy(trn_ds_rnn)
        val_ds_lstm = copy.deepcopy(val_ds_rnn)
        tst_ds_lstm = copy.deepcopy(tst_ds_rnn)
        fully_labeled_trn_ds_lstm = copy.deepcopy(fully_labeled_trn_ds_rnn)
        lbr_lstm = IdealLabeler(fully_labeled_trn_ds_lstm)

        quota = len(y_train_rnn) - n_labeled

        model_lstm = RNN_Probability_Model_LSTM(vocab_dir, wordslength,
                                                batchsize, numclass,
                                                categories_class)
        model_lstm.train(trn_ds_lstm, val_ds_lstm)
        test_acc = model_lstm.test(val_ds_lstm)
        E_out_lstm, E_time_lstm = runrnn(trn_ds_lstm, tst_ds_lstm,
                                         val_ds_lstm, lbr_lstm, model_lstm,
                                         quota, test_acc, batchsize)

        model_gru = RNN_Probability_Model(vocab_dir, wordslength, batchsize,
                                          numclass, categories_class)
        model_gru.train(trn_ds_rnn, val_ds_rnn)
        test_acc = model_gru.test(val_ds_rnn)
        E_out_gru, E_time_gru = runrnn(trn_ds_rnn, tst_ds_rnn, val_ds_rnn,
                                       lbr_gru, model_gru, quota, test_acc,
                                       batchsize)

        result['E1'].append(E_out_lstm)
        result['E2'].append(E_out_gru)

    # Average over repetitions (a single one at present).
    E_out_lstm = np.mean(result['E1'], axis=0)
    E_out_gru = np.mean(result['E2'], axis=0)

    model_lstm.test(tst_ds_lstm)
    model_gru.test(tst_ds_rnn)

    # Fixed: both banners used to read "[Result] for RNN", which made the
    # two result sets indistinguishable. They now match the plot legend.
    print("[Result] for LSTM")
    print(E_out_lstm)
    print(E_time_lstm)
    print("[Result] for GRU")
    print(E_out_gru)
    print(E_time_gru)

    # Number of batches = ceil(quota / batchsize).
    n_batches = int(quota / batchsize) + (1 if quota % batchsize else 0)
    query_num = np.arange(1, n_batches + 1)

    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_lstm, 'g', label='LSTM')
    plt.plot(query_num, E_out_gru, 'r', label='GRU')
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.savefig('testmerge_gru_lstm_10_10000_0727.png')
    plt.show()
def main():
    """Benchmark six multilabel query strategies by averaged Hamming loss.

    The experiment is repeated ten times. In every repetition each strategy
    gets its own copy of the training pool, and all strategies share one
    ``BinaryRelevance(LogisticRegression())`` evaluation model. The mean
    out-of-sample curves are printed (every fifth value) and plotted.
    """
    # fraction of samples randomly assigned to the test set
    test_size = 0.25
    # number of samples to query
    quota = 150

    def aux_learner(criterion):
        # Factory for the auxiliary-learner strategy with a given criterion.
        return lambda pool: MultilabelWithAuxiliaryLearner(
            pool, BinaryRelevance(LogisticRegression()),
            BinaryRelevance(SVM()), criterion=criterion)

    # (result key, strategy factory, console caption, colour, legend label).
    # Strategies are built lazily, just before their own run.
    setups = [
        ('E1', lambda pool: MMC(pool, br_base=LogisticRegression()),
         "MMC: ", 'g', 'MMC'),
        ('E2', RandomSampling,
         "Random: ", 'k', 'Random'),
        ('E3', aux_learner('hlr'),
         "MultilabelWithAuxiliaryLearner_hlr: ", 'r', 'AuxiliaryLearner_hlr'),
        ('E4', aux_learner('shlr'),
         "MultilabelWithAuxiliaryLearner_shlr: ", 'b',
         'AuxiliaryLearner_shlr'),
        ('E5', aux_learner('mmr'),
         "MultilabelWithAuxiliaryLearner_mmr: ", 'c', 'AuxiliaryLearner_mmr'),
        ('E6', lambda pool: BinaryMinimization(pool, LogisticRegression()),
         "BinaryMinimization: ", 'm', 'BinaryMinimization'),
    ]
    result = {setup[0]: [] for setup in setups}

    for _ in range(10):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        # First strategy uses the original pool, the rest use deep copies
        # taken before any run has modified it.
        pools = [trn_ds]
        pools.extend(copy.deepcopy(trn_ds) for _ in range(len(setups) - 1))
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        for setup, pool in zip(setups, pools):
            strategy = setup[1](pool)
            _, e_out = run(pool, tst_ds, lbr, model, strategy, quota)
            result[setup[0]].append(e_out)

    means = {key: np.mean(curves, axis=0) for key, curves in result.items()}

    for key, _, caption, _, _ in setups:
        print(caption, means[key][::5].tolist())

    query_num = np.arange(1, quota + 1)
    plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    for key, _, _, colour, legend_label in setups:
        ax.plot(query_num, means[key], colour, label=legend_label)

    # Shrink the axes horizontally to leave room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()
def visualize(dataset_file, word2vec_file, word_freq_file, output_file):
    """Write a 2-D t-SNE embedding of the classifier's pool predictions.

    Trains the CNN readability classifier on the initially labelled part of
    the training pool, predicts probabilities for the unlabelled pool,
    embeds them with t-SNE and writes ``[x, y, true_label]`` rows to
    ``output_file`` as CSV (no header, index column included).

    Args:
        dataset_file: Passed to ``read_and_split_data2``.
        word2vec_file: Passed to ``rlc.Preprocessor``.
        word_freq_file: Passed to ``rlc.Preprocessor``.
        output_file: Path of the CSV file to (over)write.

    Note:
        A leftover ``import ipdb`` was removed. It was unused and made the
        function fail wherever the debugger package is not installed.
    """
    from sklearn.manifold import TSNE

    nfolds = 3
    # Only the first random state / fold is visualised. The full sweep would
    # be itertools.product(get_set_random_states(), range(nfolds)).
    for random_state, foldid in zip([0], [0]):
        data = read_and_split_data2(dataset_file, random_state=random_state,
                                    nfold=nfolds, foldid=foldid)

        # Convert to a libact dataset: each "feature vector" is just the
        # document's index into ``docs``.
        docs = data['train_x'] + data['dev_x'] + data['test_x']
        trn_len = len(data['train_x'])
        dev_len = len(data['dev_x'])

        X = np.arange(len(docs)).reshape(-1, 1)
        X_train = X[:trn_len]
        X_test = X[trn_len + dev_len:]

        y = data['train_y'] + data['dev_y'] + data['test_y']
        y_train = data['train_y']
        y_test = data['test_y']

        n_labeled = 50
        trn_ds, _tst_ds, fully_labeled_trn_ds, n_labeled = prepare_data(
            X_train, y_train, X_test, y_test, n_labeled, random_state=3)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        # Train the CNN readability classifier.
        print('>>>>>>>>>>>UnCertainty ....')
        preprocessor = rlc.Preprocessor(word2vec_file, word_freq_file,
                                        topwords_as_vocab=True)
        classifier = rlc.ReadlevelClassifier(preprocessor, useGPU=True)
        classifier.cuda()
        wrapper = CNNWrapperClassifier((docs, y), classifier,
                                       (data['dev_x'], data['dev_y']))
        wrapper.train(trn_ds)

        _entry_ids, X_pool = zip(*trn_ds.get_unlabeled_entries())
        dvalue = wrapper.predict_proba(X_pool)

        X_embedded = TSNE(n_components=2).fit_transform(dvalue)
        # Column vector of true labels, aligned with the rows of X_pool.
        labels = np.asarray([[lbr.label(x) for x in X_pool]]).transpose()
        print(labels.shape, X_embedded.shape)
        X_embedded = np.concatenate((X_embedded, labels), axis=1)

        frame = pd.DataFrame(data=X_embedded)
        with open(output_file, 'w') as f:
            frame.to_csv(f, header=False)
print "=========After Sampling======" #print "Whole Dataset size: ", datasize print "Test size :", len(y_test) print "Train size :", len(y_train) n_labeled = int(len(y_train) * 0.98) trn_ds = Dataset( X_train, np.concatenate( [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)])) tst_ds = Dataset(X_test, y_test) fully_labeled_trn_ds = Dataset(X_train, y_train) trn_ds2 = copy.deepcopy(trn_ds) lbr = IdealLabeler(fully_labeled_trn_ds) quota = len(y_train) - n_labeled # number of samples to query print "quotas:", quota batch_size = int(quota / 10) quota = 1 # model is the base learner, e.g. LogisticRegression, SVM ... etc. qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression()) model = LogisticRegression() E_in_1, E_out_1, model = run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size) y_pred = model.predict(X_test)
def main():
    """Compare SVM uncertainty sampling, random sampling and an RNN learner.

    Runs three batch active-learning experiments on the same split
    (``runrnn``, ``realrun_random``, ``realrun_qs``), prints their results
    and plots the three curves against the batch number. The figure is
    saved as ``testmerge_rnn_10_10000_0630.png``.
    """
    config = TRNNConfig()
    train_dir = './data/train10_shuf_10000.txt'
    vocab_dir = './data/vocab_train10_shuf_10000.txt'
    batchsize = config.batch_size
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    numclass = config.num_classes
    val_size = 0.15
    test_size = 0.2  # fraction of samples assigned to the test set
    n_labeled = 1000  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]

    result = {'E1': [], 'E2': [], 'E3': []}
    for _ in range(1):
        split = split_train_test_rnn(train_dir, vocab_dir, vocab_size,
                                     test_size, val_size, n_labeled,
                                     wordslength, categories_class)
        (trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al,
         trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn) = split

        # Separate pools for random sampling and uncertainty sampling.
        trn_ds2 = copy.deepcopy(trn_ds_al)
        trn_ds3 = copy.deepcopy(trn_ds_al)
        lbr_al = IdealLabeler(fully_labeled_trn_ds_al)
        lbr_rnn = IdealLabeler(fully_labeled_trn_ds_rnn)

        quota = len(y_train_rnn) - n_labeled

        # Sanity output: number of labelled entries in each dataset.
        for ds in (trn_ds3, tst_ds_al, trn_ds_rnn, tst_ds_rnn, val_ds_rnn):
            print(len(ds.get_labeled_entries()))

        # --- deep model ---------------------------------------------------
        modelrnn = RNN_Probability_Model(vocab_dir, wordslength, batchsize,
                                         numclass, categories_class)
        modelrnn.train(trn_ds_rnn, val_ds_rnn)
        test_acc = modelrnn.test(val_ds_rnn)
        E_out_rnn, E_time_rnn = runrnn(trn_ds_rnn, tst_ds_rnn, val_ds_rnn,
                                       lbr_rnn, modelrnn, quota, test_acc,
                                       batchsize)

        # --- random baseline with an RBF SVM ------------------------------
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        qs2 = RandomSampling(trn_ds2)
        E_out_random, E_time_random = realrun_random(
            trn_ds2, tst_ds_al, lbr_al, model, qs2, quota, batchsize)

        # --- smallest-margin uncertainty sampling with an SVM -------------
        qs = UncertaintySampling(trn_ds3, method='sm',
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        E_out_us, E_time_us = realrun_qs(trn_ds3, tst_ds_al, lbr_al, model,
                                         qs, quota, batchsize)

        result['E1'].append(E_out_us)
        result['E2'].append(E_out_random)
        result['E3'].append(E_out_rnn)

    E_out_us = np.mean(result['E1'], axis=0)
    E_out_random = np.mean(result['E2'], axis=0)
    E_out_rnn = np.mean(result['E3'], axis=0)

    report = (
        ("[Result] for Uncertainty Sampling", E_out_us, E_time_us),
        ("[Result] for Random", E_out_random, E_time_random),
        ("[Result] for RNN", E_out_rnn, E_time_rnn),
    )
    for banner, scores, timings in report:
        print(banner)
        print(scores)
        print(timings)

    # Number of batches = ceil(quota / batchsize).
    intern = int(quota / batchsize) + (0 if quota % batchsize == 0 else 1)
    query_num = np.arange(1, intern + 1)

    plt.figure(figsize=(10, 8))
    for scores, colour, caption in ((E_out_us, 'g', 'Traditional AL'),
                                    (E_out_random, 'k', 'Random'),
                                    (E_out_rnn, 'r', 'Deep AL')):
        plt.plot(query_num, scores, colour, label=caption)
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.savefig('testmerge_rnn_10_10000_0630.png')
    plt.show()
def main(args):
    """Pool-based active learning on the PTSD texts with live accuracy plot.

    Args:
        args: Object with the attributes ``model`` (one of
            ``multinomialnb``/``nb``, ``svc``, ``logisticregression``,
            case-insensitive), ``interactive`` (use the interactive labeler
            and redraw after every query) and ``quota`` (number of queries).

    Raises:
        ValueError: If ``args.model`` names none of the supported models.
    """
    acc_pool = []

    # get the texts and their corresponding labels
    texts, labels = load_ptsd_data()

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from libact.models import SklearnProbaAdapter, SklearnAdapter
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    # Bag-of-words counts over the 5000 most frequent non-stop-words.
    count_vect = CountVectorizer(max_features=5000, stop_words='english')
    features = count_vect.fit_transform(texts).todense().tolist()

    # tf-idf re-weighting is switched off.
    apply_tfidf = False
    if apply_tfidf:
        tfidf_transformer = TfidfTransformer()
        features = tfidf_transformer.fit_transform(features)

    pool, pool_ideal = make_pool(
        features, labels,
        prelabeled=[1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    )

    # Supported estimators and their constructor arguments.
    known_models = {
        'multinomialnb': (MultinomialNB, {}),
        'nb': (MultinomialNB, {}),
        'svc': (SVC, {'probability': True, 'class_weight': 'balanced'}),
        'logisticregression': (LogisticRegression, {}),
    }
    requested = args.model.lower()
    if requested not in known_models:
        raise ValueError('Model not found.')
    sklearn_model, kwargs_model = known_models[requested]

    # initialize the model through the adapter
    model = SklearnProbaAdapter(sklearn_model(**kwargs_model))

    # Query strategy: least confidence ('lc'). It gets its own estimator
    # instance, separate from ``model`` above.
    # https://libact.readthedocs.io/en/latest/libact.query_strategies.html
    qs = UncertaintySampling(
        pool, method='lc',
        model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    def pool_accuracy():
        # Accuracy of the current model over every entry of the pool.
        return model._model.score([x[0] for x in pool.get_entries()], labels)

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Value')

    # Train the model on the train dataset and record the starting accuracy.
    model.train(pool)
    acc_pool = np.append(acc_pool, pool_accuracy())

    # make plot
    query_num = np.arange(0, 1)
    p2, = ax.plot(query_num, acc_pool, 'r', label='Accuracy')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    # Give each label its name (labels are from 0 to n_classes-1)
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    query_i = 1
    while query_i <= args.quota:
        # make a query from the pool
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # get the paper and its label, then update the pool and retrain
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)
        pool.update(ask_id, lb)
        model.train(pool)

        acc_pool = np.append(acc_pool, pool_accuracy())

        # additional evaluations on what is still unlabeled
        idx_features = pool.get_unlabeled_entries()
        unlabeled_x = [entry[1] for entry in idx_features]
        idx = [entry[0] for entry in idx_features]
        pred = model.predict(unlabeled_x)
        print(confusion_matrix(labels[idx], pred))
        print(recall_score(labels[idx], pred))

        if args.interactive:
            # redraw after every answer
            ax.set_xlim((0, query_i))
            ax.set_ylim((0, max(acc_pool) + 0.2))
            p2.set_xdata(np.arange(0, query_i + 1))
            p2.set_ydata(acc_pool)
            plt.draw()

        # update the query counter
        query_i += 1

    if not args.interactive:
        # single redraw once all queries are done
        ax.set_xlim((0, query_i - 1))
        ax.set_ylim((0, max(acc_pool) + 0.2))
        p2.set_xdata(np.arange(0, query_i))
        p2.set_ydata(acc_pool)
        plt.draw()

    print(acc_pool)

    input("Press any key to continue...")
def main():
    """Query ten labels per strategy and track the test error of each.

    Uncertainty sampling and random sampling each work on their own pool.
    One ``LogisticRegression`` model is retrained after every answer, and
    the resulting test errors are drawn into the upper subplot. Labels come
    from an ``IdealLabeler`` built on the full dataset returned by
    ``split_train_test``.
    """
    quota = 10  # number of samples to label per strategy
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()

    def test_error(pool):
        # Retrain on ``pool`` and return the error on the test set.
        model.train(pool)
        return 1 - model.score(tst_ds)

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Errors before any query has been made.
    E_out1 = np.append(E_out1, test_error(trn_ds))
    E_out2 = np.append(E_out2, test_error(trn_ds2))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    # Lower subplot, shifted down and shrunk slightly.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])
    plt.show()

    # The ideal labeler answers from the complete dataset.
    x_ds = ds.data
    print(x_ds.shape)
    y_ds = ds.target
    print(y_ds.shape)
    lbr_ds = Dataset(x_ds, y_ds)
    x, _ = zip(*trn_ds.data)
    print(x)
    lbr = IdealLabeler(lbr_ds)

    def query_once(strategy, pool, banner):
        # Ask one sample, label it, and return the error after retraining.
        ask_id = strategy.make_query()
        print(banner)
        samples, _ = zip(*pool.data)
        pool.update(ask_id, lbr.label(samples[ask_id]))
        return test_error(pool)

    for i in range(quota):
        E_out1 = np.append(
            E_out1,
            query_once(qs, trn_ds, "asking sample from Uncertainty Sampling"))
        E_out2 = np.append(
            E_out2,
            query_once(qs2, trn_ds2, "asking sample from Random Sample"))

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        for line, errors in ((p1, E_out1), (p2, E_out2)):
            line.set_xdata(query_num)
            line.set_ydata(errors)

        plt.show()

    input("Press any key to continue...")
def active_weighted_transfer_learning(candsets, candsets_train, candsets_test,
                                      source_name, target_name, feature,
                                      estimator_name, query_strategy, quota,
                                      weighting=None, disagreement='vote',
                                      n=5):
    """Run ``n`` active transfer learning experiments and collect the scores.

    All source instances are labelled and form the initial training set.
    Target training instances form the pool that is queried. When
    ``weighting`` is given, sample weights for the source instances are
    obtained from ``da.getSampleWeightsOfDomainAdaptation``.

    query_strategy:
        Possible strategies are:
            Baselines: 'uncertainty', 'random'
            Heterogeneous Committees: 'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb',
                'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf', 'lr_svc_rf_dt',
                'lr_svc_dt_gpc', 'lr_svc_dt_xgb'
            Homogeneous Committees: 'homogeneous_committee' (it will then
                take the specified committee for the model used)

    quota:
        Number of queries per run. ``-1`` means "all unlabelled instances of
        the pool" and is resolved in the first run.

    Returns:
        Nested dict
        ``{source_target: {estimator_name: {query_strategy: {w: stats}}}}``
        where ``w`` is ``'no_weighting'`` or the value of ``weighting``.
        ``stats`` holds the run settings, the per-run score lists and, only
        when weighting is used, ``'sample_weights'``.
    """
    training_accuracy_scores, training_f1_scores = [], []
    test_accuracy_scores, test_f1_scores = [], []
    test_precision, test_recall = [], []
    model_pred_prob_start, model_pred_prob_end = [], []
    model_feature_import_start, model_feature_import_end = [], []
    model_depth_tree_start, model_depth_tree_end = [], []
    runtimes = []

    X_source = candsets[source_name][feature].to_numpy()
    y_source = candsets[source_name]['label'].to_numpy()
    X_target = candsets[target_name][feature].to_numpy()

    # the source instances are all labeled and used as initial training set,
    # hence n_labeled == the number of source instances
    n_labeled = y_source.shape[0]

    # check if domain adaptation is desired
    use_weighting = weighting is not None
    if use_weighting:
        print('Unsupervised Domain Adaptation: Calculate sample_weight for the source instances using {}'.format(weighting))
        sample_weight = da.getSampleWeightsOfDomainAdaptation(
            X_source, X_target, weighting)
    else:
        print('No Unsupervised Domain Adaptation performed')
        sample_weight = None

    X_target_train = candsets_train[target_name][feature]
    y_target_train = candsets_train[target_name]['label']
    X_target_test = candsets_test[target_name][feature]
    y_target_test = candsets_test[target_name]['label']

    # libact Dataset holding the target test split used for evaluation
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.format(source_name, target_name))
    for run_no in range(1, n + 1):
        print('{}. Run of {}'.format(run_no, n))
        train_ds, fully_labeled_trn_ds = initializeAWTLPool(
            X_source, y_source, X_target_train, y_target_train, n_labeled,
            sample_weight)

        # quota -1 means "not a fixed amount": use every instance of the
        # training pool that is not labelled yet
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # create the IdealLabeler with the fully labeled training set
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = la.getLearningModel(estimator_name)
        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement,
                                  estimator_name)

        outcome = run_atl(train_ds, test_ds, lbr, model, qs, quota, n_labeled)
        (train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_,
         runt, model_pred_prob, model_feature_import,
         model_depth_tree) = outcome

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)

        # index 0 is stored under the "*_start" keys, index 1 under "*_end"
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        # tree depth is only recorded for models named 'rf' or 'dt'
        if model.name in ('rf', 'dt'):
            model_depth_tree_start.append(model_depth_tree[0])
            model_depth_tree_end.append(model_depth_tree[1])
        runtimes.append(runt)

    runt = np.mean(runtimes)
    key = '{}_{}'.format(source_name, target_name)

    stats = {
        'quota': quota,
        'n_runs': n,
        'n_init_labeled': n_labeled,
        'model_params': model_.get_params(),
        'avg_runtime': runt,
        'training_accuracy_scores': training_accuracy_scores,
        'training_f1_scores': training_f1_scores,
        'test_accuracy_scores': test_accuracy_scores,
        'test_f1_scores': test_f1_scores,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'model_pred_prob_start': model_pred_prob_start,
        'model_feature_import_start': model_feature_import_start,
        'model_depth_tree_start': model_depth_tree_start,
        'model_pred_prob_end': model_pred_prob_end,
        'model_feature_import_end': model_feature_import_end,
        'model_depth_tree_end': model_depth_tree_end,
    }

    # The weighting strategy is part of the key path so that results can be
    # told apart; sample weights are only stored when weighting was applied.
    if use_weighting:
        stats['sample_weights'] = sample_weight
        weighting_key = weighting
    else:
        weighting_key = 'no_weighting'

    d = {key: {estimator_name: {query_strategy: {weighting_key: stats}}}}
    return d
def _sgd_label(clf):
    """Return a short 'Name(loss=... , penalty=...)' label cut out of str(clf)."""
    text = str(clf)
    loss_at = text.find('loss')
    penalty_at = text.find('penalty')
    return (text[0:text.find('(')] + '(' +
            text[loss_at:text.find(',', loss_at)] + ' , ' +
            text[penalty_at:text.find(',', penalty_at)] + ')')


def main():
    """Run the SGD active-learning grid and dump the results to .xls workbooks.

    For every CSV in the module-level ``l_csv``, every fold, every SGD variant,
    every initial labeled-set size and every sampling metric, one active
    learning run is executed and its error curve is written as one column of
    the fold's worksheet.

    Reads the module globals ``l_csv`` and ``test_size``; rebinds the globals
    ``count`` and ``path_csv``; appends elapsed seconds to ``times_l``.
    """
    global count, path_csv, test_size
    path_csv = ''
    random_shuffle_id = 23
    for file_csv in l_csv:
        book = xlwt.Workbook(encoding="utf-8")
        start = datetime.now()
        folds = [1, 2]  # , 3, 4, 5, 7, 23, 66, 123, 2018]
        for fold in folds:
            sheet1 = book.add_sheet("Sheet " + str(fold))
            SIZE = (1 - test_size) * split_train_test(
                test_size, 1, fold, 0, random_shuffle_id, file_csv, path_csv)
            count = -1
            # Raise the upper bound of this range to repeat the same fold with
            # different shuffles (e.g. a 5x2 evaluation).
            for col in range(1, 2):
                # Single-argument print() calls behave the same on Python 2
                # and Python 3; the double space mirrors the old output.
                print('***********file*********** =  {}'.format(file_csv))
                print('***********col************ =  {}'.format(col))
                print('***********fold*********** =  {}'.format(fold))
                print('SIZE of L + U =  {}'.format(int(SIZE)))
                print('')
                myspace = np.linspace(
                    int(0.05 * SIZE), int(0.25 * SIZE) + 1, 3)
                learners = [
                    SGD(loss='log'),
                    SGD(loss='modified_huber'),
                    SGD(loss='log', penalty='l1'),
                    SGD(loss='log', penalty='elasticnet'),
                    SGD(loss='modified_huber', penalty='l1'),
                    SGD(loss='modified_huber', penalty='elasticnet'),
                ]
                for my_clf in learners:
                    counter_j = -1
                    counter_jj = -1
                    count = count + 1
                    clf_label = _sgd_label(my_clf)
                    print(clf_label)
                    for j in myspace:
                        j = int(round(j))
                        counter_j = counter_j + 1
                        n_labeled = j  # number of initially labeled samples
                        print('**** Labeled instances =  {}'.format(j))
                        metrics = ['lc', 'entropy', 'sm', 'random']
                        for jj in metrics:
                            (trn_ds, tst_ds, y_train, fully_labeled_trn_ds,
                             initial_instances) = split_train_test(
                                 test_size, n_labeled, fold,
                                 random_shuffle_id, col, file_csv, path_csv)
                            lbr = IdealLabeler(fully_labeled_trn_ds)
                            quota = len(y_train) - n_labeled  # samples to query
                            counter_jj = counter_jj + 1
                            # Uncertainty sampling for every metric except the
                            # 'random' baseline.
                            if jj != 'random':
                                print('**** Metric of Uncertainty Sampling '
                                      'strategy =  {}'.format(jj))
                                qs1 = UncertaintySampling(
                                    trn_ds, kernel=jj,
                                    model=SklearnProbaAdapter(my_clf))
                            else:
                                print('**** Baseline Sampling strategy =  {}'
                                      .format(jj))
                                qs1 = RandomSampling(
                                    trn_ds,
                                    model=SklearnProbaAdapter(my_clf))
                            model = SklearnProbaAdapter(my_clf)
                            E_out_1, _, trn_ds_returned, aa, bb = run(
                                trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            # Each learner gets its own band of rows: an
                            # 8-row header followed by the error curve.
                            down_cells = len(E_out_1) + 9 if count != 0 else 0
                            i = 8 + down_cells * count
                            column = counter_jj + counter_j
                            sheet1.write(i - 7, column, jj)  # sampling metric
                            sheet1.write(i - 6, column, quota)  # amount of U
                            # instances inserted per iteration
                            sheet1.write(i - 5, column, aa)
                            sheet1.write(i - 4, column, bb)  # amount of L
                            # labeled / unlabeled counts after the run
                            sheet1.write(i - 3, column,
                                         trn_ds_returned.len_labeled())
                            sheet1.write(i - 2, column,
                                         trn_ds_returned.len_unlabeled())
                            sheet1.write(i - 8, column, clf_label)
                            for err in E_out_1:
                                sheet1.write(i, column, err)
                                i = i + 1
                            print('')
            # NOTE(review): the nesting of the four statements below was
            # reconstructed from a whitespace-collapsed source; they are placed
            # per fold because the file name depends on `fold` - confirm.
            print("> Compilation Time : %s" %
                  (datetime.now() - start).total_seconds())
            # The misspelled "experimetn" prefix is kept on purpose so files
            # produced by earlier runs keep the same names.
            xls_name = ("AIAIexperimetn_" + file_csv[0:-4] + "_incremental_" +
                        str(fold) + ".xls")
            print(xls_name)
            book.save(xls_name)
            times_l.append((datetime.now() - start).total_seconds())
def _run_uncertainty_batch(pool, test_set, labeler, quota, batch):
    """Run smallest-margin uncertainty sampling on `pool` with the given batch size."""
    strategy = UncertaintySampling(pool, method='sm',
                                   model=SVM(decision_function_shape='ovr'))
    learner = SVM(kernel='rbf', decision_function_shape='ovr')
    return realrun_qs(pool, test_set, labeler, learner, strategy, quota, batch)


def main():
    """Compare uncertainty sampling at query batch sizes 1, 16, 64, 128 and 256.

    Every batch size runs on its own copy of the initial pool. The raw curves
    and timings are written to ``queryresult4.txt``; the curves for batch sizes
    1, 16 and 64 are aligned to one point per 64 queries, averaged over runs,
    plotted, and saved as a PNG.
    """
    config = TRNNConfig()
    train_dir = './data/train10_shuf_3000.txt'
    vocab_dir = './data/vocab_train10_shuf_3000.txt'
    batchsize = 64
    wordslength = config.seq_length
    vocab_size = config.vocab_size
    val_size = 0.15
    test_size = 0.2  # fraction of the samples assigned to the test set
    n_labeled = 300  # number of samples that are initially labeled
    categories_class = [
        '体育', '家居', '娱乐', '游戏', '财经', '房产', '教育', '时尚', '时政', '科技'
    ]
    batch_one = 1
    batch_sixteen = 16
    batch_128 = 128
    batch_256 = 256
    result = {'E1': [], 'E2': [], 'E3': []}

    # The context manager guarantees the result file is flushed and closed
    # even if one of the runs raises.
    with open('queryresult4.txt', 'w') as resultfile:
        for _ in range(1):
            (trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al,
             trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn,
             val_ds_rnn) = split_train_test_rnn(
                 train_dir, vocab_dir, vocab_size, test_size, val_size,
                 n_labeled, wordslength, categories_class)
            # Copy the pool before any run mutates it.
            trn_ds3 = copy.deepcopy(trn_ds_al)
            trn_ds4 = copy.deepcopy(trn_ds_al)
            trn_ds5 = copy.deepcopy(trn_ds_al)
            trn_ds6 = copy.deepcopy(trn_ds_al)
            lbr_al = IdealLabeler(fully_labeled_trn_ds_al)
            quota = len(y_train_rnn) - n_labeled

            E_out_us16, E_time_us16 = _run_uncertainty_batch(
                trn_ds_al, tst_ds_al, lbr_al, quota, batch_sixteen)
            E_out_us64, E_time_us64 = _run_uncertainty_batch(
                trn_ds3, tst_ds_al, lbr_al, quota, batchsize)
            E_out_us1, E_time_us1 = _run_uncertainty_batch(
                trn_ds4, tst_ds_al, lbr_al, quota, batch_one)
            E_out_us128, E_time_us128 = _run_uncertainty_batch(
                trn_ds5, tst_ds_al, lbr_al, quota, batch_128)
            # Fixed: this run used to reuse the strategy bound to trn_ds5, so
            # its queries were chosen from a different pool than trn_ds6.
            E_out_us256, E_time_us256 = _run_uncertainty_batch(
                trn_ds6, tst_ds_al, lbr_al, quota, batch_256)

            for curve, timing in ((E_out_us1, E_time_us1),
                                  (E_out_us16, E_time_us16),
                                  (E_out_us64, E_time_us64),
                                  (E_out_us128, E_time_us128),
                                  (E_out_us256, E_time_us256)):
                resultfile.write(str(curve) + '\n')
                resultfile.write(str(timing) + '\n')

            # Align every curve to one point per `batchsize` queries. Each run
            # contributes ONE sub-sampled curve, so the mean over axis 0 below
            # averages across runs instead of collapsing to a single number.
            result['E1'].append(
                [E_out_us1[t] for t in range(0, len(E_out_us1), batchsize)])
            stride_16 = batchsize // batch_sixteen
            result['E3'].append(
                [E_out_us16[m] for m in range(0, len(E_out_us16), stride_16)])
            result['E2'].append(E_out_us64)

    E_out_us1 = np.mean(result['E1'], axis=0)
    E_out_us64 = np.mean(result['E2'], axis=0)
    E_out_us16 = np.mean(result['E3'], axis=0)

    print(np.shape(E_out_us1))
    print(np.shape(E_out_us16))
    print(np.shape(E_out_us64))
    print("[Result] for Uncertainty Sampling")
    print(E_out_us1)
    print(E_time_us1)
    print(E_out_us16)
    print(E_time_us16)
    print(E_out_us64)
    print(E_time_us64)

    # Number of 64-sample batches needed to spend the quota (ceiling division).
    intern = (quota + batchsize - 1) // batchsize

    # Plot the learning curves; the x-axis is the number of 64-sample batches.
    query_num = np.arange(1, intern + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_us1, 'b', label='Single1')
    plt.plot(query_num, E_out_us16, 'r', label='Batch16')
    plt.plot(query_num, E_out_us64, 'g', label='Batch64')
    plt.xlabel('Number of Batches')
    plt.ylabel('Accuracy')
    plt.title('Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.savefig('testmerge_rnn_10_3000_0705_time.png')
    plt.show()
def main(args):
    """Run an uncertainty-sampling loop with the LSTM model, one query at a time.

    Loads the texts and labels for ``args.dataset``, builds (or reloads from a
    pickle cache) the embedding layer, then performs ``args.quota`` queries.
    After each query the model is retrained and the second prediction column
    for all still-unlabeled entries is stored in one result column. The table
    is written to a CSV under ``ACTIVE_DIR``.
    """
    seed = 2018 * args.T
    pickle_file_path = os.path.join(TEMP_DATA_DIR,
                                    args.dataset + '_pickle.pickle')

    if args.dataset == 'ptsd':
        texts, lbls = load_ptsd_data()
    else:
        texts, lbls = load_drug_data(args.dataset)

    # Turn the texts into padded sequences with their labels.
    textManager = TextManager()
    data, labels, word_index = textManager.sequence_maker(texts, lbls)
    max_num_words = textManager.max_num_words
    max_sequence_length = textManager.max_sequence_length

    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    # Reuse the cached embedding layer when it exists, otherwise build it
    # once and cache it.
    if os.path.isfile(pickle_file_path):
        embedding_layer = load_pickle(pickle_file_path)
    else:
        if not os.path.exists(TEMP_DATA_DIR):
            os.makedirs(TEMP_DATA_DIR)
        embedding = Word2VecEmbedding(word_index, max_num_words,
                                      max_sequence_length)
        embedding.load_word2vec_data(GLOVE_PATH)
        embedding_layer = embedding.build_embedding()
        dump_pickle(embedding_layer, pickle_file_path)

    # Only the LSTM model is available.
    if args.model.lower() != 'lstm':
        raise ValueError('Model not found.')
    deep_model = LSTM_Libact
    kwargs_model = dict(
        backwards=True,
        dropout=0.4,
        optimizer='rmsprop',
        max_sequence_length=max_sequence_length,
        embedding_layer=embedding_layer,
    )
    model = deep_model(**kwargs_model)

    # Least-confidence uncertainty sampling, backed by its own model instance.
    # TODO: check that 'lc' works correctly; add a random baseline as well.
    qs = UncertaintySampling(pool, method='lc',
                             model=deep_model(**kwargs_model))

    # Labels are named "0" and "1".
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    result_df = pd.DataFrame(
        {'label': [entry[1] for entry in pool_ideal.data]})

    # TODO: label several papers per round (batch_size) instead of one.
    query_i = 1
    while query_i <= args.quota:
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # Label the queried paper and store the label in the pool.
        queried_paper = pool.data[ask_id][0]
        pool.update(ask_id, lbr.label(queried_paper))

        # Retrain, then score everything that is still unlabeled.
        model.train(pool)
        unlabeled = pool.get_unlabeled_entries()
        idx = [entry[0] for entry in unlabeled]
        features = [entry[1] for entry in unlabeled]
        pred = model.predict(features)

        # One column per query round; -1 marks already-labeled entries.
        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        query_i += 1

    # Save the result table.
    output_dir = os.path.join(ACTIVE_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(output_dir,
                               'sr_lstm_active{}.csv'.format(args.T))
    result_df.to_csv(export_path)
    input("Press any key to continue...")
def active_learning(candsets_train,
                    candsets_test,
                    target_name,
                    feature,
                    estimator_name,
                    query_strategy,
                    n_labeled,
                    quota,
                    disagreement='vote',
                    n=5):
    """Run `n` active-learning experiments (no transfer) on one candidate set.

    Args:
        candsets_train: mapping; ``candsets_train[target_name]`` is indexed
            with `feature` and ``'label'`` to get the training pool.
        candsets_test: mapping with the same layout, used as the test set.
        target_name: key of the candidate set to learn on.
        feature: feature selector applied to the candidate sets.
        estimator_name: name passed to ``la.getLearningModel``.
        query_strategy: name passed to ``com.getQueryStrategy``. Possible
            strategies are the baselines 'uncertainty' and 'random'; the
            heterogeneous committees 'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb',
            'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf', 'lr_svc_rf_dt',
            'lr_svc_dt_gpc', 'lr_svc_dt_xgb'; and 'homogeneous_committee'
            (takes the committee specified for the model used).
        n_labeled: number of initially labeled instances.
        quota: number of queries per run; -1 means "query the whole pool".
        disagreement: disagreement measure handed to the query strategy.
        n: number of runs.

    Returns:
        Nested dict ``{target_name: {estimator_name: {query_strategy: {...}}}}``
        holding the per-run score lists and the average runtime.
    """
    training_accuracy_scores, training_f1_scores = [], []
    test_accuracy_scores, test_f1_scores = [], []
    test_precision, test_recall = [], []
    model_pred_prob_start, model_feature_import_start = [], []
    model_depth_tree_start = []
    model_pred_prob_end, model_feature_import_end = [], []
    model_depth_tree_end = []
    runtimes = []

    X_target_train = candsets_train[target_name][feature]
    y_target_train = candsets_train[target_name]['label']
    X_target_test = candsets_test[target_name][feature]
    y_target_test = candsets_test[target_name]['label']

    # libact Dataset object containing the test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting AL Experiments (no transfer!) for candset {}'.format(
        target_name))

    for i in range(n):
        print('{}. Run of {}'.format(i + 1, n))
        train_ds, fully_labeled_trn_ds = initializeALPool(
            X_target_train, y_target_train, n_labeled)

        # A quota of -1 means it is not a fixed amount: query every instance
        # of the pool that is not labeled yet.
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # The IdealLabeler answers queries from the fully labeled pool.
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = la.getLearningModel(estimator_name)
        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement,
                                  estimator_name)

        (train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_, runt,
         model_pred_prob, model_feature_import, model_depth_tree) = run_al(
             train_ds, test_ds, lbr, model, qs, quota, n_labeled)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        # Tree depth is only recorded for the 'rf' and 'dt' models.
        if model.name == 'rf' or model.name == 'dt':
            model_depth_tree_start.append(model_depth_tree[0])
            model_depth_tree_end.append(model_depth_tree[1])
        # Fixed: the runtime of each run was never collected, so the mean
        # below was taken over an empty list and always came out as nan.
        runtimes.append(runt)

    runt = np.mean(runtimes)
    d = {
        target_name: {
            estimator_name: {
                query_strategy: {
                    'quota': quota,
                    'n_runs': n,
                    'n_init_labeled': n_labeled,
                    'model_params': model_.get_params(),
                    'avg_runtime': runt,
                    'training_accuracy_scores': training_accuracy_scores,
                    'training_f1_scores': training_f1_scores,
                    'test_accuracy_scores': test_accuracy_scores,
                    'test_f1_scores': test_f1_scores,
                    'test_precision': test_precision,
                    'test_recall': test_recall,
                    'model_pred_prob_start': model_pred_prob_start,
                    'model_feature_import_start': model_feature_import_start,
                    'model_depth_tree_start': model_depth_tree_start,
                    'model_pred_prob_end': model_pred_prob_end,
                    'model_feature_import_end': model_feature_import_end,
                    'model_depth_tree_end': model_depth_tree_end,
                }
            }
        }
    }
    return d
def active_transfer_learning(candsets,
                             source_name,
                             target_name,
                             feature,
                             estimator_name,
                             query_strategy,
                             quota,
                             disagreement='vote',
                             n=5):
    """Run `n` active-transfer-learning experiments from a source to a target set.

    All source instances start out labeled and form the initial training
    set; the target set is split 67/33 (stratified, random_state=42) into a
    query pool and a test set.

    `query_strategy` is handed to ``com.getQueryStrategy``. Possible values:
    the baselines 'uncertainty' and 'random'; the heterogeneous committees
    'lr_lscv_rf_dt', 'lr_lsvc_dt_xgb', 'lr_lsvc_dt_gpc', 'lr_svc_dt_xgb_rf',
    'lr_svc_rf_dt', 'lr_svc_dt_gpc', 'lr_svc_dt_xgb'; and
    'homogeneous_committee' (takes the committee specified for the model).

    A `quota` of -1 means "query every unlabeled instance of the pool".

    Returns a nested dict keyed by '<source>_<target>', then estimator name,
    then query strategy, holding the per-run score lists and average runtime.
    """
    scores = {
        'train_acc': [],
        'train_f1': [],
        'test_acc': [],
        'test_f1': [],
    }
    runtimes = []

    source_set = candsets[source_name]
    target_set = candsets[target_name]
    X_source = source_set[feature].to_numpy()
    y_source = source_set['label'].to_numpy()
    X_target = target_set[feature].to_numpy()
    y_target = target_set['label'].to_numpy()

    # Every source instance is labeled up front, so the number of initially
    # labeled instances equals the size of the source set.
    n_labeled = y_source.shape[0]

    X_target_train, X_target_test, y_target_train, y_target_test = \
        train_test_split(X_target, y_target, test_size=0.33, random_state=42,
                         stratify=y_target)
    print(
        'Train_test_split: random_state = 42, stratified ; LR solver: liblinear'
    )

    # Held-out target instances used for evaluation.
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.
          format(source_name, target_name))

    for run_no in range(1, n + 1):
        print('{}. Run of {}'.format(run_no, n))
        train_ds, fully_labeled_trn_ds = initializeATLPool(
            X_source, y_source, X_target_train, y_target_train, n_labeled)

        # -1 stands for "no fixed quota": spend one query on each instance
        # of the pool that is still unlabeled.
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # The labeler answers queries from the fully labeled training pool.
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = la.getLearningModel(estimator_name)
        qs = com.getQueryStrategy(query_strategy, train_ds, disagreement,
                                  estimator_name)

        train_acc, train_f1, test_acc, test_f1, model_, runt = run_atl(
            train_ds, test_ds, lbr, model, qs, quota, n_labeled)

        scores['train_acc'].append(train_acc)
        scores['train_f1'].append(train_f1)
        scores['test_acc'].append(test_acc)
        scores['test_f1'].append(test_f1)
        runtimes.append(runt)

    summary = {
        'quota': quota,
        'n_runs': n,
        'n_init_labeled': n_labeled,
        'model_params': model_.get_params(),
        'avg_runtime': np.mean(runtimes),
        'training_accuracy_scores': scores['train_acc'],
        'training_f1_scores': scores['train_f1'],
        'test_accuracy_scores': scores['test_acc'],
        'test_f1_scores': scores['test_f1'],
    }
    key = '{}_{}'.format(source_name, target_name)
    return {key: {estimator_name: {query_strategy: summary}}}
def main(args):
    """Run a batch active-learning loop with the LSTM model and save the scores.

    Each round retrains the model on the current pool, stores the second
    prediction column for all unlabeled entries in one result column, then
    queries ``args.batch_size`` entries with the chosen strategy and labels
    them. The loop runs for rounds 0..``args.quota`` inclusive and stops early
    once the pool has no unlabeled entries left.

    Raises:
        ValueError: if ``args.query_strategy`` or ``args.model`` is unknown.
    """
    # Fail fast: an unknown strategy used to surface only after the first
    # training round, as a NameError on the unbound strategy object.
    if args.query_strategy not in ('lc', 'random'):
        raise ValueError('Query strategy not found.')

    # Read dataset, labels and embedding layer from the pickle file.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # pickle files produced by this project.
    pickle_fp = os.path.join(TEMP_DATA_DIR, args.dataset + '_pickle.pickle')
    with open(pickle_fp, 'rb') as f:
        data, labels, embedding_layer = pickle.load(f)

    # Label the first batch (the initial labels).
    seed = 2017 + args.T
    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    # Get the model.
    if args.model.lower() == 'lstm':
        deep_model = LSTM_Libact
        kwargs_model = {
            'backwards': True,
            'dropout': 0.4,
            'optimizer': 'rmsprop',
            'max_sequence_length': 1000,
            'embedding_layer': embedding_layer
        }
    else:
        raise ValueError('Model not found.')

    np.random.seed(seed)
    tf.set_random_seed(seed)
    model = deep_model(**kwargs_model)

    # Labels are named "0" and "1".
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})
    query_i = 0

    while query_i <= args.quota:
        print("Asking sample from pool with Uncertainty Sampling")

        # Re-seed before every training round so rounds are reproducible.
        np.random.seed(seed)
        tf.set_random_seed(seed)

        # Train the model on everything labeled so far.
        model.train(pool)

        # Predict the label of the unlabeled entries in the pool.
        idx_features = pool.get_unlabeled_entries()
        idx = [x[0] for x in idx_features]
        features = [x[1] for x in idx_features]
        if not idx:
            # Nothing left to score or query.
            print('No unlabeled entries left; stopping early.')
            break
        pred = model.predict(features)

        print('len(idx)', len(idx))
        print('idx[0]', idx[0])
        # `pred` is aligned with `idx` by position, so the prediction for
        # pool entry idx[0] is row 0 (indexing with the pool index was wrong).
        print('pred[0,1]', pred[0, 1])

        # Store the result; -1 marks entries that are already labeled.
        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        # Make the query with the freshly trained model.
        if args.query_strategy == 'lc':
            qs = UncertaintySampling(pool, method='lc', model=model)
        else:
            qs = RandomSampling(pool)

        ask_id = qs.make_query(n=args.batch_size)
        # A single query may come back as a bare index; normalise to a list.
        if not isinstance(ask_id, list):
            ask_id = [ask_id]

        for entry_id in ask_id:
            # Label the entry and update the pool with the new label.
            data_point = pool.data[entry_id][0]
            lb = lbr.label(data_point)
            print("Index {} returned. True label is {}.".format(entry_id, lb))
            pool.update(entry_id, lb)

        lbld = [x[1] for x in pool.data if x[1] is not None]
        print(len(lbld))

        # Update the query counter.
        query_i += 1

    # Save the result to a file.
    output_dir = os.path.join(ACTIVE_OUTPUT_DIR, args.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    export_path = os.path.join(
        output_dir,
        'dataset_{}_sr_lstm_active{}_q_{}.csv'.format(args.dataset, args.T,
                                                      args.query_strategy))
    result_df.to_csv(export_path)
    input("Press any key to continue...")
def main():
    """Compare five query strategies on the 'australian' dataset and plot them.

    The experiment is repeated 20 times. In every repetition uncertainty
    sampling, random sampling, QUIRE, HintSVM and ALBL each run on their own
    copy of the same initial pool; the second value returned by ``run`` is
    averaged per strategy over the repetitions and plotted against the
    number of queries.
    """
    # Parameters: dataset location, test split and initial labeled-set size.
    ds_name = 'australian'
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '%s.txt' % ds_name)
    test_size = 0.33  # fraction of samples randomly assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled

    def linear_svm():
        return SVM(kernel='linear', decision_function_shape='ovr')

    # One list of curves per strategy, in plotting order.
    curves = [[] for _ in range(5)]

    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        # Every strategy gets its own copy of the initial pool.
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Uncertainty sampling
        qs = UncertaintySampling(trn_ds,
                                 model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, linear_svm(), qs, quota)
        curves[0].append(E_out_1.tolist())

        # Random sampling
        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, linear_svm(), qs2, quota)
        curves[1].append(E_out_2.tolist())

        # QUIRE
        qs3 = QUIRE(trn_ds3)
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, linear_svm(), qs3, quota)
        curves[2].append(E_out_3.tolist())

        # HintSVM
        qs4 = HintSVM(trn_ds4, cl=1.0, ch=1.0)
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, linear_svm(), qs4, quota)
        curves[3].append(E_out_4.tolist())

        # ALBL over the three strategies above
        qs5 = ActiveLearningByLearning(
            trn_ds5,
            query_strategies=[
                UncertaintySampling(trn_ds5, model=linear_svm()),
                QUIRE(trn_ds5),
                HintSVM(trn_ds5, cl=1.0, ch=1.0),
            ],
            T=quota,
            uniform_sampler=True,
            model=linear_svm())
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, linear_svm(), qs5, quota)
        curves[4].append(E_out_5.tolist())

    # Average every strategy's curves over the repetitions.
    result = [np.mean(per_strategy, axis=0) for per_strategy in curves]

    # Plot the learning curves: x is the number of queries, y the value
    # reported by `run` (labelled as the error rate).
    query_num = np.arange(1, quota + 1)
    series = (
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    )
    for curve, (colour, name) in zip(result, series):
        plt.plot(query_num, curve, colour, label=name)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def atlx(candsets,
         candsets_train,
         candsets_test,
         source_name,
         target_name,
         feature,
         bootstrap_clf,
         query_strategy,
         quota,
         warm_start,
         n_bootstrapped_samples=2,
         weighting=None,
         disagreement='vote',
         n=5):
    """Run `n` bootstrapped active-transfer-learning experiments with a random forest.

    When `weighting` is given, sample weights for the source instances are
    computed via ``da.getSampleWeightsOfDomainAdaptation`` from the complete
    source and target feature sets and handed to the pool initialisation.

    `query_strategy` is handed to ``getQueryStrategy``. Possible values: the
    baselines 'uncertainty' and 'random'; the heterogeneous committees
    'lr_lrcv_rf_dt' and 'lr_lsvc_dt_gpc'; and 'homogeneous_committee' (takes
    the committee specified for the model used).

    A `quota` of -1 means "query every unlabeled instance of the pool".

    Returns a nested dict keyed by '<source>_<target>', then 'rf', then the
    query strategy, then the weighting name ('no_weighting' when `weighting`
    is None), holding the per-run score lists and run metadata.
    """
    training_accuracy_scores, training_f1_scores = [], []
    test_accuracy_scores, test_f1_scores = [], []
    test_precision, test_recall = [], []
    model_pred_prob_start, model_pred_prob_end = [], []
    model_feature_import_start, model_feature_import_end = [], []
    model_depth_tree_start, model_depth_tree_end = [], []
    runtimes = []
    # NOTE(review): these two lists are filled per run but the returned dict
    # reports only the values of the last run - confirm that is intended.
    share_noise_labeled_set_pos_lst, share_noise_labeled_set_neg_lst = [], []

    X_source = candsets[source_name][feature].to_numpy()
    y_source = candsets[source_name]['label'].to_numpy()
    X_target = candsets[target_name][feature].to_numpy()
    X_target_train = candsets_train[target_name][feature].to_numpy()
    y_target_train = candsets_train[target_name]['label'].to_numpy()
    X_target_test = candsets_test[target_name][feature].to_numpy()
    y_target_test = candsets_test[target_name]['label'].to_numpy()

    n_labeled = n_bootstrapped_samples

    # Optional unsupervised domain adaptation: weight the source instances
    # using the whole (unlabeled) source and target data.
    if weighting is not None:
        print(
            'Unsupervised Domain Adaptation: Calculate sample_weight for the source instances using {}'
            .format(weighting))
        sample_weight = da.getSampleWeightsOfDomainAdaptation(
            X_source, X_target, weighting)
    else:
        print('No Unsupervised Domain Adaptation performed')
        sample_weight = None

    # libact Dataset object containing the test set
    test_ds = Dataset(X=X_target_test, y=y_target_test)

    print('Starting ATL Experiments (WITH transfer!) source {} and target {}'.
          format(source_name, target_name))

    for run_idx in range(n):
        print('{}. Run of {}'.format(run_idx + 1, n))
        (train_ds, fully_labeled_trn_ds, n_labeled_,
         share_noise_labeled_set_pos,
         share_noise_labeled_set_neg) = initializeATLPool(
             X_source, y_source, X_target_train, y_target_train,
             sample_weight, bootstrap_clf, n_labeled)
        share_noise_labeled_set_pos_lst.append(share_noise_labeled_set_pos)
        share_noise_labeled_set_neg_lst.append(share_noise_labeled_set_neg)

        # -1 stands for "no fixed quota": spend one query on each instance
        # of the pool that is still unlabeled.
        if quota == -1:
            quota = train_ds.len_unlabeled()

        # The labeler answers queries from the fully labeled training pool.
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = la.RandomForest_(random_state=42, warm_start=warm_start,
                                 n_estimators=10)
        qs = getQueryStrategy(query_strategy, train_ds, disagreement, 'rf')

        (train_acc, train_f1, test_acc, test_f1, test_p, test_r, model_, runt,
         share_of_corrected_labels, model_pred_prob, model_feature_import,
         model_depth_tree) = run_weighted_atl(train_ds, test_ds, lbr, model,
                                              qs, quota)

        training_accuracy_scores.append(train_acc)
        training_f1_scores.append(train_f1)
        test_accuracy_scores.append(test_acc)
        test_f1_scores.append(test_f1)
        test_precision.append(test_p)
        test_recall.append(test_r)
        model_pred_prob_start.append(model_pred_prob[0])
        model_feature_import_start.append(model_feature_import[0])
        model_pred_prob_end.append(model_pred_prob[1])
        model_feature_import_end.append(model_feature_import[1])
        model_depth_tree_start.append(model_depth_tree[0])
        model_depth_tree_end.append(model_depth_tree[1])
        runtimes.append(runt)

    runt = np.mean(runtimes)
    key = '{}_{}'.format(source_name, target_name)

    # The weighting name is part of the result path so runs with and without
    # domain adaptation can be told apart.
    weighting_key = 'no_weighting' if weighting is None else weighting

    payload = {
        'quota': quota,
        'n_runs': n,
        'n_init_labeled': n_labeled,
        'share_noise_labeled_set_pos': share_noise_labeled_set_pos,
        'share_noise_labeled_set_neg': share_noise_labeled_set_neg,
        'share_of_corrected_labels': share_of_corrected_labels,
        'disagreement_measure': disagreement,
        'model_params': model_.get_params(),
        'avg_runtime': runt,
        'training_accuracy_scores': training_accuracy_scores,
        'training_f1_scores': training_f1_scores,
        'test_accuracy_scores': test_accuracy_scores,
        'test_f1_scores': test_f1_scores,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'model_pred_prob_start': model_pred_prob_start,
        'model_feature_import_start': model_feature_import_start,
        'model_depth_tree_start': model_depth_tree_start,
        'model_pred_prob_end': model_pred_prob_end,
        'model_feature_import_end': model_feature_import_end,
        'model_depth_tree_end': model_depth_tree_end,
        'sample_weights': sample_weight,
    }
    d = {key: {'rf': {query_strategy: {weighting_key: payload}}}}
    return d
def train_exclude_user(user_id=None, device_type=None, n_class=None):
    """Compare three query strategies on the data of all users but one.

    Loads the data of every user except ``user_id`` and then repeats an
    active-learning experiment 20 times.  Each repetition draws a fresh
    train/test split and runs UncertaintySampling (method ``'sm'``),
    RandomSampling and ALCE with the same query budget and the same
    labeler.  The second value returned by ``run`` (named ``E_out`` here)
    is collected per repetition, saved to ``results/``, and its mean over
    the repetitions is printed (every 5th point) and plotted.

    Args:
        user_id: user to leave out; forwarded as ``ex_user_id`` and used in
            the output file names and the plot title.
        device_type: forwarded as ``device`` and used in the output file
            names.
        n_class: forwarded to ``processing_training_data`` and
            ``split_train_test``; also used in the output file names.

    Returns:
        None.  Results are written via ``save_file`` and shown with
        matplotlib.
    """
    train_data = waterloo_iv_processing.get_all_but_one_user(
        ex_user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=train_data)

    n_experiments = 20
    # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    test_size = 0.2
    quota = 350  # number of samples to query

    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(n_experiments):
        print('exp:', i)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        # Every strategy starts from its own copy of the same initial pool.
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        # The same model object is handed to all three runs below.
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        qs = UncertaintySampling(
            trn_ds, method='sm', model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    # Average the curves over the repetitions.
    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    # str.format stringifies every component, so the default
    # device_type=None no longer raises TypeError after all experiments
    # have already been computed; for string inputs the file names are
    # identical to the previous '+'-concatenated ones.
    for key in ('E1', 'E2', 'E3'):
        save_file(
            'results/exclude_user_{}_{}_{}_class_{}.txt'.format(
                user_id, device_type, key, n_class),
            result[key])

    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    uncert, = plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    rd, = plt.plot(query_num, E_out_2, 'k', label='Random')
    alce, = plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (exclude) ' + 'user ' + str(user_id))
    plt.legend(handles=[uncert, rd, alce], loc=1)
    plt.show()
def _build_cnn_wrapper(word2vec_file, word_freq_file, docs, labels, dev_set):
    """Create a fresh ReadlevelClassifier wrapped in CNNWrapperClassifier."""
    preprocessor = rlc.Preprocessor(word2vec_file, word_freq_file,
                                    topwords_as_vocab=True)
    cls = rlc.ReadlevelClassifier(preprocessor, useGPU=True)
    # NOTE(review): presumably requires a CUDA device -- confirm.
    cls.cuda()
    return CNNWrapperClassifier((docs, labels), cls, dev_set)


def _tag_run_results(run_results, state, foldid, strategy):
    """Convert (num_instance, f1, ask_ids) triples into result rows."""
    return [
        new_result_item(
            'state={0}/foldid={1}/ask_ids={2}'.format(state, foldid, ask_ids),
            'cnn-sep', strategy, num_instance, f1)
        for num_instance, f1, ask_ids in run_results
    ]


def perfom_analysis(dataset_file, word2vec_file, word_freq_file, output_file):
    """Run entropy-based and random sampling with a CNN model, log results.

    For every combination of a random state from ``get_set_random_states()``
    and a fold id in ``range(3)``, the data set is split, two identical
    training pools are prepared, and ``run_analysis`` is executed once with
    ``MyUncertaintySampling`` (method ``'entropy'``) and once with
    ``MyRandomSampling``, each with a newly built classifier.  The rows of
    both runs are appended to ``output_file`` after each combination.

    Args:
        dataset_file: passed to ``read_and_split_data2``.
        word2vec_file: passed to ``rlc.Preprocessor``.
        word_freq_file: passed to ``rlc.Preprocessor``.
        output_file: path of the result file; it is truncated at the start.

    Returns:
        None.
    """
    nfolds = 3
    name = [
        'experimen_no', 'algorithm', 'query_strategy', 'number_of_instances',
        'f1'
    ]
    # NOTE: this writes the Python repr of the list as the first line, not a
    # comma-separated header; kept unchanged for existing readers.
    with open(output_file, 'w') as f:
        f.write('{0}\n'.format(name))

    # The loop variable is called `state` (not `i`) so that no inner
    # comprehension index can shadow or overwrite the random state.
    for state, foldid in itertools.product(get_set_random_states(),
                                           range(nfolds)):
        data = read_and_split_data2(dataset_file, random_state=state,
                                    nfold=nfolds, foldid=foldid)

        # convert to libact dataset: each sample's single feature is the
        # position of its document in `docs` (train, then dev, then test).
        docs = data['train_x'] + data['dev_x'] + data['test_x']
        trn_len = len(data['train_x'])
        dev_len = len(data['dev_x'])
        X = np.arange(len(docs)).reshape(-1, 1)
        X_train = X[:trn_len]
        X_test = X[trn_len + dev_len:]
        y = data['train_y'] + data['dev_y'] + data['test_y']
        y_train = data['train_y']
        y_test = data['test_y']
        dev_set = (data['dev_x'], data['dev_y'])

        n_labeled = 50
        trn_ds, tst_ds, fully_labeled_trn_ds, n_labeled = prepare_data(
            X_train, y_train, X_test, y_test, n_labeled, random_state=3)
        trn_ds2 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query
        batchsize = 10

        # Comparing the uncertainty-based strategy with random sampling.
        # NOTE(review): banner and tag say least certainty / 'lc', but the
        # strategy is created with method='entropy' -- confirm intent.
        print('>>>>>>>>>>>Least Certainty ....')
        wrapper = _build_cnn_wrapper(word2vec_file, word_freq_file, docs, y,
                                     dev_set)
        qs = MyUncertaintySampling(trn_ds, method='entropy', model=wrapper,
                                   batch_size=batchsize)
        lc_result = run_analysis(trn_ds, tst_ds, lbr, wrapper, qs, quota)
        result = _tag_run_results(lc_result, state, foldid, 'lc')

        # random: uses its own pool copy and a newly built classifier
        print('--------->Random Sampling ....')
        wrapper = _build_cnn_wrapper(word2vec_file, word_freq_file, docs, y,
                                     dev_set)
        qs2 = MyRandomSampling(trn_ds2, batch_size=batchsize)
        rs_result = run_analysis(trn_ds2, tst_ds, lbr, wrapper, qs2, quota)
        result.extend(_tag_run_results(rs_result, state, foldid, 'rs'))

        # A variance-reduction strategy (MyVarianceReduction) was tried at
        # this point earlier and was noted as not working.

        # Append this combination's rows; no header line is repeated.
        result = pd.DataFrame(columns=name, data=result)
        with open(output_file, 'a') as f:
            result.to_csv(f, header=False)