def main():
    """Run the anytime-active-learning experiment configured by the module-level ``args``.

    The function performs, in order:

    1. Load the corpus named by ``args.train`` (``load_from_file``) and clean
       its HTML with ``experiment_utils.clean_html``.
    2. Unless ``args.fulloracle`` is set, call ``split_data`` on the train and
       test sets; the first part of each split is kept in ``expert_data`` and
       the second part becomes the student's pool / test set.
    3. Fit the TF-IDF vectorizer on the pool documents, train the oracle
       classifier on sentences and, when ``args.cheating`` is set, a separate
       sentence classifier.
    4. For each of ``args.trials`` trials: bootstrap with ground-truth labels,
       then let the student pick queries, ask the expert for labels, retrain
       and evaluate on the test set while the student has budget, the pool
       holds more than ``args.step_size`` documents and ``args.maxiter`` has
       not been exceeded.
    5. Print summary statistics and pass the collected results to
       ``experiment_utils.print_extrapolated_results`` and
       ``experiment_utils.oracle_accuracy``.

    Returns:
        None.

    Raises:
        Exception: if ``args.expert`` matches none of the known expert kinds,
            or if the training matrices and their label vectors end up with
            different lengths.
    """
    # Every print below uses the single-argument parenthesised form, which
    # produces the same output as the equivalent Python 2 print statement.
    print(args)
    print("")

    accuracies = defaultdict(list)
    ora_accu = defaultdict(list)
    oracle_accuracies = []
    ora_cm = defaultdict(list)
    lbl_dit = defaultdict(list)
    aucs = defaultdict(list)
    x_axis = defaultdict(list)

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10

    # The command-line value of fixk is deliberately discarded here.
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = experiment_utils.parse_parameters_mat(args.cost_model)
    print("Cost Parameters %s" % parameters)

    cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()

        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target

        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print("Training expert documents:%s" % len(expert_data.oracle.train.data))
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct,
                                                                   limit=args.limit)

        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # Full oracle: the expert is trained on the sentences of the whole training set.
        expert_data.data = data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct,
                                                                   limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    # Substring matching on purpose: args.expert only needs to contain the keyword.
    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf, cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        prompt = ", ".join(["{}={}".format(a, b) for a, b in enumerate(data.train.target_names)]) + "? > "
        expert = baseexpert.HumanExpert(prompt)
    else:
        raise Exception("We need an expert!")

    print("\nExpert: %s " % expert)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        # NOTE(review): expert_data.sentence is only assigned above when
        # args.fulloracle is false; combining --cheating with a full oracle
        # looks like it would fail here -- confirm.
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct,
                                                                   limit=args.limit)

        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

        sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    print("\nStudent Classifier: %s" % clf)
    print("\nSentence Classifier: %s" % sent_clf)
    print("\nExpert Oracle Classifier: %s" % exp_clf)
    print("\nPenalty Oracle: %s" % (exp_clf.C,))
    print("\nVectorizer: %s" % vct)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))
    print("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []  # per-trial [cost, accuracy] points
    tau = []  # per-trial [cost, auc] points

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print("*" * 60)
        print("Trial: %s" % t)

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'

        print("\nStudent: %s " % student)

        train_indices = []
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0

        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                chosen = student.pick_next(pool=pool, step_size=step_size)

                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y[0] for x, y in chosen]  # sentence of the document

                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:
                ## bootstrap uses ground truth and its cost is ignored
                labels = ground_truth
                spent = [0] * len(ground_truth)
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # A label of None means the expert gave no usable answer.
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if iteration != 0:
                neutral_answers = np.array(
                    [[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None])
            else:
                # query_size is still None during the bootstrap iteration.
                neutral_answers = np.array([])

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # train with all the words of the selected documents
                train_x = pool.data[train_indices]

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum()
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                len(train_indices), accu, auc, query_cost, current_cost, ground_truth,
                len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:
                # The bootstrap iteration neither spends budget nor counts
                # towards the oracle accuracy.
                student.budget -= query_cost
                oracle_answers += correct_labels

            x_axis_range = current_cost
            x_axis[x_axis_range].append(current_cost)

            ## save results
            accuracies[x_axis_range].append(accu)
            aucs[x_axis_range].append(auc)
            ora_accu[x_axis_range].append(1. * correct_labels)
            ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
            lbl_dit[x_axis_range].append(np.sum(train_y))

            # partial trial results
            trial_accu.append([x_axis_range, accu])
            trial_aucs.append([x_axis_range, auc])

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)

        # Labels obtained after the bootstrap. When the loop stopped right
        # after (or before) the bootstrap this is zero, so guard the division
        # instead of crashing at the end of the trial.
        queried_labels = len(train_indices) - bootstrap_size
        if queried_labels:
            trial_oracle_accuracy = 1. * oracle_answers / queried_labels
        else:
            trial_oracle_accuracy = 0.0
        oracle_accuracies.append(trial_oracle_accuracy)

        print("Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(
            t, oracle_answers, iteration, queried_labels, trial_oracle_accuracy))
    # end trial loop

    # Exact comparison: the previous substring test (`not in "uniform"`) also
    # treated values such as "" or "form" as the uniform cost function.
    if args.cost_function != "uniform":
        accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("\nAverage oracle accuracy:  %s" % (np.array(oracle_accuracies).mean(),))
    print("Elapsed time %.3f" % (time.time() - t0))

    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    output_name = "-".join([args.train, cheating, args.prefix, args.classifier, args.student])
    experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=output_name)
    experiment_utils.oracle_accuracy(ora_accu, file_name=output_name, cm=ora_cm, num_trials=args.trials)
def main():
    """Compare the oracle and sentence classifiers and plot a learning curve.

    Driven by the module-level ``args``. The function:

    1. Loads the corpus named by ``args.train``, cleans its HTML and uses
       ``split_data`` to separate expert data from the pool / test data.
    2. Fits the TF-IDF vectorizer on the pool documents and trains two
       L1-regularised logistic regressions on sentences: the oracle
       (``expert_data.oracle``) and the sentence classifier
       (``expert_data.sentence``).
    3. Prints both classifiers' predictions for the 500 test sentences on
       which the sentence classifier is most confident, plus its accuracy.
    4. Trains a fresh classifier on growing random subsets of the sentence
       training data and plots test accuracy against the requested subset
       size with matplotlib.

    Returns:
        None.
    """
    # Every print below uses the single-argument parenthesised form, which
    # produces the same output as the equivalent Python 2 print statement.
    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10

    # The command-line value of fixk is deliberately discarded here.
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)

    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### TESTING THE CLASSIFERS
    test_target, test_data = split_data_sentences(data.test, sent_detector)
    test_data_bow = vct.transform(test_data)

    pred_ora = exp_clf.predict(test_data_bow)
    y_probas = sent_clf.predict_proba(test_data_bow)
    pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]

    # Ascending by the sentence classifier's top class probability, so the
    # tail of `order` holds the most confident predictions.
    order = np.argsort(y_probas.max(axis=1))
    print("ORACLE\tSENTENCE\tMAX-SENT")
    for i in order[-500:]:
        print("%s %s %s %s" % (pred_ora[i], pred_sent[i], y_probas[i, 0], test_data[i]))

    print("Accuracy of Sentences Classifier %s" % (metrics.accuracy_score(test_target, pred_sent),))
    print("Class distribution: %s" % pred_sent.sum())
    print("Size of data: %s" % pred_sent.shape[0])

    #### LEARNING CURVE ON SENTENCES
    sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
    clf = linear_model.LogisticRegression(penalty='l1', C=1)

    # The shuffled indices select rows of the sentence *training* matrix, so
    # they must range over its rows. Permuting over len(test_data) could index
    # past the end of the training data or leave part of it unreachable.
    n_train_sentences = expert_data.sentence.train.bow.shape[0]
    bootstrap = rand.permutation(n_train_sentences)

    y = []
    for s in sizes:
        # Slicing caps the subset at the number of available sentences.
        indices = bootstrap[:s]
        train_x = expert_data.sentence.train.bow[indices]
        train_y = expert_data.sentence.train.target[indices]

        clf.fit(train_x, train_y)

        predictions = clf.predict(test_data_bow)
        scores = metrics.accuracy_score(test_target, predictions)
        print("Accuracy {0}: {1}".format(s, scores))
        y.append(scores)

    plt.clf()
    plt.title("Accuracy")
    plt.xlabel("Labels")
    plt.ylabel("Accuracy")
    plt.plot(sizes, y, '--bo', label="sent")
    # The legend is built from artists that already exist, so it has to be
    # requested after the labelled line has been plotted.
    plt.legend()
    plt.show()
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a corpus, split it, and train the oracle and sentence classifiers.

    Args:
        clf: estimator that is shallow-copied (``copy.copy``) once per expert
            classifier before fitting.
        train: forwarded to ``load_from_file`` as its first argument.
        cats: forwarded to ``load_from_file``.
        fixk: forwarded to ``load_from_file``.
        min_size: overwritten with 10 before use (see the note in the body).
        vct: vectorizer; it is fit on the pool documents and then used to
            transform the test documents and the expert sentences.
        raw: forwarded to ``load_from_file`` as ``raw=``.
        limit: forwarded to ``split_data_sentences`` as ``limit=``.

    Returns:
        A tuple ``(exp_clf, data, vct, sent_clf, expert_data)``: the fitted
        oracle classifier, the loaded data with pool/test documents and their
        bag-of-words matrices, the fitted vectorizer, the fitted sentence
        classifier, and the expert-side splits.

    Side effects:
        Sets ``args.fixk`` to ``None`` on the module-level ``args`` and prints
        progress information.
    """
    import copy

    # NOTE(review): the caller's min_size is replaced by 10 and args.fixk is
    # reset, while the fixk *parameter* is still forwarded unchanged -- confirm
    # whether this is intentional.
    min_size = 10
    args.fixk = None

    # The vectorizer returned by the loader is not used; `vct` is.
    data, _loader_vct = load_from_file(train, cats, fixk, min_size, vct, raw=raw)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## turn <br> into "." so it is recognized as an end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    n_train_docs = len(data.train.data)
    n_test_docs = len(data.test.data)
    print("Train:{}, Test:{}, {}".format(n_train_docs, n_test_docs, data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    pool_docs = train_test_data.pool.train
    data.train.data = pool_docs.data
    data.train.target = pool_docs.target

    test_docs = train_test_data.test.train
    data.test.data = test_docs.data
    data.test.target = test_docs.target

    ## convert documents to matrices
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    oracle_labels, oracle_sentences = split_data_sentences(expert_data.oracle.train, sent_detector, vct,
                                                           limit=limit)
    print(len(oracle_sentences))

    expert_data.oracle.train.data = oracle_sentences
    expert_data.oracle.train.target = np.array(oracle_labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)

    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sentence_labels, sentence_texts = split_data_sentences(expert_data.sentence.train, sent_detector, vct,
                                                           limit=limit)

    expert_data.sentence.train.data = sentence_texts
    expert_data.sentence.train.target = np.array(sentence_labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, sent_clf, expert_data