def main():
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)
    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    # load the cached dataset if available; otherwise build it and cache it
    try:
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
    except IOError:
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))
    print("Cheating experiment - use full uncertainty, query k words")
    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []
        print "*" * 60
        print "Trial: %s" % t

        student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student

        train_indices = []
        train_x = []
        train_y = []

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # all words, for training
        pool.fixk = data.train.bowk.tocsr()  # first-k-words BOW, for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # first k words of each document
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the unlabeled pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## bootstrap: sample uniformly from each class
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words
            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]
            ground_truth = pool.target[query_index]

            if iteration == 0:  ## the bootstrap uses the ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ## add the newly acquired data to the training set;
            ## if a label is not useful (neutral), ignore it and do not charge for it
            useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  ## train with all the words
                # update the labels with the expert labels
                train_y.extend(useful_answers[:, 1])
                ### accumulate the cost of the query (useful answers only)
                query_cost = np.sum(useful_answers[:, 2])
                current_cost += query_cost

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove the queried instances from the pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save the results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(
                len(train_indices), accu, auc, query_cost, current_cost, spent))

            ## the results are indexed by the cumulative cost of labeling
            if iteration > 0:  # the bootstrap iteration does not consume budget
                student.budget -= query_cost
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save the results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                ## partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    # end of trial loop

    accuracies = extrapolate_trials(tac)
    aucs = extrapolate_trials(tau)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
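# ---------------------------------------------------------------------------
# `StemTokenizer` is passed as the `tokenizer=` callable to every vectorizer in
# these scripts but is defined elsewhere in the repository. The sketch below is
# an assumption, not the repository's implementation: a minimal compatible
# callable built on NLTK's PorterStemmer and the same \b\w+\b token pattern.

import re
from nltk.stem.porter import PorterStemmer

class StemTokenizer(object):
    """Callable tokenizer: split on word boundaries, then Porter-stem each token."""

    def __init__(self):
        self.stemmer = PorterStemmer()        # assumed stemmer choice
        self.tokens = re.compile(r'\b\w+\b')  # mirrors token_pattern above

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in self.tokens.findall(doc)]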
def main():
    print args
    print

    accuracies = defaultdict(lambda: [])
    ora_accu = defaultdict(lambda: [])
    oracle_accuracies = []
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = experiment_utils.parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> tags with "." so they are recognized as sentence boundaries
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create the splits of the data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()
        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target
        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert the documents to matrices
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print "Training expert documents:%s" % len(expert_data.oracle.train.data)
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct,
                                                                   limit=args.limit)
        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        expert_data.data = data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf, cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a, b)
                                                   for a, b in enumerate(data.train.target_names)]) + "? > ")
    else:
        raise Exception("We need an expert!")

    print "\nExpert: %s " % expert

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct,
                                                                   limit=args.limit)
        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    print "\nStudent Classifier: %s" % clf
    print "\nSentence Classifier: %s" % sent_clf
    print "\nExpert Oracle Classifier: %s" % exp_clf
    print "\nPenalty Oracle:", exp_clf.C
    print "\nVectorizer: %s" % vct

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))
    print("Anytime active learning experiment - use an objective function to pick the data")
    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []
        print "*" * 60
        print "Trial: %s" % t

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'
        print "\nStudent: %s " % student

        train_indices = []
        neutral_data = []  # the x_ik vectors of the neutral queries
        train_x = []
        train_y = []
        neu_x = []  # neutral data used to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # all words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the unlabeled pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        calibrated = args.calibrate
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []
            if not bootstrapped:
                ## bootstrap: sample uniformly from each class
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                chosen = student.pick_next(pool=pool, step_size=step_size)
                query_index = [x for x, y in chosen]  # document ids of the chosen instances
                query = [y[0] for x, y in chosen]     # the chosen sentence of each document
                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## the bootstrap uses the ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add the newly acquired data to the training set
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  # train with all the words
                # update the labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove the queried instances from the pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save the results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum()
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}"
                  "\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                len(train_indices), accu, auc, query_cost, current_cost, ground_truth, len(neutral_answers),
                neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels))

            ## the results are indexed by the cumulative cost of labeling
            if iteration > 0:  # the bootstrap iteration does not consume budget
                student.budget -= query_cost

                # oracle accuracy (on the queries)
                oracle_answers += correct_labels

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save the results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        oracle_accuracies.append(1. * oracle_answers / (len(train_indices) - bootstrap_size))
        print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(
            t, oracle_answers, iteration, len(train_indices) - bootstrap_size,
            1. * oracle_answers / (len(train_indices) - bootstrap_size))
    # end of trial loop

    if args.cost_function not in "uniform":
        accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean()
    print("Elapsed time %.3f" % (time.time() - t0))

    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    output_name = args.train + "-" + cheating + "-" + args.prefix + "-" + args.classifier + "-" + args.student
    experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=output_name)
    experiment_utils.oracle_accuracy(ora_accu, file_name=output_name, cm=ora_cm, num_trials=args.trials)
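# ---------------------------------------------------------------------------
# `split_data` above comes from the repository's utilities. From how its results
# are consumed (each half is read back through `.train.data` / `.train.target`),
# a plausible reading is a shuffled 50/50 split with each half wrapped in a
# Bunch. This is a hypothetical sketch for illustration, not the actual helper.

import numpy as np
from sklearn.datasets.base import Bunch  # assumed import; matches the Bunch used above

def split_data_sketch(dataset, rnd=2356):
    """Split a Bunch with .data and .target into two halves, each exposed as half.train."""
    indices = np.arange(len(dataset.data))
    np.random.RandomState(rnd).shuffle(indices)
    mid = len(indices) // 2

    def wrap(idx):
        half = Bunch()
        half.train = Bunch()
        half.train.data = [dataset.data[i] for i in idx]
        half.train.target = dataset.target[idx]
        return half

    return wrap(indices[:mid]), wrap(indices[mid:])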
def main():
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(50, args.fixk)

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size,
                         fix_k=args.fixk)  # brings the data as-is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 NEWS GROUPS ###########
        data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size, fix_k=args.fixk)
    elif "dummy" in args.train:
        ########## DUMMY DATA (for testing purposes) ###########
        data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True, rnd=2356, vct=vct,
                          min_size=0, fix_k=args.fixk)
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)
    print "Accuracy Parameters %s" % accu_parameters

    #### CLASSIFIER
    # informed priors (per-feature alpha from the training feature frequencies)
    # were considered; a uniform prior is used instead
    alpha = 1
    clf = MultinomialNB(alpha=alpha)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL
    if "fixed" in args.expert:
        # average accuracy value of the experts
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined expert; options: [fixed|true|log|direct]")

    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200
    # width of a plot bucket; unused while the x axis is the raw cumulative cost
    eval_range = 1 if (args.budget / evaluation_points) <= 0 else args.budget / evaluation_points

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))

    t0 = time.time()
    ### experiment starts
    for t in range(args.trials):
        print "*" * 60
        print "Trial: %s" % t

        # TODO shuffle the data??
        student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student

        train_indices = []
        train_x = []
        train_y = []

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # all words, for training
        pool.fixk = data.train.bowk.tocsr()  # first-k-words BOW, for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # first k words of each document
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the unlabeled pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## bootstrap: sample uniformly from each class
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words
            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]
            ground_truth = pool.target[query_index]

            if iteration == 0:  ## the bootstrap uses the ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query_size, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            train_indices.extend(query_index)

            # remove the queried instances from the pool
            pool.remaining.difference_update(query_index)

            # add the query to the training set
            train_x = pool.data[train_indices]  ## train with all the words
            # update the labels with the expert labels
            train_y.extend(labels)

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save the results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(
                len(train_indices), accu, auc, query_cost, current_cost, spent))

            ## the results are indexed by the cumulative cost of labeling
            if iteration > 0:  # the bootstrap iteration does not consume budget
                student.budget -= query_cost
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save the results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)

            iteration += 1

    print("Elapsed time %.3f" % (time.time() - t0))
    print_results(x_axis, accuracies, aucs)
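# ---------------------------------------------------------------------------
# The cost models returned by `set_cost_model` expose a `cost_function` that the
# experts apply to each query size in `estimate_instances`. A minimal sketch of
# one such model, assuming a linear cost-per-word fit; the repository's actual
# parametric forms (and the output of `parse_parameters_mat`) may differ.

class LinearCostModelSketch(object):
    """cost(k) = intercept + slope * k for a query of k words (hypothetical form)."""

    def __init__(self, intercept, slope):
        self.intercept = intercept
        self.slope = slope

    def cost_function(self, num_words):
        return self.intercept + self.slope * num_words

# Example: LinearCostModelSketch(2.0, 0.1).cost_function(25) -> 2.0 + 0.1 * 25 = 4.5,
# so a 25-word query would be charged 4.5 cost units against the student's budget.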
def main():
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    # load the cached dataset and vectorizer if available; otherwise build and cache them
    try:
        print "Loading existing file... %s " % args.train
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "rb")
        vct = pickle.load(vectorizer)
        vectorizer.close()
    except (IOError, ValueError):
        print "Loading from scratch..."
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "wb")
        pickle.dump(vct, vectorizer)
        vectorizer.close()

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))
    print("Anytime active learning experiment - use an objective function to pick the data")
    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []
        print "*" * 60
        print "Trial: %s" % t

        if args.student in "anyunc":
            student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t,
                                                    vcn=vct, subpool=250, cost_model=cost_model)
        elif args.student in "lambda":
            student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t,
                                                        vcn=vct, subpool=250, cost_model=cost_model,
                                                        lambda_value=args.lambda_value)
        elif args.student in "anyzero":
            student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget,
                                                               seed=t, vcn=vct, subpool=250, cost_model=cost_model)
        else:
            raise ValueError("Oops! We do not know that anytime strategy. Try again.")

        print "\nStudent: %s " % student

        train_indices = []
        neutral_text = []  # raw text of the neutral queries
        neutral_data = []  # the x_ik vectors of the neutral queries
        train_x = []
        train_y = []
        neu_x = []  # neutral data used to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # all words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the unlabeled pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []
            if not bootstrapped:
                ## bootstrap: sample uniformly from each class
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                ## pick_next returns (index, k) pairs; util holds (utility, k, uncertainty) triples
                query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
                query_index = [a for a, b in query_chosen]
                query_size = [b for a, b in query_chosen]
                ## build each query from the first k words of the chosen document
                qk = []
                for q, k in query_chosen:
                    qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)]))
                query = vct.transform(qk)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## the bootstrap uses the ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add the newly acquired data to the training set
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  # train with all the words
                # update the labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            if neutral_answers.shape[0] != 0:
                # stack the current query's vectors and record their neutral labels
                qlbl = []
                for xik, lbl in zip(query, labels):
                    if isinstance(neutral_data, list):
                        neutral_data = xik
                    else:
                        neutral_data = vstack([neutral_data, xik], format='csr')
                    qlbl.append(neutral_label(lbl))
                ## append the labels of the current query
                neu_y = np.append(neu_y, qlbl)
                neu_x = neutral_data

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove the queried instances from the pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save the results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format(
                len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent),
                len(neutral_answers), neu_y.shape[0]))

            ## the results are indexed by the cumulative cost of labeling
            if iteration > 0:  # the bootstrap iteration does not consume budget
                student.budget -= query_cost
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save the results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    # end of trial loop

    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
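# ---------------------------------------------------------------------------
# `neutral_label` above maps an expert answer onto a target for the neutrality
# data: answers the expert declined to give (None) are treated as the neutral
# class. The one-liner below is an assumed behavior, sketched for illustration.

def neutral_label_sketch(label):
    """Return 1 for a neutral (None) expert answer, 0 for a real label."""
    return 1 if label is None else 0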
def main():
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(10, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    #### CLASSIFIER
    clf = set_classifier(args.classifier)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL
    if "fixed" in args.expert:
        # average accuracy value of the experts
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    elif "neutral" in args.expert:
        exp_clf = LogisticRegression(penalty='l1', C=1)
        exp_clf.fit(data.test.bow, data.test.target)
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined expert; options: [fixed|true|log|direct|neutral]")

    # note: this rebinds exp_clf for the inspection below; the NeutralityExpert
    # above keeps its own reference to the (C=1) classifier it was built with
    exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)

    print "\nExpert: %s " % expert

    coef = exp_clf.coef_[0]
    # print_features(coef, vct.get_feature_names())

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))

    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []
        print "*" * 60
        print "Trial: %s" % t

        if args.student in "unc":
            student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget,
                                                        seed=t, subpool=250)
        else:
            student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student

        train_indices = []
        train_x = []
        train_y = []

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # all words, for training
        if args.fixk is None:
            pool.fixk = data.train.bow.tocsr()
        else:
            pool.fixk = data.train.bowk.tocsr()  # first-k-words BOW, for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the unlabeled pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## bootstrap: sample uniformly from each class
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.data[query_index]
            query_size = [1] * query.shape[0]
            ground_truth = pool.target[query_index]

            if iteration == 0:  ## the bootstrap uses the ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # remove the queried instances from the pool
            pool.remaining.difference_update(query_index)

            ## add the newly acquired data to the training set
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  ## train with all the words
                # update the labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save the results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(
                len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent)))

            ## the results are indexed by the cumulative cost of labeling
            if iteration > 0:  # the bootstrap iteration does not consume budget
                student.budget -= query_cost
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save the results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    # end of trial loop

    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
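# ---------------------------------------------------------------------------
# All five variants collect per-trial [cost, metric] curves (tac/tau) and call an
# `extrapolate_trials` helper so that trials whose queries incur different costs
# can be averaged on a shared cost axis. Below is a simplified, hypothetical
# sketch of that idea using linear interpolation on a fixed grid; the
# repository's version also takes `cost_25` and `step_size` parameters.

import numpy as np
from collections import defaultdict

def extrapolate_trials_sketch(trials, step=10.0):
    """trials: one list of [cost, value] pairs per trial -> {grid_cost: [values]}."""
    results = defaultdict(list)
    max_cost = min(trial[-1][0] for trial in trials if trial)  # stop where the shortest trial ends
    grid = np.arange(step, max_cost, step)
    for trial in trials:
        costs = [c for c, v in trial]
        values = [v for c, v in trial]
        for g, v in zip(grid, np.interp(grid, costs, values)):
            results[g].append(v)
    return results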