def test_libact_first_try_results_are_the_same(self):
    """Verify the first libact example reproduces the original github
    example's results.

    Two worker processes run the two variants concurrently and publish
    their score arrays into a shared dict; the arrays must be equal.
    NOTE: very long test.
    """
    # self.skipTest(reason="too long")
    cn.Inner_PredictionModel = SvmModel
    cn.Feature_Extractor = AvgGloveExtractor
    cn.load_codementor_sentiment_analysis_parameters()
    kb_helper.load_WordNet_model()
    quota = 5  # upper bound on how many samples the labeler is asked to tag

    # Build the splits and the synthesized candidate pool.
    base_training_df, validation_data_df = prepare_balanced_dataset()
    pos_sents = pandas_util.get_all_positive_sentences(
        base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags)
    generated_pool_df = sg.generate_sents_using_random_synthesis(
        pos_sents, base_training_df, cn.col_names)
    labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)
    enriched_train_df = pd.concat([base_training_df, generated_pool_df],
                                  ignore_index=True)
    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)

    # Feature extractor and ideal labeler shared by both runs.
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    run_specs = [
        (self.libact_first_try_first_run,
         (enriched_train_df, extractor, lbr, quota,
          validation_data_df, return_dict)),
        (self.libact_first_try_second_run,
         (enriched_train_df, extractor, ideal_df, lbr, quota,
          validation_data_df, return_dict)),
    ]
    workers = []
    for target_fun, target_args in run_specs:
        worker = multiprocessing.Process(target=target_fun, args=target_args)
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()
    # Both runs must have produced identical score sequences.
    self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
def compare_generated_pool_insertion_order(): experiment_name = 'gen_pool_insertion_order_balanced' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset() all_sents = list(balanced_train_df[cn.col_names.text]) tns = 50 pool_name, prep_pools = prepare_pools_template( "random synthesis", generate_sents_using_random_synthesis) # pool_name, prep_pools = local_search_gen_template("EGL-EGL") generated_pool_df, labeled_pool_df = prep_pools(all_sents, balanced_train_df, cn.col_names, tns + 50) cn.experiment_purpose += pool_name + " " trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df) final_scoring_fun = partial( lambda en_df: run_classifier(en_df, validation_data_df).acc) table_headers = ['#added examples'] data = [[0]] compared_heuristics = combined_heuristics_list[:] + [ #("test-data-gain", lambda: SynStateTestDataGain), ("random", lambda: "random") ] for (heuristic_name, prepare_usage) in compared_heuristics: init_score = final_scoring_fun(balanced_train_df) ss_type = prepare_usage() print heuristic_name # query_num, heur_scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr, ss_type, # labeled_pool_df, quota=tns) query_num, heur_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, ss_type, labeled_pool_df, quota=tns, batch_num=5) heur_scores[0] = init_score data[0] = query_num if len(data[0]) < len(query_num) else data[0] data.append(heur_scores) table_headers.append(heuristic_name) # assert_ends_and_beginnings_are_the_same(data[1:]) return experiment_name, table_headers, data, plot_compare_insertion_order
def effect_of_size_of_semantic_environment(): experiment_name = 'effect_of_size_of_semantic_environment' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset( print_expert_acc=False) all_sents = list(balanced_train_df[cn.col_names.text]) tns = 50 prepare_pools_funcs = list() prepare_pools_funcs.append( local_search_gen_template("uncertainty lc LogReg", 5)) prepare_pools_funcs.append(local_search_gen_template("random-score", 5)) prepare_pools_funcs.append(generate_pool_using_random_synthesis()) final_scoring_fun = partial( lambda en_df: run_classifier(en_df, validation_data_df).acc) insertion_order_heuristic = find_heuristic("random-score")() print cn.experiment_purpose table_headers = ['size of semantic environment'] data = [[]] for env_size in range(1, 5, 1) + range(5, 35, 5): data[0].append(env_size) for i, (pool_name, prepare_pool_fun) in enumerate(prepare_pools_funcs): if pool_name not in table_headers: table_headers.append(pool_name) data.append([]) assert len(data) == i + 2, "meaning i+1 is our index" print pool_name cn.distance_measure = env_size gen_pool_df, labeled_pool_df = prepare_pool_fun( all_sents, balanced_train_df, cn.col_names, tns) trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df, labeled_pool_df) query_num, pool_insr_scores = insert_in_AL_fashion( trn_ds, final_scoring_fun, lbr, insertion_order_heuristic, labeled_pool_df, quota=tns) data[i + 1].append(pool_insr_scores[-1]) return experiment_name, table_headers, data, plot_effect_of_size_of_semantic_environment
def effect_of_semantic_environment_on_label_switches(): experiment_name = 'effect_of_semantic_environment_on_label_switches' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset( print_expert_acc=False) all_sents = list(balanced_train_df[cn.col_names.text]) tns = 50 prepare_pools_funcs = list() prepare_pools_funcs.append( local_search_gen_template("uncertainty lc LogReg", 5)) prepare_pools_funcs.append(local_search_gen_template("random-score", 5)) prepare_pools_funcs.append(generate_pool_using_random_synthesis()) print cn.experiment_purpose table_headers = ['size of semantic environment'] data = [[]] def get_num_of_label_switches(labeled_pool_df): count = 0 orig_tags = dict( map(lambda (i, r): (r[cn.col_names.text], r[cn.col_names.tag]), balanced_train_df.iterrows())) for i, r in labeled_pool_df.iterrows(): if orig_tags[r[cn.col_names.prev_states][-1]] != r[ cn.col_names.tag]: count += 1 return count for env_size in [10]: data[0].append(env_size) data[0].append(env_size * 2) for i, (pool_name, prepare_pool_fun) in enumerate(prepare_pools_funcs): if pool_name not in table_headers: table_headers.append(pool_name) data.append([]) assert len(data) == i + 2, "meaning i+1 is our index" print pool_name cn.distance_measure = env_size gen_pool_df, labeled_pool_df = prepare_pool_fun( all_sents, balanced_train_df, cn.col_names, tns) data[i + 1].append(get_num_of_label_switches(labeled_pool_df)) data[i + 1].append(get_num_of_label_switches(labeled_pool_df)) return experiment_name, table_headers, data, plot_effect_of_semantic_environment_on_label_switches
def effect_of_num_of_operators(): experiment_name = 'effect_of_num_of_operators' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset( print_expert_acc=False) all_sents = list(balanced_train_df[cn.col_names.text]) tns = 50 final_scoring_fun = partial( lambda en_df: run_classifier(en_df, validation_data_df).acc) insertion_order_heuristic = find_heuristic("random-score")() print cn.experiment_purpose table_headers = ['num of operators'] data = [[1] + range(2, 12, 2)] for i, heur in enumerate(["uncertainty lc LogReg", "random-score"]): # "uncertainty lc LogReg", table_headers.append(heur) data.append(list()) prepare_pools_funcs = list() prepare_pools_funcs.append(local_search_gen_template(heur, 1)) for j in range(2, 12, 2): prepare_pools_funcs.append(local_search_gen_template(heur, j)) for pool_name, prepare_pool_fun in prepare_pools_funcs: gen_pool_df, labeled_pool_df = prepare_pool_fun( all_sents, balanced_train_df, cn.col_names, tns) trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df, labeled_pool_df) print pool_name query_num, pool_insr_scores = insert_in_batch_AL( trn_ds, final_scoring_fun, lbr, insertion_order_heuristic, labeled_pool_df, batch_num=len(labeled_pool_df)) data[1 + i].append(pool_insr_scores[-1]) return experiment_name, table_headers, data, plot_effect_of_num_of_operators
def compare_generation_methods_pools(): experiment_name = 'small_train_compare_generation_methods_pools' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset( print_expert_acc=False) all_sents = list(balanced_train_df[cn.col_names.text]) tns = 40 prepare_pools_funcs = list() # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg")) # # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2)) # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2)) # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10)) # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 0, use_enhanced=True)) # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True)) # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10, use_enhanced=True)) # prepare_pools_funcs.append(local_search_gen_template("random-score", 2)) # # prepare_pools_funcs.append(local_search_gen_template("random-score", 0)) # # # prepare_pools_funcs.append(curr_pool_gen_template("test-data-gain")) # prepare_pools_funcs.append(generate_pool_using_random_synthesis()) # prepare_pools_funcs.append(prepare_orig_examples_pools()) # prepare_pools_funcs.append(generate_pool_lecun_augmentation()) prepare_pools_funcs.append(generate_sents_using_lstm_generator()) final_scoring_fun = partial( lambda en_df: run_classifier(en_df, validation_data_df).acc) insertion_order_heuristic = find_heuristic("uncertainty lc LogReg")() # insertion_order_heuristic = find_heuristic("test-data-gain")() cn.add_experiment_param(insertion_order_heuristic.__name__) cn.experiment_purpose += "insertion order using " + insertion_order_heuristic.__name__ + " " print cn.experiment_purpose table_headers = ['#added examples'] data = [[0]] for pool_name, prepare_pool_fun in 
prepare_pools_funcs: init_score = final_scoring_fun(balanced_train_df) print pool_name gen_pool_df, labeled_pool_df = prepare_pool_fun( all_sents, balanced_train_df, cn.col_names, tns) trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df, labeled_pool_df) print pool_name query_num, pool_insr_scores = insert_in_AL_fashion( trn_ds, final_scoring_fun, lbr, insertion_order_heuristic, labeled_pool_df, quota=tns) # query_num, pool_insr_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, insertion_order_heuristic, # labeled_pool_df, quota=tns, batch_num=5) pool_insr_scores[0] = init_score data[0] = query_num if len(data[0]) < len(query_num) else data[0] table_headers.append(pool_name) data.append(pool_insr_scores) return experiment_name, table_headers, data, plot_compare_generation_methods
def compare_pool_generation_methods_proper_al(): experiment_name = 'compare_pool_generation_methods_proper_AL' print experiment_name # prepare the different splits of $data_df balanced_train_df, validation_data_df = prepare_balanced_dataset() all_sents = list(balanced_train_df[cn.col_names.text]) prepare_pools_funcs = list() # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg")) # prepare_pools_funcs.append(curr_pool_gen_template("random-score")) # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5)) # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True)) # prepare_pools_funcs.append(local_search_gen_template("random-score", 5)) # prepare_pools_funcs.append(generate_pool_using_random_synthesis()) # prepare_pools_funcs.append(prepare_orig_examples_pools()) # prepare_pools_funcs.append(generate_pool_lecun_augmentation()) prepare_pools_funcs.append(generate_sents_using_lstm_generator()) final_scoring_fun = partial( lambda en_df: run_classifier(en_df, validation_data_df).acc) cn.add_experiment_param("tns_" + str(total_new_sents)) cn.add_experiment_param("pool_size" + str(pool_size_each_step)) cn.add_experiment_param("batch_size" + str(examples_at_each_step)) print cn.experiment_purpose def do_one_AL_cycle(pool_gen_fun, curr_training_df): done_generating = False sent_pool = set(curr_training_df[cn.col_names.text]) gen_pool_df, labeled_pool_df = pool_gen_fun(list(sent_pool), curr_training_df, cn.col_names, pool_size_each_step) if len(gen_pool_df) > examples_at_each_step: selected_instance_idxs = select_from_pool_uncertainty( gen_pool_df, balanced_train_df, cn.col_names, sent_pool, examples_at_each_step) labeled_instances_df = labeled_pool_df.iloc[ selected_instance_idxs].copy(deep=True).reset_index(drop=True) else: labeled_instances_df = labeled_pool_df # all there is, close enough. 
if len(gen_pool_df) < examples_at_each_step: done_generating = True enriched_train_df = pd.concat([curr_training_df, labeled_instances_df], ignore_index=True) return enriched_train_df, final_scoring_fun( enriched_train_df), done_generating table_headers = ['#added examples'] data = [ range(0, total_new_sents + examples_at_each_step, examples_at_each_step) ] for pool_name, prepare_pool_fun in prepare_pools_funcs: start_time = time.time() print "starting {0} - {1}".format(pool_name, cn.data_name) curr_training_df = balanced_train_df.copy(deep=True) res_list = [final_scoring_fun(curr_training_df)] for i in range(0, total_new_sents, examples_at_each_step): # has to be serial print_progress(i, total=total_new_sents) sa = time.time() curr_training_df, curr_add_res, done = do_one_AL_cycle( prepare_pool_fun, curr_training_df) if done: break res_list.append(curr_add_res) print "AL cycle took {0:.2f} s".format(time.time() - sa) print "{0} run time: {1:.2f} minutes - {2}".format( pool_name, (time.time() - start_time) / 60.0, cn.data_name) table_headers.append(pool_name) data.append(res_list) return experiment_name, table_headers, data, plot_compare_pool_generation_methods_proper_al