def create_pointer_examples():
    """Load the 19_05_11b pointer checkpoints and write example outputs.

    Evaluates three model families (pure BPEmb, pure GloVe, GloVe+BPEmb)
    via test_example() and writes everything through write_results().
    Relies on module-level train/dev/test set lists and hyper-parameters.
    """
    results = []
    result_names = []

    # --- pure BPEmb ---
    vs = 100000  # byte-pair vocabulary size
    d = 200      # embedding dimension
    bp_man = BPEmbeddings(bp_vocab_size=vs, dim=d, case_sensitive=False)
    ds = DataSet("blah")
    ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
    bp_man.build_vocabulary([ds])
    manager = PointerManager(bp_man, "basic", learning_rate=START_LR,
                             lr_factor=LR_DECAY, lr_patience=LR_PATIENCE,
                             cuda_device=CUDA_DEVICE)
    manager.load_model("pointer/models/19_05_11b/bpemb_{}_{}.pt".format(vs, d))
    results.append(test_example(manager))
    result_names.append("bpemb_{}_{}".format(vs, d))

    # --- pure GloVe ---
    for d in [50, 300]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(d)
        g_man = GloveEmbeddings(path=path, dim=d)
        manager = PointerManager(g_man, "basic", learning_rate=START_LR,
                                 lr_factor=LR_DECAY, lr_patience=LR_PATIENCE,
                                 cuda_device=CUDA_DEVICE)
        manager.load_model("pointer/models/19_05_11b/glove_{}.pt".format(d))
        results.append(test_example(manager))
        result_names.append("glove_{}".format(d))

    # --- GloVe + BPEmb combined ---
    for g_d, b_d in [(200, 50), (300, 25)]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(g_d)
        g_man = GloveEmbeddings(path=path, dim=g_d)
        b_man = BPEmbeddings(dim=b_d, bp_vocab_size=100000)
        c_man = CombinedEmbeddings([g_man, b_man])
        ds = DataSet("blah")
        ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
        c_man.build_vocabulary([ds])
        # BUG FIX: c_man was built (and its vocabulary populated) but the
        # manager was previously constructed with g_man alone, so the BPEmb
        # half of the combined model was silently ignored. Use c_man, which
        # matches the "glove_d{}_bp_d{}" checkpoint naming.
        manager = PointerManager(c_man, "basic", learning_rate=START_LR,
                                 lr_factor=LR_DECAY, lr_patience=LR_PATIENCE,
                                 cuda_device=CUDA_DEVICE)
        manager.load_model(
            "pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(g_d, b_d))
        results.append(test_example(manager))
        result_names.append("glove_d{}_bp_d{}_vs100000".format(g_d, b_d))

    write_results("results/19_05_11b/pointer_examples.txt",
                  results=results, names=result_names)
def grid_search():
    """Grid-search learning rate and teacher forcing for the pointer model.

    Every (lr, tf) pair is trained for 5 epochs, evaluated on the dev set,
    trained for 5 more epochs, and evaluated again. Checkpoints are written
    under base_path with the configuration encoded in the filename.
    """
    base_path = "models/19_04_11_grid_search/"
    train_sets = [
        "../data/standardized/conll_train.txt",
        "../data/standardized/conll_test.txt",
        "../data/standardized/itac_dev.txt"
    ]
    dev_set = ["../data/standardized/conll_valid.txt"]
    learning_rates = [0.1, 0.05, 0.01, 0.005, 0.001]
    forcing_ratios = [0.0, 0.2, 0.5, 0.8, 1]
    for lr in learning_rates:
        for tf in forcing_ratios:
            embeddings = GloveEmbeddings("../embeddings/glove.6B.50d.txt",
                                         dim=50)
            manager = PointerManager(embeddings, "basic", learning_rate=lr)
            # First 5 epochs, then a dev-set evaluation.
            manager.train_model(train_sets, n_epochs=5, print_interval=5000,
                                teacher_forcing=tf,
                                save_path=base_path + str(lr) + str(tf) +
                                "_5epoch.pt")
            manager.test_model(dev_set)
            # Continue the same manager to 10 epochs total and re-evaluate.
            manager.train_model(train_sets, n_epochs=5, print_interval=5000,
                                teacher_forcing=tf,
                                save_path=base_path + str(lr) + str(tf) +
                                "_10epoch.pt")
            manager.test_model(dev_set)
def pointer_with_pure_glove():
    """Train and score pointer models over every pure-GloVe embedding size.

    For each (path, dim) in the module-level glove_embeddings list a model
    is trained dynamically, reloaded from its checkpoint, and evaluated on
    the ITAC and CoNLL test sets. All metrics go to one CSV.
    """
    columns = ["glove dim", "epochs_trained", "train_time",
               "precision_itac", "recall_itac", "f1_itac",
               "precision_conll", "recall_conll", "f1_conll"]
    metrics = {name: [] for name in columns}
    for path, dim in glove_embeddings:
        metrics["glove dim"].append(dim)
        emb = GloveEmbeddings(path=path, dim=dim)
        checkpoint = "pointer/models/19_05_11b/glove_{}.pt".format(dim)
        trainer = PointerManager(emb, "basic", learning_rate=START_LR,
                                 lr_factor=LR_DECAY, lr_patience=LR_PATIENCE,
                                 cuda_device=CUDA_DEVICE)
        started = time.time()
        trainer.train_model_dynamic(
            train_sets=train_sets, dev_sets=dev_set,
            max_tries=EPOCH_PATIENCE, print_interval=10000,
            save_path=checkpoint, teacher_forcing=TEACHER_FORCING)
        metrics["train_time"].append(time.time() - started)
        # Evaluate with a fresh manager so the best saved weights are used.
        evaluator = PointerManager(emb, "basic", learning_rate=START_LR,
                                   lr_factor=LR_DECAY,
                                   lr_patience=LR_PATIENCE,
                                   cuda_device=CUDA_DEVICE)
        evaluator.load_model(checkpoint)
        metrics["epochs_trained"].append(evaluator.model.cur_epoch)
        for tag, data in (("itac", itac_test), ("conll", conll_test)):
            r, p, f = evaluator.test_model(data)
            metrics["precision_" + tag].append(p)
            metrics["recall_" + tag].append(r)
            metrics["f1_" + tag].append(f)
    result_df = pd.DataFrame()
    for name in columns:
        result_df[name] = metrics[name]
    result_df.to_csv("results/19_05_11b/ptr_pure_glove_dim_search.csv")
def test_depth_n_trees():
    """Sweep RandomForest max_depth and n_estimators for the classifier.

    Every (depth, n_trees) pair is trained on 50-d GloVe features and
    evaluated on the ITAC and CoNLL test sets; the full grid is written
    to a CSV.
    """
    columns = ["n_estimators", "max_depth", "train_time",
               "precision_itac", "recall_itac", "f1_itac",
               "precision_conll", "recall_conll", "f1_conll"]
    records = {name: [] for name in columns}
    depth_choices = range(5, 20)  # earlier sweep used [5, 10, 20, 50, None]
    tree_choices = [10, 20, 50, 100]
    for depth in depth_choices:
        for n_trees in tree_choices:
            records["max_depth"].append(depth)
            records["n_estimators"].append(n_trees)
            print("\nTraining {} trees with max_depth {}".format(n_trees,
                                                                 depth))
            emb = GloveEmbeddings(path="embeddings/glove/glove.6B.50d.txt",
                                  dim=50)
            forest = RandomForestClassifier(n_estimators=n_trees,
                                            max_depth=depth)
            clf = BasicClassifier(model=forest, emb_man=emb,
                                  wiki_file=wiki_file)
            started = time.time()
            clf.train_model(data_files=train_sets)
            records["train_time"].append(time.time() - started)
            for tag, data in (("itac", itac_test), ("conll", conll_test)):
                r, p, f = clf.test_model(data)
                records["precision_" + tag].append(p)
                records["recall_" + tag].append(r)
                records["f1_" + tag].append(f)
    result_df = pd.DataFrame()
    for name in columns:
        result_df[name] = records[name]
    result_df.to_csv("results/random_forest/n_trees_depth_search2.csv")
def _embed_sentence(self, sent):
    """Return the embedding vectors for the tokens of *sent*.

    Looks each token up through self.embedding_man; one vector per
    token, preserving the original token order.
    """
    # Idiom: list comprehension instead of the manual append loop.
    return [self.embedding_man.get_embedding_vec(w) for w in sent]


if __name__ == "__main__":
    # Load three standardized test sets and write reduced embedding
    # representations for each of them under several embedding spaces.
    ds1 = DataSet("../data/standardized/conll_test.txt")
    ds1.read_data()
    ds2 = DataSet("../data/standardized/itac_test.txt")
    ds2.read_data()
    ds3 = DataSet("../data/standardized/rsics_test.txt")
    ds3.read_data()

    # Pure GloVe (50-d).
    g_man = GloveEmbeddings("glove/glove.6B.50d.txt", 50)
    embedder = SimpleDataEmbedder(g_man, create_sent_emb=False)
    embedder.reduce_dataset(ds1, path="dim_reductions/glove50_conll.csv")
    embedder.reduce_dataset(ds2, path="dim_reductions/glove50_itac.csv")
    embedder.reduce_dataset(ds3, path="dim_reductions/glove50_rsics.csv")

    # Pure BPEmb (100-d vectors, 50k byte-pair vocabulary).
    b_man = BPEmbeddings(dim=100, bp_vocab_size=50000)
    embedder = SimpleDataEmbedder(b_man, create_sent_emb=False)
    embedder.reduce_dataset(ds1,
                            path="dim_reductions/bp-d100-vs50000_conll.csv")
    embedder.reduce_dataset(ds2,
                            path="dim_reductions/bp-d100-vs50000_itac.csv")
    embedder.reduce_dataset(ds3,
                            path="dim_reductions/bp-d100-vs50000_rsics.csv")

    # GloVe + BPEmb combined. NOTE(review): no rsics output here —
    # presumably intentional, but verify against the plotting code.
    c_man = CombinedEmbeddings([g_man, b_man])
    embedder = SimpleDataEmbedder(c_man, create_sent_emb=False)
    embedder.reduce_dataset(ds1, path="dim_reductions/bp-glove_conll.csv")
    embedder.reduce_dataset(ds2, path="dim_reductions/bp-glove_itac.csv")
def pointer_test_n_layers():
    """Sweep encoder/decoder depth (1-4 layers) for the 50-d GloVe pointer.

    Each depth is trained dynamically, reloaded from its checkpoint, and
    scored on the ITAC and CoNLL test sets; metrics are written to a CSV.
    """
    columns = ["glove_dim", "num_layers", "epochs_trained", "train_time",
               "precision_itac", "recall_itac", "f1_itac",
               "precision_conll", "recall_conll", "f1_conll"]
    stats = {name: [] for name in columns}
    dim = 50
    glove_path = "embeddings/glove/glove.6B.{}d.txt".format(dim)
    for depth in range(1, 5):
        stats["num_layers"].append(depth)
        stats["glove_dim"].append(dim)
        emb = GloveEmbeddings(path=glove_path, dim=dim)
        checkpoint = "pointer/models/19_06_06/glove_d{}_{}layers.pt".format(
            dim, depth)
        trainer = PointerManager(emb, "basic", learning_rate=START_LR,
                                 lr_factor=LR_DECAY, lr_patience=LR_PATIENCE,
                                 n_encoder_layers=depth,
                                 n_decoder_layers=depth,
                                 cuda_device=CUDA_DEVICE)
        started = time.time()
        trainer.train_model_dynamic(
            train_sets=train_sets, dev_sets=dev_set,
            max_tries=EPOCH_PATIENCE, print_interval=10000,
            save_path=checkpoint, teacher_forcing=TEACHER_FORCING)
        stats["train_time"].append(time.time() - started)
        # Fresh manager so evaluation uses the best saved weights.
        evaluator = PointerManager(emb, "basic", learning_rate=START_LR,
                                   lr_factor=LR_DECAY,
                                   lr_patience=LR_PATIENCE,
                                   n_encoder_layers=depth,
                                   n_decoder_layers=depth,
                                   cuda_device=CUDA_DEVICE)
        evaluator.load_model(checkpoint)
        stats["epochs_trained"].append(evaluator.model.cur_epoch)
        for tag, data in (("itac", itac_test), ("conll", conll_test)):
            r, p, f = evaluator.test_model(data)
            stats["precision_" + tag].append(p)
            stats["recall_" + tag].append(r)
            stats["f1_" + tag].append(f)
    result_df = pd.DataFrame()
    for name in columns:
        result_df[name] = stats[name]
    result_df.to_csv("results/19_06_06/ptr_glove_n_layers.csv")
# lr_factor=0.5, lr_patience=3) #manager.train_model(["../data/standardized/conll_valid.txt"], print_interval=1000, n_epochs=20, teacher_forcing=0.5, # save_path="models/test_scheduling.pt", dev_sets=["../data/standardized/conll_valid.txt"]) # manager2 = PointerManager(GloveEmbeddings("../embeddings/glove.6B.50d.txt", 50), "basic", learning_rate=0.01, # lr_factor=0.5, lr_patience=3) # manager2.load_model("models/test_dynamic.pt") # manager2.test_model(["../data/standardized/conll_test.txt"]) START_LR = 0.01 LR_PATIENCE = 3 LR_DECAY = 0.5 EPOCH_PATIENCE = 6 MAX_EPOCHS = 2 TEACHER_FORCING = 0.5 g_man = GloveEmbeddings(path="../embeddings/glove/glove.6B.50d.txt", dim=50) # manager = PointerManager(g_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY, # lr_patience=LR_PATIENCE, n_encoder_layers=2, n_decoder_layers=2) # manager.train_model_dynamic(train_sets=["../data/standardized/conll_valid.txt"], # dev_sets=["../data/standardized/conll_valid.txt"], max_tries=EPOCH_PATIENCE, # print_interval=10000, # save_path="models/test.pt", # teacher_forcing=TEACHER_FORCING) manager2 = PointerManager(g_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY, lr_patience=LR_PATIENCE, n_encoder_layers=2, n_decoder_layers=2)
def binary_classification_bpemb_glove():
    """Compare eight classifiers over all GloVe x BPEmb embedding combos.

    The BPEmb vocabulary size is fixed at 50000. For every combination a
    fresh set of estimator instances is trained on the combined embedding
    and scored on the ITAC and CoNLL test sets; everything is collected
    into one CSV.
    """
    clf_names = [
        "AdaBoostClassifier", "LogisticRegression", "SGDClassifier",
        "BayesianGaussianMixture", "GaussianNB", "LinearSVC",
        "RandomForest", "GradientBoosting"
    ]
    columns = ["clf_type", "glove_dim", "bpemb dim", "train_time",
               "precision_itac", "recall_itac", "f1_itac",
               "precision_conll", "recall_conll", "f1_conll"]
    table = {name: [] for name in columns}
    for path, g_dim in glove_embeddings:
        for b_dim in bpemb_dims:
            # Fresh estimator instances for every embedding combination.
            classifiers = [
                AdaBoostClassifier(),
                LogisticRegression(class_weight="balanced"),
                SGDClassifier(class_weight="balanced"),
                BayesianGaussianMixture(),
                GaussianNB(),
                LinearSVC(class_weight="balanced"),
                RandomForestClassifier(class_weight="balanced"),
                GradientBoostingClassifier()
            ]
            glove_man = GloveEmbeddings(path=path, dim=g_dim)
            bp_man = BPEmbeddings(dim=b_dim, bp_vocab_size=50000)
            combined = CombinedEmbeddings([glove_man, bp_man])
            for model, model_name in zip(classifiers, clf_names):
                table["glove_dim"].append(g_dim)
                table["bpemb dim"].append(b_dim)
                table["clf_type"].append(model_name)
                clf = BasicClassifier(model=model, emb_man=combined,
                                      wiki_file=wiki_file)
                started = time.time()
                clf.train_model(data_files=train_sets)
                table["train_time"].append(time.time() - started)
                for tag, data in (("itac", itac_test),
                                  ("conll", conll_test)):
                    r, p, f = clf.test_model(data)
                    table["precision_" + tag].append(p)
                    table["recall_" + tag].append(r)
                    table["f1_" + tag].append(f)
    result_df = pd.DataFrame()
    for name in columns:
        result_df[name] = table[name]
    result_df.to_csv("results/19_05_11b/bc_bpemb_glove_search.csv")
def pointer_bpemb_glove():
    """Grid-search GloVe x BPEmb embedding sizes for the pointer model.

    The BPEmb vocabulary size is fixed at 100000. Combinations already in
    done_list skip training but are still evaluated from their saved
    checkpoints so every result column stays the same length for the
    final DataFrame.
    """
    epoch_list = []
    g_dim_list = []
    b_dim_list = []
    time_list = []
    itac_p = []
    itac_r = []
    itac_f = []
    conll_r = []
    conll_p = []
    conll_f = []
    # (glove_dim, bpemb_dim) pairs that finished in an earlier run.
    done_list = [(100, 100), (100, 200), (100, 25), (100, 300), (100, 50),
                 (50, 100), (50, 200), (50, 25), (50, 300), (50, 50),
                 (200, 25), (200, 50)]
    for path, d in glove_embeddings:
        for b_dim in bpemb_dims:
            g_dim_list.append(d)
            b_dim_list.append(b_dim)
            g_man = GloveEmbeddings(path=path, dim=d)
            b_man = BPEmbeddings(dim=b_dim, bp_vocab_size=100000)
            c_man = CombinedEmbeddings([g_man, b_man])
            ds = DataSet("blah")
            ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
            c_man.build_vocabulary([ds])
            save_path = "pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(
                d, b_dim)
            if (d, b_dim) not in done_list:
                # BUG FIX: the combined embeddings (c_man) were built but the
                # manager was constructed with g_man alone, so the BPEmb half
                # was never used. Train on the combined manager instead.
                manager = PointerManager(c_man, "basic",
                                         learning_rate=START_LR,
                                         lr_factor=LR_DECAY,
                                         lr_patience=LR_PATIENCE,
                                         cuda_device=CUDA_DEVICE)
                # st = time.time()
                manager.train_model_dynamic(
                    train_sets=train_sets, dev_sets=dev_set,
                    max_tries=EPOCH_PATIENCE, print_interval=10000,
                    save_path=save_path,
                    teacher_forcing=TEACHER_FORCING)
                # time_list.append(time.time() - st)
            # Evaluation runs for every combination, trained now or earlier.
            # NOTE(review): checkpoints from the buggy earlier run were
            # trained with g_man only — re-train them if loading fails.
            manager2 = PointerManager(c_man, "basic", learning_rate=START_LR,
                                      lr_factor=LR_DECAY,
                                      lr_patience=LR_PATIENCE,
                                      cuda_device=CUDA_DEVICE)
            manager2.load_model(save_path)
            time_list.append(manager2.model.train_time)
            epoch_list.append(manager2.model.cur_epoch)
            r, p, f = manager2.test_model(itac_test)
            itac_f.append(f)
            itac_p.append(p)
            itac_r.append(r)
            r, p, f = manager2.test_model(conll_test)
            conll_f.append(f)
            conll_p.append(p)
            conll_r.append(r)
    result_df = pd.DataFrame()
    result_df["glove dim"] = g_dim_list
    result_df["bpemb dim"] = b_dim_list
    result_df["epochs_trained"] = epoch_list
    result_df["train_time"] = time_list
    result_df["precision_itac"] = itac_p
    result_df["recall_itac"] = itac_r
    result_df["f1_itac"] = itac_f
    result_df["precision_conll"] = conll_p
    result_df["recall_conll"] = conll_r
    result_df["f1_conll"] = conll_f
    result_df.to_csv("results/19_05_11b/ptr_bpemb_glove_search.csv")