Пример #1
0
def create_pointer_examples():
    """Run ``test_example`` on saved pointer models and write the outputs.

    Checkpoints are loaded from ``pointer/models/19_05_11b/`` in three
    groups: one pure BPEmb model, two pure GloVe models and two models saved
    under a GloVe + BPEmb name. Every loaded manager is passed to
    ``test_example``; the collected outputs and one label per model are then
    handed to ``write_results``.
    """
    outputs = []
    labels = []

    def load_manager(emb_man, checkpoint):
        # Every group uses the same "basic" pointer configuration.
        mgr = PointerManager(emb_man,
                             "basic",
                             learning_rate=START_LR,
                             lr_factor=LR_DECAY,
                             lr_patience=LR_PATIENCE,
                             cuda_device=CUDA_DEVICE)
        mgr.load_model(checkpoint)
        return mgr

    def read_all_sets():
        # The vocabulary is built over train, dev and both test sets.
        corpus = DataSet("blah")
        corpus.read_multiple(train_sets + dev_set + itac_test + conll_test)
        return corpus

    # Group 1: pure BPEmb.
    vocab_size, bp_dim = 100000, 200
    bp_man = BPEmbeddings(bp_vocab_size=vocab_size,
                          dim=bp_dim,
                          case_sensitive=False)
    bp_man.build_vocabulary([read_all_sets()])
    mgr = load_manager(
        bp_man,
        "pointer/models/19_05_11b/bpemb_{}_{}.pt".format(vocab_size, bp_dim))
    outputs.append(test_example(mgr))
    labels.append("bpemb_{}_{}".format(vocab_size, bp_dim))

    # Group 2: pure GloVe.
    for glove_dim in (50, 300):
        g_man = GloveEmbeddings(
            path="embeddings/glove/glove.6B.{}d.txt".format(glove_dim),
            dim=glove_dim)
        mgr = load_manager(
            g_man, "pointer/models/19_05_11b/glove_{}.pt".format(glove_dim))
        outputs.append(test_example(mgr))
        labels.append("glove_{}".format(glove_dim))

    # Group 3: GloVe + BPEmb.
    for glove_dim, bp_dim in ((200, 50), (300, 25)):
        g_man = GloveEmbeddings(
            path="embeddings/glove/glove.6B.{}d.txt".format(glove_dim),
            dim=glove_dim)
        b_man = BPEmbeddings(dim=bp_dim, bp_vocab_size=100000)
        c_man = CombinedEmbeddings([g_man, b_man])
        c_man.build_vocabulary([read_all_sets()])

        # NOTE(review): the manager is built on g_man, not on the combined
        # c_man whose vocabulary was just built. This looks unintended, but
        # switching to c_man may not match the saved checkpoints -- confirm
        # how these models were trained before changing it.
        mgr = load_manager(
            g_man,
            "pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(
                glove_dim, bp_dim))
        outputs.append(test_example(mgr))
        labels.append("glove_d{}_bp_d{}_vs100000".format(glove_dim, bp_dim))

    write_results("results/19_05_11b/pointer_examples.txt",
                  results=outputs,
                  names=labels)
Пример #2
0
def grid_search():
    """Grid-search learning rate and teacher forcing for the basic pointer.

    For every (learning rate, teacher forcing) pair a fresh manager on 50d
    GloVe embeddings is trained for 5 epochs, tested on the dev set, trained
    for another 5 epochs and tested again. Each training phase is given its
    own ``save_path`` below ``models/19_04_11_grid_search/``.
    """
    base_path = "models/19_04_11_grid_search/"
    train_sets = [
        "../data/standardized/conll_train.txt",
        "../data/standardized/conll_test.txt",
        "../data/standardized/itac_dev.txt"
    ]
    dev_set = ["../data/standardized/conll_valid.txt"]

    learning_rates = [0.1, 0.05, 0.01, 0.005, 0.001]
    forcing_ratios = [0.0, 0.2, 0.5, 0.8, 1]
    # The same manager is trained twice for 5 epochs each; the suffix names
    # the cumulative epoch count of the checkpoint.
    phase_suffixes = ("_5epoch.pt", "_10epoch.pt")

    for lr in learning_rates:
        for tf in forcing_ratios:
            embeddings = GloveEmbeddings("../embeddings/glove.6B.50d.txt",
                                         dim=50)
            manager = PointerManager(embeddings, "basic", learning_rate=lr)
            # lr and tf are concatenated without a separator,
            # e.g. lr=0.1, tf=0.5 gives "...0.10.5_5epoch.pt".
            stem = base_path + str(lr) + str(tf)
            for suffix in phase_suffixes:
                manager.train_model(train_sets,
                                    n_epochs=5,
                                    print_interval=5000,
                                    teacher_forcing=tf,
                                    save_path=stem + suffix)
                manager.test_model(dev_set)
Пример #3
0
def pointer_with_pure_glove():
    """Train and evaluate the basic pointer model for each GloVe embedding.

    For every ``(path, dim)`` entry of ``glove_embeddings`` a manager is
    trained with ``train_model_dynamic``, the saved checkpoint is loaded into
    a second manager, and that manager is tested on the ITAC and CoNLL test
    sets. One row per embedding (dimension, epochs, wall-clock training time
    and the test scores) is written to
    ``results/19_05_11b/ptr_pure_glove_dim_search.csv``.
    """
    # Column name -> values; insertion order is the CSV column order.
    table = {
        "glove dim": [],
        "epochs_trained": [],
        "train_time": [],
        "precision_itac": [],
        "recall_itac": [],
        "f1_itac": [],
        "precision_conll": [],
        "recall_conll": [],
        "f1_conll": [],
    }

    def new_manager(emb_man):
        return PointerManager(emb_man,
                              "basic",
                              learning_rate=START_LR,
                              lr_factor=LR_DECAY,
                              lr_patience=LR_PATIENCE,
                              cuda_device=CUDA_DEVICE)

    for path, d in glove_embeddings:
        checkpoint = "pointer/models/19_05_11b/glove_{}.pt".format(d)
        g_man = GloveEmbeddings(path=path, dim=d)

        trainer = new_manager(g_man)
        started = time.time()
        trainer.train_model_dynamic(train_sets=train_sets,
                                    dev_sets=dev_set,
                                    max_tries=EPOCH_PATIENCE,
                                    print_interval=10000,
                                    save_path=checkpoint,
                                    teacher_forcing=TEACHER_FORCING)
        elapsed = time.time() - started

        # Scores come from a second manager loaded from the saved
        # checkpoint, not from the manager that was just trained.
        evaluator = new_manager(g_man)
        evaluator.load_model(checkpoint)
        epochs = evaluator.model.cur_epoch
        itac_r, itac_p, itac_f = evaluator.test_model(itac_test)
        conll_r, conll_p, conll_f = evaluator.test_model(conll_test)

        table["glove dim"].append(d)
        table["epochs_trained"].append(epochs)
        table["train_time"].append(elapsed)
        table["precision_itac"].append(itac_p)
        table["recall_itac"].append(itac_r)
        table["f1_itac"].append(itac_f)
        table["precision_conll"].append(conll_p)
        table["recall_conll"].append(conll_r)
        table["f1_conll"].append(conll_f)

    pd.DataFrame(table).to_csv(
        "results/19_05_11b/ptr_pure_glove_dim_search.csv")
Пример #4
0
def test_depth_n_trees():
    """Search random-forest depth and tree count on 50d GloVe embeddings.

    For every ``max_depth`` in 5..19 and every ``n_estimators`` in
    (10, 20, 50, 100) a ``BasicClassifier`` wrapping a
    ``RandomForestClassifier`` is trained on ``train_sets`` and tested on the
    ITAC and CoNLL test sets. Training time and scores are written to
    ``results/random_forest/n_trees_depth_search2.csv``.
    """
    # Column name -> values; insertion order is the CSV column order.
    table = {
        "n_estimators": [],
        "max_depth": [],
        "train_time": [],
        "precision_itac": [],
        "recall_itac": [],
        "f1_itac": [],
        "precision_conll": [],
        "recall_conll": [],
        "f1_conll": [],
    }

    for depth in range(5, 20):
        for n_trees in (10, 20, 50, 100):
            print("\nTraining {} trees with max_depth {}".format(
                n_trees, depth))
            g_man = GloveEmbeddings(path="embeddings/glove/glove.6B.50d.txt",
                                    dim=50)
            forest = RandomForestClassifier(n_estimators=n_trees,
                                            max_depth=depth)
            clf = BasicClassifier(model=forest,
                                  emb_man=g_man,
                                  wiki_file=wiki_file)

            started = time.time()
            clf.train_model(data_files=train_sets)
            elapsed = time.time() - started

            itac_r, itac_p, itac_f = clf.test_model(itac_test)
            conll_r, conll_p, conll_f = clf.test_model(conll_test)

            table["n_estimators"].append(n_trees)
            table["max_depth"].append(depth)
            table["train_time"].append(elapsed)
            table["precision_itac"].append(itac_p)
            table["recall_itac"].append(itac_r)
            table["f1_itac"].append(itac_f)
            table["precision_conll"].append(conll_p)
            table["recall_conll"].append(conll_r)
            table["f1_conll"].append(conll_f)

    pd.DataFrame(table).to_csv(
        "results/random_forest/n_trees_depth_search2.csv")
Пример #5
0
    def _embed_sentence(self, sent):
        s_list = []
        for w in sent:
            s_list.append(self.embedding_man.get_embedding_vec(w))
        return s_list


if __name__ == "__main__":
    # Load the three standardized test corpora, one after another.
    loaded = []
    for corpus_file in ("../data/standardized/conll_test.txt",
                        "../data/standardized/itac_test.txt",
                        "../data/standardized/rsics_test.txt"):
        corpus = DataSet(corpus_file)
        corpus.read_data()
        loaded.append(corpus)
    ds1, ds2, ds3 = loaded

    # 50d GloVe reductions for all three corpora.
    g_man = GloveEmbeddings("glove/glove.6B.50d.txt", 50)
    embedder = SimpleDataEmbedder(g_man, create_sent_emb=False)
    for corpus, target in ((ds1, "dim_reductions/glove50_conll.csv"),
                           (ds2, "dim_reductions/glove50_itac.csv"),
                           (ds3, "dim_reductions/glove50_rsics.csv")):
        embedder.reduce_dataset(corpus, path=target)

    # BPEmb (dim 100, vocabulary size 50000) reductions for all three corpora.
    b_man = BPEmbeddings(dim=100, bp_vocab_size=50000)
    embedder = SimpleDataEmbedder(b_man, create_sent_emb=False)
    for corpus, target in (
            (ds1, "dim_reductions/bp-d100-vs50000_conll.csv"),
            (ds2, "dim_reductions/bp-d100-vs50000_itac.csv"),
            (ds3, "dim_reductions/bp-d100-vs50000_rsics.csv")):
        embedder.reduce_dataset(corpus, path=target)

    # Combined GloVe + BPEmb reductions; only CoNLL and ITAC are reduced here.
    c_man = CombinedEmbeddings([g_man, b_man])
    embedder = SimpleDataEmbedder(c_man, create_sent_emb=False)
    for corpus, target in ((ds1, "dim_reductions/bp-glove_conll.csv"),
                           (ds2, "dim_reductions/bp-glove_itac.csv")):
        embedder.reduce_dataset(corpus, path=target)
Пример #6
0
def pointer_test_n_layers():
    """Train and evaluate the basic pointer model with 1 to 4 layers.

    The encoder and decoder always get the same layer count and the
    embeddings are 50d GloVe. For every layer count a manager is trained with
    ``train_model_dynamic``, the saved checkpoint is loaded into a second
    manager, and that manager is tested on the ITAC and CoNLL test sets.
    Results are written to ``results/19_06_06/ptr_glove_n_layers.csv``.
    """
    # Column name -> values; insertion order is the CSV column order.
    table = {
        "glove_dim": [],
        "num_layers": [],
        "epochs_trained": [],
        "train_time": [],
        "precision_itac": [],
        "recall_itac": [],
        "f1_itac": [],
        "precision_conll": [],
        "recall_conll": [],
        "f1_conll": [],
    }

    def new_manager(emb_man, layers):
        return PointerManager(emb_man,
                              "basic",
                              learning_rate=START_LR,
                              lr_factor=LR_DECAY,
                              lr_patience=LR_PATIENCE,
                              n_encoder_layers=layers,
                              n_decoder_layers=layers,
                              cuda_device=CUDA_DEVICE)

    d = 50
    for n_layers in range(1, 5):
        checkpoint = "pointer/models/19_06_06/glove_d{}_{}layers.pt".format(
            d, n_layers)
        g_man = GloveEmbeddings(
            path="embeddings/glove/glove.6B.{}d.txt".format(d), dim=d)

        trainer = new_manager(g_man, n_layers)
        started = time.time()
        trainer.train_model_dynamic(train_sets=train_sets,
                                    dev_sets=dev_set,
                                    max_tries=EPOCH_PATIENCE,
                                    print_interval=10000,
                                    save_path=checkpoint,
                                    teacher_forcing=TEACHER_FORCING)
        elapsed = time.time() - started

        # Scores come from a second manager loaded from the saved
        # checkpoint, not from the manager that was just trained.
        evaluator = new_manager(g_man, n_layers)
        evaluator.load_model(checkpoint)
        epochs = evaluator.model.cur_epoch
        itac_r, itac_p, itac_f = evaluator.test_model(itac_test)
        conll_r, conll_p, conll_f = evaluator.test_model(conll_test)

        table["glove_dim"].append(d)
        table["num_layers"].append(n_layers)
        table["epochs_trained"].append(epochs)
        table["train_time"].append(elapsed)
        table["precision_itac"].append(itac_p)
        table["recall_itac"].append(itac_r)
        table["f1_itac"].append(itac_f)
        table["precision_conll"].append(conll_p)
        table["recall_conll"].append(conll_r)
        table["f1_conll"].append(conll_f)

    pd.DataFrame(table).to_csv("results/19_06_06/ptr_glove_n_layers.csv")
Пример #7
0
    #                         lr_factor=0.5, lr_patience=3)
    #manager.train_model(["../data/standardized/conll_valid.txt"], print_interval=1000, n_epochs=20, teacher_forcing=0.5,
    #                    save_path="models/test_scheduling.pt", dev_sets=["../data/standardized/conll_valid.txt"])
    # manager2 = PointerManager(GloveEmbeddings("../embeddings/glove.6B.50d.txt", 50), "basic", learning_rate=0.01,
    #                          lr_factor=0.5, lr_patience=3)
    # manager2.load_model("models/test_dynamic.pt")
    # manager2.test_model(["../data/standardized/conll_test.txt"])

    # Hyper-parameters for the pointer-model run below.
    # START_LR, LR_PATIENCE and LR_DECAY are passed to PointerManager as
    # learning_rate, lr_patience and lr_factor respectively.
    START_LR = 0.01
    LR_PATIENCE = 3
    LR_DECAY = 0.5
    # EPOCH_PATIENCE and TEACHER_FORCING are only referenced by the
    # commented-out train_model_dynamic call below; MAX_EPOCHS is not
    # referenced anywhere in the visible part of this block.
    EPOCH_PATIENCE = 6
    MAX_EPOCHS = 2
    TEACHER_FORCING = 0.5

    # 50-dimensional GloVe embeddings shared by the managers below.
    g_man = GloveEmbeddings(path="../embeddings/glove/glove.6B.50d.txt",
                            dim=50)

    # Disabled training run (kept for reference): trains a 2-layer model on
    # the CoNLL validation file and saves it to models/test.pt.
    # manager = PointerManager(g_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY,
    #                          lr_patience=LR_PATIENCE, n_encoder_layers=2, n_decoder_layers=2)
    # manager.train_model_dynamic(train_sets=["../data/standardized/conll_valid.txt"],
    #                             dev_sets=["../data/standardized/conll_valid.txt"], max_tries=EPOCH_PATIENCE,
    #                             print_interval=10000,
    #                             save_path="models/test.pt",
    #                             teacher_forcing=TEACHER_FORCING)
    # Second manager with the same 2-layer encoder/decoder configuration as
    # the disabled run above. No cuda_device is passed here.
    manager2 = PointerManager(g_man,
                              "basic",
                              learning_rate=START_LR,
                              lr_factor=LR_DECAY,
                              lr_patience=LR_PATIENCE,
                              n_encoder_layers=2,
                              n_decoder_layers=2)
Пример #8
0
def binary_classification_bpemb_glove():  # Fix Vocabulary Size to 50000
    """Evaluate eight scikit-learn models on combined GloVe + BPEmb embeddings.

    For every GloVe embedding in ``glove_embeddings`` and every BPEmb
    dimension in ``bpemb_dims`` (BPEmb vocabulary size fixed at 50000), each
    model is wrapped in a ``BasicClassifier`` on the combined embeddings,
    trained on ``train_sets`` and tested on the ITAC and CoNLL test sets.
    Training time and scores are written to
    ``results/19_05_11b/bc_bpemb_glove_search.csv``.
    """
    # Column name -> values; insertion order is the CSV column order.
    table = {
        "clf_type": [],
        "glove_dim": [],
        "bpemb dim": [],
        "train_time": [],
        "precision_itac": [],
        "recall_itac": [],
        "f1_itac": [],
        "precision_conll": [],
        "recall_conll": [],
        "f1_conll": [],
    }

    for path, d in glove_embeddings:
        for b_dim in bpemb_dims:
            # Fresh, unfitted estimators for every embedding combination,
            # paired with the label reported in the CSV.
            named_models = [
                ("AdaBoostClassifier", AdaBoostClassifier()),
                ("LogisticRegression",
                 LogisticRegression(class_weight="balanced")),
                ("SGDClassifier", SGDClassifier(class_weight="balanced")),
                ("BayesianGaussianMixture", BayesianGaussianMixture()),
                ("GaussianNB", GaussianNB()),
                ("LinearSVC", LinearSVC(class_weight="balanced")),
                ("RandomForest",
                 RandomForestClassifier(class_weight="balanced")),
                ("GradientBoosting", GradientBoostingClassifier()),
            ]

            g_man = GloveEmbeddings(path=path, dim=d)
            b_man = BPEmbeddings(dim=b_dim, bp_vocab_size=50000)
            c_man = CombinedEmbeddings([g_man, b_man])

            for model_name, model in named_models:
                clf = BasicClassifier(model=model,
                                      emb_man=c_man,
                                      wiki_file=wiki_file)

                started = time.time()
                clf.train_model(data_files=train_sets)
                elapsed = time.time() - started

                itac_r, itac_p, itac_f = clf.test_model(itac_test)
                conll_r, conll_p, conll_f = clf.test_model(conll_test)

                table["clf_type"].append(model_name)
                table["glove_dim"].append(d)
                table["bpemb dim"].append(b_dim)
                table["train_time"].append(elapsed)
                table["precision_itac"].append(itac_p)
                table["recall_itac"].append(itac_r)
                table["f1_itac"].append(itac_f)
                table["precision_conll"].append(conll_p)
                table["recall_conll"].append(conll_r)
                table["f1_conll"].append(conll_f)

    pd.DataFrame(table).to_csv("results/19_05_11b/bc_bpemb_glove_search.csv")
Пример #9
0
def pointer_bpemb_glove():  # Fix Vocabulary Size to 100000
    """Train and evaluate pointer models per GloVe / BPEmb dimension pair.

    For every GloVe embedding in ``glove_embeddings`` and every BPEmb
    dimension in ``bpemb_dims`` (BPEmb vocabulary size fixed at 100000) a
    combined embedding vocabulary is built. Pairs listed as already trained
    skip training; for all pairs the saved checkpoint is loaded and tested on
    the ITAC and CoNLL test sets. Training time and epoch count are read from
    the loaded model. The results CSV is rewritten after each GloVe embedding
    has been processed.
    """
    # Column name -> values; insertion order is the CSV column order.
    table = {
        "glove dim": [],
        "bpemb dim": [],
        "epochs_trained": [],
        "train_time": [],
        "precision_itac": [],
        "recall_itac": [],
        "f1_itac": [],
        "precision_conll": [],
        "recall_conll": [],
        "f1_conll": [],
    }

    # (glove dim, bpemb dim) pairs whose checkpoints already exist.
    already_trained = {
        (100, 100), (100, 200), (100, 25), (100, 300), (100, 50),
        (50, 100), (50, 200), (50, 25), (50, 300), (50, 50),
        (200, 25), (200, 50),
    }

    def new_manager(emb_man):
        return PointerManager(emb_man,
                              "basic",
                              learning_rate=START_LR,
                              lr_factor=LR_DECAY,
                              lr_patience=LR_PATIENCE,
                              cuda_device=CUDA_DEVICE)

    for path, d in glove_embeddings:
        for b_dim in bpemb_dims:
            checkpoint = "pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(
                d, b_dim)

            g_man = GloveEmbeddings(path=path, dim=d)
            b_man = BPEmbeddings(dim=b_dim, bp_vocab_size=100000)
            c_man = CombinedEmbeddings([g_man, b_man])

            corpus = DataSet("blah")
            corpus.read_multiple(train_sets + dev_set + itac_test +
                                 conll_test)
            c_man.build_vocabulary([corpus])

            # NOTE(review): both managers below are built on g_man, not on
            # the combined c_man whose vocabulary was just built. This looks
            # unintended, but switching to c_man may not match checkpoints
            # that were already saved -- confirm before changing it.
            if (d, b_dim) not in already_trained:
                trainer = new_manager(g_man)
                trainer.train_model_dynamic(train_sets=train_sets,
                                            dev_sets=dev_set,
                                            max_tries=EPOCH_PATIENCE,
                                            print_interval=10000,
                                            save_path=checkpoint,
                                            teacher_forcing=TEACHER_FORCING)

            evaluator = new_manager(g_man)
            evaluator.load_model(checkpoint)
            # Time and epochs are read from the loaded model, so they are
            # also available for pairs that skipped training in this run.
            train_time = evaluator.model.train_time
            epochs = evaluator.model.cur_epoch
            itac_r, itac_p, itac_f = evaluator.test_model(itac_test)
            conll_r, conll_p, conll_f = evaluator.test_model(conll_test)

            table["glove dim"].append(d)
            table["bpemb dim"].append(b_dim)
            table["epochs_trained"].append(epochs)
            table["train_time"].append(train_time)
            table["precision_itac"].append(itac_p)
            table["recall_itac"].append(itac_r)
            table["f1_itac"].append(itac_f)
            table["precision_conll"].append(conll_p)
            table["recall_conll"].append(conll_r)
            table["f1_conll"].append(conll_f)

        # Written inside the outer loop: the CSV is overwritten with all
        # rows collected so far after every GloVe embedding.
        pd.DataFrame(table).to_csv(
            "results/19_05_11b/ptr_bpemb_glove_search.csv")