def blending():
    # saved = True if args.saved != 0 else False
    saved = args.saved
    test_file1 = ROOT_DIR + "/data/test.txt"
    test_file2 = ROOT_DIR + "/data/test_predict_aspect_ensemble.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    # print(test_aspects)

    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))

    f_dict = ROOT_DIR + "/dataset/polarity.json"
    polarity_list, polarity_dict = parse_json(f_dict)
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict2)

    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
    test_data = Data3((test_texts, None, test_aspects),
                      word2index,
                      polarity_dict,
                      args,
                      target_dict=attr_dict)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)

    x_test = []
    for dir, checkpoints_per_model in zip(paths, models_files):
        print(dir, checkpoints_per_model)
        if saved == 1:
            oof_test = load_oof_test(dir)
        else:
            clfs = checkpoints_per_model
            oof_test = get_oof_test(clfs, test_data)
        x_test.append(oof_test)
    x_test = np.stack(x_test, axis=1)
    print(x_test)
    print(x_test.shape)
    test_predict = np.mean(x_test, axis=1)
    fw = codecs.open(ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for j, prob in enumerate(test_predict):
        polarity = np.argmax(prob) - 1
        fw.write(test_aspects[j] + ',' + str(polarity) + '\n')
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    fw.close()
    shutil.copy2(
        ROOT_DIR + "/data/test_predict_polarity_ensemble.txt", ROOT_DIR +
        "/data/backup/test_predict_polarity_ensemble_%s.txt" % time_stamp)
def test():
    # model = Classifier()
    test_file1 = ROOT_DIR + "/attribute_level/data/attribute_test.txt"
    test_file2 = ROOT_DIR + "/attribute_level/test_predict.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    W, word2index = load_w2v(f_w2v)

    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)

    assert len(test_texts) == len(test_aspects)

    files = ["checkpoint_HEAT_0.7189.pt", "checkpoint_HEAT_0.7062.pt"]

    predicts = []
    for check_point in files:
        predict = []
        classifier = torch.load(check_point)
        for text, aspect in zip(test_texts, test_aspects):
            if aspect != '':
                if aspect is None:
                    print("error")
                test_data = Data3(([text], [None], [aspect]),
                                  word2index,
                                  polarity_dict,
                                  args,
                                  target_dict=attr_dict)
                test_predict = train_single.predict(classifier, test_data,
                                                    args)
                assert len(test_predict) == 1
                polarity = str(test_predict[0].item() - 1)
            else:
                print(aspect)
                print(text)
                polarity = '0'
            # fw.write(aspect+','+polarity+'\n')
            predict.append(aspect + ',' + polarity)
        predicts.append(predict)
    print(len(predicts))
    print(len(predicts[0]))
    fw = codecs.open("test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')

    for j in range(len(predicts[0])):
        # Majority vote across models: Counter(votes).most_common(1) returns a list
        # of (item, count) pairs, so [0][0] recovers the winning "aspect,polarity" string.
        votes = [predicts[i][j] for i in range(len(predicts))]
        voted = Counter(votes).most_common(1)[0][0]
        fw.write(voted + '\n')
    fw.close()
def test():
    model = AttributeClassifier()
    check_point = "checkpoint_AttA3_0.8810.pt"
    model.load_model(check_point)

    test_file = "data/attribute_test.txt"
    test_texts = load_test_data(test_file)
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    W, word2index = load_w2v(f_w2v)

    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)

    test_data = Data((test_texts, None), word2index)

    test_predict = train.predict(model.classifier, test_data, args)
    print(test_predict)

    fw = codecs.open("test_predict.txt", 'w', encoding='utf-8')
    for p in test_predict:
        attributes = []
        for i, l in enumerate(p):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')
    fw.close()
def ensemble():
    f_train = ROOT_DIR + "/data/train.txt"
    if args.w2v == "merge":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    print(f_train)
    print(f_w2v)
    if not os.path.exists("%s" % args.check_dir):
        os.mkdir("%s" % args.check_dir)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)
    kf = 0
    fo = load_abp_raw(f_train)
    for train_index, test_index in kfold_split(len(fo), args.folds):
        kf += 1
        print("FOLD:", kf)
        # print("TRAIN:", train_index, '\n', "TEST:", test_index, str(len(test_index)))
        train_texts, train_labels, train_aspects, test_texts, test_labels, test_aspects = splits(
            fo, train_index, test_index)
        print(len(train_texts))
        print(len(test_texts))
        # print(list(attr_dict.keys()))
        model = Classifier()
        print(attr_list)
        print(attr_dict)
        # exit(-1)
        # print(train_texts)
        model.train_from_data((train_texts, train_labels, train_aspects),
                              (test_texts, test_labels, test_aspects), W,
                              word2index, polarity_dict, attr_dict, args, kf)
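
# Hypothetical sketch of kfold_split (the project's real helper may differ): the
# callers above and below only need an indexable sequence of
# (train_index, test_index) pairs over range(n), which sklearn's KFold provides.
from sklearn.model_selection import KFold

def kfold_split_sketch(n, folds, seed=42):
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    return list(kf.split(np.arange(n)))
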
def main():
    f_train = ROOT_DIR + "/data/train.txt"
    f_test = "data/test_p.txt"
    if args.w2v == "merge":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    print(f_w2v)
    # train_texts, train_labels = load_attr_data(filename=f_train)
    # # test_text, test_labels = load_attr_data(filename=f_test)
    # train_texts, train_labels, test_texts, test_labels = split_dev(train_texts, train_labels)
    train_texts, train_labels, train_aspects, test_texts, test_labels, test_aspects = load_abp_data(
        f_train, folds=5)
    if not os.path.exists("%s" % args.check_dir):
        os.mkdir("%s" % args.check_dir)
    print(len(train_texts))
    print(len(test_texts))
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)
    # print(list(attr_dict.keys()))
    model = Classifier()
    print(polarity_list)
    print(polarity_dict)
    # exit(-1)
    # print(train_texts)
    model.train_from_data((train_texts, train_labels, train_aspects),
                          (test_texts, test_labels, test_aspects), W,
                          word2index, polarity_dict, attr_dict, args)
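
# Note: main() above trains a single Classifier on one train/dev split
# (load_abp_data with folds=5 presumably holds out one fold as the dev set),
# whereas the ensemble() variants train one checkpoint per fold so that
# blending()/stacking() can later combine the saved models.
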
def ensemble():
    f_train = "../data/train.txt"
    # f_test = "data/test_attr2.txt"
    if args.w2v == "merge":
        f_w2v = "../embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = "../embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = "../embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict = "../dataset/attribute.json"
    print(f_train)
    print(f_w2v)
    if not os.path.exists("%s" % args.check_dir):
        os.mkdir("%s" % args.check_dir)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    attr_list, attr_dict = parse_json(f_dict)
    kf = 0
    for train_index, test_index in kfold_split(len(raw_texts), args.folds):
        kf += 1
        print("FOLD:", kf)
        print("TRAIN:", str(len(train_index)), '\n', "TEST:",
              str(len(test_index)))
        # train_index, test_index = train_index.tolist(), test_index.tolist()
        test_texts, test_labels = [raw_texts[i] for i in test_index
                                   ], [raw_labels[i] for i in test_index]
        train_texts, train_labels = [raw_texts[i] for i in train_index
                                     ], [raw_labels[i] for i in train_index]
        print(len(train_texts))
        print(len(test_labels))
        model = AttributeClassifier()
        print(attr_list)
        print(attr_dict)
        # exit(-1)
        # print(train_texts)
        model.train_from_data((train_texts, train_labels),
                              (test_texts, test_labels), W, word2index,
                              attr_dict, args, kf)
def main():
    f_train = "../data/train.txt"
    # f_test = "data/test_attr2.txt"
    if args.w2v == "merge":
        f_w2v = "../embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext":
        f_w2v = "../embedding/embedding_all_fasttext_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = "../embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = "../embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    train_texts, train_labels = load_attr_data(filename=f_train)
    train_texts, train_labels, test_texts, test_labels = split_dev(
        train_texts, train_labels)
    print(len(train_texts))
    print(len(test_labels))
    # train_texts2, train_labels2, test_texts, test_labels = split_dev(train_texts, train_labels)
    if not os.path.exists("%s" % args.check_dir):
        os.mkdir("%s" % args.check_dir)
    # test_texts, test_labels = load_attr_data(filename=f_test)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    attr_list, attr_dict = parse_json(f_dict)
    print(list(attr_dict.keys()))
    model = AttributeClassifier()
    print(attr_list)
    print(attr_dict)
    # exit(-1)
    # print(train_texts)
    model.train_from_data((train_texts, train_labels),
                          (test_texts, test_labels), W, word2index, attr_dict,
                          args)
def dev():
    model = AttributeClassifier()
    check_point = "checkpoints5/checkpoint_AttA3_0.8666.pt"
    model.load_model(check_point)

    f_train = "data/attribute_data.txt"
    # f_test = "data/test_attr2.txt"
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    W, word2index = load_w2v(f_w2v)
    attr_list, attr_dict = parse_json(f_dict)

    _, test_index = kfold_split(len(raw_texts), args.folds)[2]
    test_texts, test_labels = [raw_texts[i] for i in test_index
                               ], [raw_labels[i] for i in test_index]
    test_data = Data((test_texts, test_labels), word2index, attr_dict, args)

    test_predict = train.predict(model.classifier, test_data, args)
    pred_acc_t = score(test_predict, test_data.labels)
    print(pred_acc_t)
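
# Hypothetical sketch of the score helper used in dev(); the project's real metric
# may differ (e.g. macro-F1). This version reports exact-match accuracy over the
# multi-hot attribute label vectors.
def score_sketch(predicts, labels):
    predicts = np.asarray(predicts)
    labels = np.asarray(labels)
    return float((predicts == labels).all(axis=1).mean())
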
def stacking():
    saved = True if args.saved != 0 else False
    f_train = "../data/train.txt"
    test_file = "../data/test.txt"
    test_texts = load_test_data(test_file)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))

    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)

    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])

    test_data = Data((test_texts, None), word2index)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)

    x_train = []
    y_train = []  # TODO replace
    x_test = []
    for dir, checkpoints_per_model in zip(paths, models_files):
        print(dir, checkpoints_per_model)
        if saved == 1 and os.path.isfile(
                os.path.join(dir, 'npy', "oof_train.npy")):
            oof_train, oof_train_y, oof_test = load_oof(dir)
        else:
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            assert NFOLDS == args.folds
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                fold = int(cp.replace('_', '.').split('.')[-2])
                print(fold)
                clfs[fold - 1] = cp
            oof_train, oof_train_y, oof_test = get_oof(clfs, raw_texts,
                                                       raw_labels, test_data,
                                                       word2index, attr_dict)
        x_train.append(oof_train)
        if len(y_train) == 0:
            y_train = oof_train_y
        else:
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)
    x_train = np.stack(x_train, axis=2)
    x_test = np.stack(x_test, axis=2)

    print(x_train.shape)
    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    test_predict = []
    for c in range(x_train.shape[1]):
        x_train_c = x_train[:, c, :].reshape(num_train, -1)
        x_test_c = x_test[:, c, :].reshape(num_test, -1)
        meta_clf_c = LogisticRegression()
        y_train_c = y_train[:, c]
        meta_clf_c.fit(x_train_c, y_train_c)
        test_predict_c = meta_clf_c.predict_proba(x_test_c)[:, 1]
        test_predict.append(test_predict_c)

    test_predict = np.stack(test_predict, axis=1)
    print(test_predict.shape)
    fw = codecs.open("../data/test_predict_aspect_ensemble.txt",
                     'w',
                     encoding='utf-8')

    for prob in test_predict:
        attributes = []
        voted = [0 for a in range(len(attr_list))]

        for i in range(len(prob)):
            p = prob[i]
            # print(p)
            if p > args.threshold:
                voted[i] = 1
                # categories.append(attrC[i])
        if sum(voted) == 0:
            voted[prob.argmax()] = 1
        for i, l in enumerate(voted):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    fw.close()
    shutil.copy2(
        "../data/test_predict_aspect_ensemble.txt",
        "../data/backup/test_predict_aspect_ensemble_%s.txt" % time_stamp)
def stacking():
    # saved = True if args.saved != 0 else False
    saved = args.saved
    f_train = ROOT_DIR + "/data/train.txt"
    test_file1 = ROOT_DIR + "/data/test.txt"
    test_file2 = ROOT_DIR + "/data/test_predict_aspect_ensemble.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    # print(test_aspects)

    fo = load_abp_raw(f_train)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))

    f_dict = ROOT_DIR + "/dataset/polarity.json"
    polarity_list, polarity_dict = parse_json(f_dict)
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict2)

    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        path = BASE_DIR + '/' + path
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
    test_data = Data3((test_texts, None, test_aspects),
                      word2index,
                      polarity_dict,
                      args,
                      target_dict=attr_dict)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)

    x_train = []
    y_train = []
    x_test = []
    for dir, checkpoints_per_model in zip(paths, models_files):
        print(dir, checkpoints_per_model)
        dir = BASE_DIR + '/' + dir
        if saved == 1:
            oof_train, oof_train_y, oof_test = load_oof_dir(dir)
        else:
            print(checkpoints_per_model)
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            # assert NFOLDS == args.folds
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                fold = int(cp.replace('_', '.').split('.')[-2])
                clfs[fold - 1] = cp
            if saved == 2:
                oof_train, oof_train_y, oof_test = load_oof(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            elif saved == 3:
                oof_train, oof_train_y, oof_test = load_oof3(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            elif saved == 0:
                oof_train, oof_train_y, oof_test = get_oof(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            else:
                print("saved error, [0:3]")
                exit(-1)
        x_train.append(oof_train)
        oof_train_y = oof_train_y.reshape(oof_train_y.shape[0], )
        if len(y_train) == 0:
            y_train = oof_train_y
        else:
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)
    x_train = np.concatenate(x_train, axis=1)
    x_test = np.concatenate(x_test, axis=1)

    y_train = np.asarray(y_train).reshape((len(y_train), ))

    meta_clf = LogisticRegression()
    meta_clf.fit(x_train, y_train)
    test_predict = meta_clf.predict_proba(x_test)
    fw = codecs.open(ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for j, prob in enumerate(test_predict):
        polarity = np.argmax(prob) - 1
        fw.write(test_aspects[j] + ',' + str(polarity) + '\n')
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    fw.close()
    shutil.copy2(
        ROOT_DIR + "/data/test_predict_polarity_ensemble.txt", ROOT_DIR +
        "/data/backup/test_predict_polarity_ensemble_%s.txt" % time_stamp)