def blending():
    """Average the test-set probabilities of several polarity models and
    write one "aspect,polarity" line per example."""
    saved = args.saved
    test_file1 = ROOT_DIR + "/data/test.txt"
    test_file2 = ROOT_DIR + "/data/test_predict_aspect_ensemble.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    f_dict = ROOT_DIR + "/dataset/polarity.json"
    polarity_list, polarity_dict = parse_json(f_dict)
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict2)
    # Each '#'-separated directory holds the checkpoints of one model.
    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
    test_data = Data3((test_texts, None, test_aspects),
                      word2index,
                      polarity_dict,
                      args,
                      target_dict=attr_dict)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)
    x_test = []
    for model_dir, checkpoints_per_model in zip(paths, models_files):
        print(model_dir, checkpoints_per_model)
        if saved == 1:
            # Reuse cached out-of-fold test predictions.
            oof_test = load_oof_test(model_dir)
        else:
            clfs = checkpoints_per_model
            oof_test = get_oof_test(clfs, test_data)
        x_test.append(oof_test)
    x_test = np.stack(x_test, axis=1)
    print(x_test.shape)
    # Blend by simple averaging over the model axis.
    test_predict = np.mean(x_test, axis=1)
    fw = codecs.open(ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for j, prob in enumerate(test_predict):
        polarity = np.argmax(prob) - 1
        fw.write(test_aspects[j] + ',' + str(polarity) + '\n')
    fw.close()
    # time.asctime() contains ':' and spaces; normalize both so the backup
    # filename is a single clean token.
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    shutil.copy2(
        ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
        ROOT_DIR + "/data/backup/test_predict_polarity_ensemble_%s.txt" %
        time_stamp)

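# A minimal, self-contained sketch of the probability-to-polarity mapping used
# in blending() above: the argmax over the averaged class probabilities is
# shifted by -1 to map the class indices {0, 1, 2} onto the polarity labels
# {-1, 0, 1}. The example probabilities (and the negative/neutral/positive
# reading of the labels) are assumptions for illustration only.
def _polarity_from_probs_demo():
    import numpy as np
    probs = np.array([[0.1, 0.7, 0.2],   # argmax 1 -> polarity  0
                      [0.8, 0.1, 0.1]])  # argmax 0 -> polarity -1
    return np.argmax(probs, axis=1) - 1  # array([ 0, -1])
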
def test():
    test_file1 = ROOT_DIR + "/attribute_level/data/attribute_test.txt"
    test_file2 = ROOT_DIR + "/attribute_level/test_predict.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    W, word2index = load_w2v(f_w2v)
    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)
    assert len(test_texts) == len(test_aspects)
    files = ["checkpoint_HEAT_0.7189.pt", "checkpoint_HEAT_0.7062.pt"]
    predicts = []
    for check_point in files:
        predict = []
        classifier = torch.load(check_point)
        for text, aspect in zip(test_texts, test_aspects):
            if aspect != '':
                if aspect is None:
                    print("error")
                test_data = Data3(([text], [None], [aspect]),
                                  word2index,
                                  polarity_dict,
                                  args,
                                  target_dict=attr_dict)
                test_predict = train_single.predict(classifier, test_data, args)
                assert len(test_predict) == 1
                polarity = str(test_predict[0].item() - 1)
            else:
                # No predicted aspect for this text; fall back to neutral.
                print(aspect)
                print(text)
                polarity = '0'
            predict.append(aspect + ',' + polarity)
        predicts.append(predict)
    print(len(predicts))
    print(len(predicts[0]))
    # Majority vote across checkpoints; most_common(1) returns a list of
    # (item, count) pairs, so the winning string is voted[0][0].
    fw = codecs.open("test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for j in range(len(predicts[0])):
        votes = [predicts[i][j] for i in range(len(predicts))]
        voted = Counter(votes).most_common(1)
        fw.write(voted[0][0] + '\n')
    fw.close()

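# A minimal sketch of the majority vote in test() above: Counter.most_common(1)
# returns [(item, count)], so the winning "aspect,polarity" string is
# voted[0][0]. The vote strings here are made up for illustration.
def _majority_vote_demo():
    from collections import Counter
    votes = ['price,1', 'price,1', 'price,-1']
    voted = Counter(votes).most_common(1)  # [('price,1', 2)]
    return voted[0][0]                     # 'price,1'
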
def test():
    model = AttributeClassifier()
    check_point = "checkpoint_AttA3_0.8810.pt"
    model.load_model(check_point)
    test_file = "data/attribute_test.txt"
    test_texts = load_test_data(test_file)
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    W, word2index = load_w2v(f_w2v)
    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)
    test_data = Data((test_texts, None), word2index)
    test_predict = train.predict(model.classifier, test_data, args)
    print(test_predict)
    # One line per example: the names of all predicted attributes, '|'-joined.
    fw = codecs.open("test_predict.txt", 'w', encoding='utf-8')
    for p in test_predict:
        attributes = []
        for i, l in enumerate(p):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')
    fw.close()

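# A minimal sketch of the multi-label decoding in test() above: a 0/1
# prediction vector is mapped to '|'-joined attribute names. The attr_list
# here is a made-up stand-in for the real contents of attribute.json.
def _decode_attributes_demo():
    attr_list = ['price', 'quality', 'service']  # hypothetical attribute order
    p = [1, 0, 1]
    return '|'.join(a for a, l in zip(attr_list, p) if l != 0)  # 'price|service'
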
def ensemble():
    f_train = ROOT_DIR + "/data/train.txt"
    if args.w2v == "merge":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    print(f_train)
    print(f_w2v)
    if not os.path.exists(args.check_dir):
        os.mkdir(args.check_dir)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)
    kf = 0
    fo = load_abp_raw(f_train)
    for train_index, test_index in kfold_split(len(fo), args.folds):
        kf += 1
        print("FOLD:", kf)
        train_texts, train_labels, train_aspects, test_texts, test_labels, \
            test_aspects = splits(fo, train_index, test_index)
        print(len(train_texts))
        print(len(test_texts))
        model = Classifier()
        print(attr_list)
        print(attr_dict)
        model.train_from_data((train_texts, train_labels, train_aspects),
                              (test_texts, test_labels, test_aspects), W,
                              word2index, polarity_dict, attr_dict, args, kf)

def main():
    f_train = ROOT_DIR + "/data/train.txt"
    f_test = "data/test_p.txt"
    if args.w2v == "merge":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = ROOT_DIR + "/embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict1 = ROOT_DIR + "/dataset/polarity.json"
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    print(f_w2v)
    train_texts, train_labels, train_aspects, test_texts, test_labels, \
        test_aspects = load_abp_data(f_train, folds=5)
    if not os.path.exists(args.check_dir):
        os.mkdir(args.check_dir)
    print(len(train_texts))
    print(len(test_texts))
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    polarity_list, polarity_dict = parse_json(f_dict1)
    attr_list, attr_dict = parse_json(f_dict2)
    model = Classifier()
    print(polarity_list)
    print(polarity_dict)
    model.train_from_data((train_texts, train_labels, train_aspects),
                          (test_texts, test_labels, test_aspects), W,
                          word2index, polarity_dict, attr_dict, args)

def ensemble():
    f_train = "../data/train.txt"
    if args.w2v == "merge":
        f_w2v = "../embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = "../embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = "../embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict = "../dataset/attribute.json"
    print(f_train)
    print(f_w2v)
    if not os.path.exists(args.check_dir):
        os.mkdir(args.check_dir)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    attr_list, attr_dict = parse_json(f_dict)
    kf = 0
    for train_index, test_index in kfold_split(len(raw_texts), args.folds):
        kf += 1
        print("FOLD:", kf)
        print("TRAIN:", str(len(train_index)), '\n',
              "TEST:", str(len(test_index)))
        test_texts = [raw_texts[i] for i in test_index]
        test_labels = [raw_labels[i] for i in test_index]
        train_texts = [raw_texts[i] for i in train_index]
        train_labels = [raw_labels[i] for i in train_index]
        print(len(train_texts))
        print(len(test_labels))
        model = AttributeClassifier()
        print(attr_list)
        print(attr_dict)
        model.train_from_data((train_texts, train_labels),
                              (test_texts, test_labels), W, word2index,
                              attr_dict, args, kf)

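# A minimal sketch of the fold-index selection pattern in ensemble() above,
# assuming the project's kfold_split behaves like sklearn's KFold over example
# indices (the toy data here is made up; one model per fold would be trained
# on each split).
def _kfold_select_demo():
    from sklearn.model_selection import KFold
    raw_texts = ['text%d' % i for i in range(10)]
    raw_labels = list(range(10))
    for train_index, test_index in KFold(n_splits=5).split(raw_texts):
        train_texts = [raw_texts[i] for i in train_index]
        test_labels = [raw_labels[i] for i in test_index]
        print(len(train_texts), len(test_labels))  # 8 2, for each of 5 folds
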
def main():
    f_train = "../data/train.txt"
    if args.w2v == "merge":
        f_w2v = "../embedding/embedding_all_merge_300.txt"
    elif args.w2v == "fasttext":
        f_w2v = "../embedding/embedding_all_fasttext_300.txt"
    elif args.w2v == "fasttext2":
        f_w2v = "../embedding/embedding_all_fasttext2_300.txt"
    elif args.w2v == "tencent":
        f_w2v = "../embedding/embedding_all_tencent_200.txt"
    else:
        print("error, no embedding")
        exit(-1)
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    train_texts, train_labels = load_attr_data(filename=f_train)
    train_texts, train_labels, test_texts, test_labels = split_dev(
        train_texts, train_labels)
    print(len(train_texts))
    print(len(test_labels))
    if not os.path.exists(args.check_dir):
        os.mkdir(args.check_dir)
    W, word2index2 = load_w2v(f_w2v)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    assert word2index == word2index2
    attr_list, attr_dict = parse_json(f_dict)
    print(list(attr_dict.keys()))
    model = AttributeClassifier()
    print(attr_list)
    print(attr_dict)
    model.train_from_data((train_texts, train_labels),
                          (test_texts, test_labels), W, word2index, attr_dict,
                          args)

def dev():
    model = AttributeClassifier()
    check_point = "checkpoints5/checkpoint_AttA3_0.8666.pt"
    model.load_model(check_point)
    f_train = "data/attribute_data.txt"
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    W, word2index = load_w2v(f_w2v)
    attr_list, attr_dict = parse_json(f_dict)
    # Evaluate on the held-out indices of the third fold.
    _, test_index = kfold_split(len(raw_texts), args.folds)[2]
    test_texts = [raw_texts[i] for i in test_index]
    test_labels = [raw_labels[i] for i in test_index]
    test_data = Data((test_texts, test_labels), word2index, attr_dict, args)
    test_predict = train.predict(model.classifier, test_data, args)
    pred_acc_t = score(test_predict, test_data.labels)
    print(pred_acc_t)

def stacking():
    saved = args.saved != 0
    f_train = "../data/train.txt"
    test_file = "../data/test.txt"
    test_texts = load_test_data(test_file)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)
    # Each '#'-separated directory holds the fold checkpoints of one model.
    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
    test_data = Data((test_texts, None), word2index)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)
    x_train = []
    y_train = None
    x_test = []
    for model_dir, checkpoints_per_model in zip(paths, models_files):
        print(model_dir, checkpoints_per_model)
        if saved and os.path.isfile(
                os.path.join(model_dir, 'npy', "oof_train.npy")):
            # Reuse cached out-of-fold predictions.
            oof_train, oof_train_y, oof_test = load_oof(model_dir)
        else:
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            assert NFOLDS == args.folds
            # Order the checkpoints by the fold number embedded in the
            # filename (e.g. "..._3.pt" -> fold 3).
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                fold = int(cp.replace('_', '.').split('.')[-2])
                print(fold)
                clfs[fold - 1] = cp
            oof_train, oof_train_y, oof_test = get_oof(
                clfs, raw_texts, raw_labels, test_data, word2index, attr_dict)
        x_train.append(oof_train)
        if y_train is None:
            y_train = oof_train_y
        else:
            # Every model must see the folds in the same order.
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)
    x_train = np.stack(x_train, axis=2)
    x_test = np.stack(x_test, axis=2)
    print(x_train.shape)
    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    # Fit one meta-classifier per attribute class on the stacked
    # out-of-fold probabilities.
    test_predict = []
    for c in range(x_train.shape[1]):
        x_train_c = x_train[:, c, :].reshape(num_train, -1)
        x_test_c = x_test[:, c, :].reshape(num_test, -1)
        meta_clf_c = LogisticRegression()
        y_train_c = y_train[:, c]
        meta_clf_c.fit(x_train_c, y_train_c)
        test_predict_c = meta_clf_c.predict_proba(x_test_c)[:, 1]
        test_predict.append(test_predict_c)
    test_predict = np.stack(test_predict, axis=1)
    print(test_predict.shape)
    fw = codecs.open("../data/test_predict_aspect_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for prob in test_predict:
        attributes = []
        voted = [0 for a in range(len(attr_list))]
        for i in range(len(prob)):
            if prob[i] > args.threshold:
                voted[i] = 1
        # If no class clears the threshold, keep the single best one.
        if sum(voted) == 0:
            voted[prob.argmax()] = 1
        for i, l in enumerate(voted):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')
    fw.close()
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    shutil.copy2(
        "../data/test_predict_aspect_ensemble.txt",
        "../data/backup/test_predict_aspect_ensemble_%s.txt" % time_stamp)

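# A minimal, self-contained sketch of the per-attribute stacking step in
# stacking() above: x_train has shape (num_train, num_classes, num_models),
# and one LogisticRegression per attribute class is fit on the models'
# out-of-fold probabilities. Random arrays stand in for real OOF predictions.
def _per_class_stacking_demo():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    x_train = rng.rand(100, 3, 2)               # (examples, classes, models)
    y_train = rng.randint(0, 2, size=(100, 3))  # 0/1 label per class
    x_test = rng.rand(20, 3, 2)
    test_predict = []
    for c in range(x_train.shape[1]):
        clf = LogisticRegression()
        clf.fit(x_train[:, c, :], y_train[:, c])
        test_predict.append(clf.predict_proba(x_test[:, c, :])[:, 1])
    return np.stack(test_predict, axis=1)       # (20, 3) class probabilities
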
def stacking():
    saved = args.saved
    f_train = ROOT_DIR + "/data/train.txt"
    test_file1 = ROOT_DIR + "/data/test.txt"
    test_file2 = ROOT_DIR + "/data/test_predict_aspect_ensemble.txt"
    test_texts, test_aspects = load_ab_test(test_file1, test_file2)
    fo = load_abp_raw(f_train)
    word2index = pickle.load(open(ROOT_DIR + "/data/vocabulary.pkl", 'rb'))
    f_dict = ROOT_DIR + "/dataset/polarity.json"
    polarity_list, polarity_dict = parse_json(f_dict)
    f_dict2 = ROOT_DIR + "/dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict2)
    # Each '#'-separated directory holds the fold checkpoints of one model.
    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        path = BASE_DIR + '/' + path
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
    test_data = Data3((test_texts, None, test_aspects),
                      word2index,
                      polarity_dict,
                      args,
                      target_dict=attr_dict)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)
    x_train = []
    y_train = None
    x_test = []
    for model_dir, checkpoints_per_model in zip(paths, models_files):
        print(model_dir, checkpoints_per_model)
        model_dir = BASE_DIR + '/' + model_dir
        if saved == 1:
            oof_train, oof_train_y, oof_test = load_oof_dir(model_dir)
        else:
            print(checkpoints_per_model)
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            # Order the checkpoints by the fold number embedded in the
            # filename (e.g. "..._3.pt" -> fold 3).
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                fold = int(cp.replace('_', '.').split('.')[-2])
                clfs[fold - 1] = cp
            if saved == 2:
                oof_train, oof_train_y, oof_test = load_oof(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            elif saved == 3:
                oof_train, oof_train_y, oof_test = load_oof3(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            elif saved == 0:
                oof_train, oof_train_y, oof_test = get_oof(
                    clfs,
                    fo,
                    test_data,
                    word2index,
                    polarity_dict=polarity_dict,
                    attr_dict=attr_dict)
            else:
                print("saved error, expected a value in [0, 3]")
                exit(-1)
        x_train.append(oof_train)
        oof_train_y = oof_train_y.reshape(oof_train_y.shape[0], )
        if y_train is None:
            y_train = oof_train_y
        else:
            # Every model must see the folds in the same order.
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)
    # Concatenate each model's class probabilities per example and fit a
    # single multiclass meta-classifier on top.
    x_train = np.concatenate(x_train, axis=1)
    x_test = np.concatenate(x_test, axis=1)
    y_train = np.asarray(y_train).reshape((len(y_train), ))
    meta_clf = LogisticRegression()
    meta_clf.fit(x_train, y_train)
    test_predict = meta_clf.predict_proba(x_test)
    fw = codecs.open(ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
                     'w',
                     encoding='utf-8')
    for j, prob in enumerate(test_predict):
        polarity = np.argmax(prob) - 1
        fw.write(test_aspects[j] + ',' + str(polarity) + '\n')
    fw.close()
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    shutil.copy2(
        ROOT_DIR + "/data/test_predict_polarity_ensemble.txt",
        ROOT_DIR + "/data/backup/test_predict_polarity_ensemble_%s.txt" %
        time_stamp)

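# A minimal sketch of the meta-classifier step in stacking() above: each base
# model contributes its polarity class probabilities, the blocks are
# concatenated per example, and one multiclass LogisticRegression is fit on
# top. The shapes and random data are assumptions for illustration.
def _meta_classifier_demo():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    x_train = rng.rand(60, 6)             # 2 models x 3 polarity classes
    y_train = rng.randint(0, 3, size=60)  # class ids {0, 1, 2}
    x_test = rng.rand(10, 6)
    meta_clf = LogisticRegression().fit(x_train, y_train)
    return meta_clf.predict_proba(x_test)  # (10, 3); argmax - 1 -> polarity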