        l = line.split("\t")
        target_id = l[0]
        source_id = l[1]
        intent = l[2]
        left_citated_text = l[3]
        right_citated_text = l[4]
        # Keep at most 250 subword tokens on each side of the citation mark.
        left_citation_tokenized = tokenizer.tokenize(left_citated_text)[-250:]
        right_citation_tokenized = tokenizer.tokenize(right_citated_text)[:250]
        # Left context + [SEP] + right context; [SEP] stands in for the citation mark.
        input_tokens = (tokenizer.convert_tokens_to_ids(left_citation_tokenized)
                        + [tokenizer.sep_token_id]
                        + tokenizer.convert_tokens_to_ids(right_citation_tokenized))
        position_citation_mark = len(left_citation_tokenized)
        tokens_tensor = torch.tensor([input_tokens])
        outputs = model(tokens_tensor)
        # Use the contextual embedding at the citation position as the feature vector.
        emb = np.array(outputs[0][0][position_citation_mark].detach().cpu())
        # Map each intent string to a running integer label.
        if intent not in intentdict:
            intentdict[intent] = intentn
            intentn += 1
        X.append(emb)
        y.append(intentdict[intent])
    return X, y


if __name__ == "__main__":
    ent_vocab = build_ent_vocab(
        "/home/ohagi_masaya/TransBasedCitEmb/dataset/AASC/train.csv")
    load_data_SVM_with_context(ent_vocab)
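# --- Hedged usage sketch (not part of the original code) --------------------
# The fragment above turns each citation context into a SciBERT embedding (X)
# and each intent label into an integer id (y), so a natural consumer is a
# linear SVM. The train/test split, the C value, and the use of scikit-learn's
# LinearSVC below are illustrative assumptions, not the repository's method.
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

ent_vocab = build_ent_vocab(
    "/home/ohagi_masaya/TransBasedCitEmb/dataset/AASC/train.csv")
X, y = load_data_SVM_with_context(ent_vocab)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
clf = LinearSVC(C=1.0, max_iter=10000)
clf.fit(X_train, y_train)
print("macro-F1:", f1_score(y_test, clf.predict(X_test), average="macro"))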
def load_AASC_graph_data(args):
    def extract_by_frequency(path_train, path_test, frequency):
        dftrain = pd.read_csv(path_train, quotechar="'")
        dftest = pd.read_csv(path_test, quotechar="'")
        # Count citations per source_id over unique (target_id, source_id) pairs.
        source_cut_train = dftrain[['target_id', 'source_id']].drop_duplicates(
            subset=['target_id', 'source_id'])
        source_cut_test = dftest[['target_id', 'source_id']].drop_duplicates(
            subset=['target_id', 'source_id'])
        ftrain_fre = open(path_train[:-4] + "_frequency" + str(frequency) + ".csv", "w")
        ftest_fre = open(path_test[:-4] + "_frequency" + str(frequency) + ".csv", "w")
        wtrain = csv.writer(ftrain_fre, quotechar="'")
        wtest = csv.writer(ftest_fre, quotechar="'")
        wtrain.writerow(["target_id", "left_citated_text", "right_citated_text", "source_id"])
        wtest.writerow(["target_id", "left_citated_text", "right_citated_text", "source_id"])
        # Merge train and test citation counts per source_id.
        train_counts = source_cut_train.source_id.value_counts()
        test_counts = source_cut_test.source_id.value_counts()
        dic1 = {}
        for key in train_counts.keys():
            dic1[key] = train_counts[key]
        for key in test_counts.keys():
            if key in dic1:
                dic1[key] += test_counts[key]
            else:
                dic1[key] = test_counts[key]
        # Keep only source papers cited at least `frequency` times.
        frequencylist = [key for key in dic1 if dic1[key] >= frequency]
        dftrain = dftrain.loc[dftrain["source_id"].isin(frequencylist)]
        dftest = dftest.loc[dftest["source_id"].isin(frequencylist)]
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftrain["target_id"], dftrain["left_citated_text"],
                dftrain["right_citated_text"], dftrain["source_id"]):
            wtrain.writerow([target_id, left_citated_text, right_citated_text, source_id])
        ftrain_fre.close()
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftest["target_id"], dftest["left_citated_text"],
                dftest["right_citated_text"], dftest["source_id"]):
            wtest.writerow([target_id, left_citated_text, right_citated_text, source_id])
        ftest_fre.close()
        # Entity vocabulary over all paper ids in the filtered data.
        entitylist = list(set(
            list(dftrain["source_id"].values) + list(dftrain["target_id"].values) +
            list(dftest["source_id"].values) + list(dftest["target_id"].values)))
        entitylist.sort()
        ent_vocab = {"UNKNOWN": 0, "MASK": 1}
        for i, entity in enumerate(entitylist):
            ent_vocab[entity] = i + 2
        return (path_train[:-4] + "_frequency" + str(frequency) + ".csv",
                path_test[:-4] + "_frequency" + str(frequency) + ".csv",
                ent_vocab)

    path = settings.citation_recommendation_dir
    path_train = os.path.join(path, "train.csv")
    path_test = os.path.join(path, "test.csv")
    ent_vocab = build_ent_vocab(path_train)
    path_train_frequency5, path_test_frequency5, ent_vocab_frequency5 = extract_by_frequency(
        path_train, path_test, args.frequency)
    # Temporarily changed so that MASK positions are chosen at random.
    datasetdict = {
        "tail": AASCDataSet,
        "random": AASCDataSetRANDOM,
        "both": AASCDataSetBOTH
    }
    cur_dataset = datasetdict[args.mask_type]
    if args.train_data == "full":
        dataset_train = cur_dataset(path_train,
                                    ent_vocab=ent_vocab,
                                    WINDOW_SIZE=args.WINDOW_SIZE,
                                    MAX_LEN=args.MAX_LEN,
                                    pretrained_model=args.pretrained_model,
                                    mode="train")
    else:
        dataset_train = cur_dataset(path_train_frequency5,
                                    ent_vocab=ent_vocab,
                                    WINDOW_SIZE=args.WINDOW_SIZE,
                                    MAX_LEN=args.MAX_LEN,
                                    pretrained_model=args.pretrained_model,
                                    mode="train")
    if args.test_data == "full":
        dataset_test = cur_dataset(path_test,
                                   ent_vocab=ent_vocab,
                                   WINDOW_SIZE=args.WINDOW_SIZE,
                                   MAX_LEN=args.MAX_LEN,
                                   pretrained_model=args.pretrained_model,
                                   mode="test")
    else:
        dataset_test = cur_dataset(path_test_frequency5,
                                   ent_vocab=ent_vocab,
                                   WINDOW_SIZE=args.WINDOW_SIZE,
                                   MAX_LEN=args.MAX_LEN,
                                   pretrained_model=args.pretrained_model,
                                   mode="test")
    print("----loading data done----")
    return dataset_train, dataset_test, ent_vocab
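# --- Hedged driver sketch (not part of the original code) -------------------
# Minimal example of calling the loader above. Only the attribute names are
# taken from the code; the concrete values (frequency threshold, window size,
# maximum length, SciBERT checkpoint, batch size, default collation) are
# assumptions for illustration.
from argparse import Namespace
from torch.utils.data import DataLoader

args = Namespace(frequency=5,
                 mask_type="tail",      # one of "tail", "random", "both"
                 train_data="full",
                 test_data="frequency5",
                 WINDOW_SIZE=125,
                 MAX_LEN=256,
                 pretrained_model="allenai/scibert_scivocab_uncased")
dataset_train, dataset_test, ent_vocab = load_AASC_graph_data(args)
train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)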
def load_AASC_graph_data(args):
    def extract_by_frequency(path_train, path_test, frequency):
        dftrain = pd.read_csv(path_train, quotechar="'")
        dftest = pd.read_csv(path_test, quotechar="'")
        # Count citations per source_id over unique (target_id, source_id) pairs.
        source_cut_train = dftrain[['target_id', 'source_id']].drop_duplicates(
            subset=['target_id', 'source_id'])
        source_cut_test = dftest[['target_id', 'source_id']].drop_duplicates(
            subset=['target_id', 'source_id'])
        ftrain_fre = open(path_train[:-4] + "_frequency" + str(frequency) + ".csv", "w")
        ftest_fre = open(path_test[:-4] + "_frequency" + str(frequency) + ".csv", "w")
        wtrain = csv.writer(ftrain_fre, quotechar="'")
        wtest = csv.writer(ftest_fre, quotechar="'")
        wtrain.writerow(["target_id", "left_citated_text", "right_citated_text", "source_id"])
        wtest.writerow(["target_id", "left_citated_text", "right_citated_text", "source_id"])
        # Merge train and test citation counts per source_id.
        train_counts = source_cut_train.source_id.value_counts()
        test_counts = source_cut_test.source_id.value_counts()
        dic1 = {}
        for key in train_counts.keys():
            dic1[key] = train_counts[key]
        for key in test_counts.keys():
            if key in dic1:
                dic1[key] += test_counts[key]
            else:
                dic1[key] = test_counts[key]
        # Keep only source papers cited at least `frequency` times.
        frequencylist = [key for key in dic1 if dic1[key] >= frequency]
        dftrain = dftrain.loc[dftrain["source_id"].isin(frequencylist)]
        dftest = dftest.loc[dftest["source_id"].isin(frequencylist)]
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftrain["target_id"], dftrain["left_citated_text"],
                dftrain["right_citated_text"], dftrain["source_id"]):
            wtrain.writerow([target_id, left_citated_text, right_citated_text, source_id])
        ftrain_fre.close()
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftest["target_id"], dftest["left_citated_text"],
                dftest["right_citated_text"], dftest["source_id"]):
            wtest.writerow([target_id, left_citated_text, right_citated_text, source_id])
        ftest_fre.close()
        # Entity vocabulary over all paper ids in the filtered data.
        entitylist = list(set(
            list(dftrain["source_id"].values) + list(dftrain["target_id"].values) +
            list(dftest["source_id"].values) + list(dftest["target_id"].values)))
        entitylist.sort()
        entvocab = {"UNKNOWN": 0, "MASK": 1}
        for i, entity in enumerate(entitylist):
            entvocab[entity] = i + 2
        return (path_train[:-4] + "_frequency" + str(frequency) + ".csv",
                path_test[:-4] + "_frequency" + str(frequency) + ".csv",
                entvocab)

    path = settings.citation_recommendation_dir
    path_train = os.path.join(path, "train.csv")
    path_test = os.path.join(path, "test.csv")
    path_emb_train = os.path.join(path, "scibert_AASCtrain.npy")
    path_emb_test = os.path.join(path, "scibert_AASCtest.npy")
    entvocab = build_ent_vocab(path_train)
    # Build citation matrices from the pre-computed SciBERT embedding files.
    matrix_train = makecitationmatrix_AASC(path_train, path_emb_train, entvocab)
    matrix_test = makecitationmatrix_AASC(path_test, path_emb_test, entvocab)
    path_train_frequency5, path_test_frequency5, entvocab_frequency5 = extract_by_frequency(
        path_train, path_test, args.frequency)
    if args.train_data == "full":
        dataset_train = AASCDataSet(path_train,
                                    ent_vocab=entvocab,
                                    MAX_LEN=args.MAX_LEN,
                                    matrix=matrix_train)
    else:
        dataset_train = AASCDataSet(path_train_frequency5,
                                    ent_vocab=entvocab,
                                    MAX_LEN=args.MAX_LEN,
                                    matrix=matrix_train)
    if args.test_data == "full":
        dataset_test = AASCDataSet(path_test,
                                   ent_vocab=entvocab,
                                   MAX_LEN=args.MAX_LEN,
                                   matrix=matrix_test,
                                   mode="test")
    else:
        dataset_test = AASCDataSet(path_test_frequency5,
                                   ent_vocab=entvocab,
                                   MAX_LEN=args.MAX_LEN,
                                   matrix=matrix_test,
                                   mode="test")
    # return dataset_train, dataset_test, entvocab
    return dataset_train, dataset_test, entvocab, matrix_train, matrix_test
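# --- Toy illustration of the frequency filter (not part of the original code) --
# extract_by_frequency pools citation counts over train and test (after
# dropping duplicate (target_id, source_id) pairs) and keeps only source
# papers cited at least `frequency` times. The tiny frames below are made-up
# data that reproduce that rule with plain pandas.
import pandas as pd

toy_train = pd.DataFrame({"target_id": ["t1", "t2", "t3"],
                          "source_id": ["P1", "P1", "P2"]})
toy_test = pd.DataFrame({"target_id": ["t4", "t5"],
                         "source_id": ["P1", "P3"]})
frequency = 2
train_counts = toy_train.drop_duplicates(
    subset=["target_id", "source_id"])["source_id"].value_counts()
test_counts = toy_test.drop_duplicates(
    subset=["target_id", "source_id"])["source_id"].value_counts()
total = train_counts.add(test_counts, fill_value=0)
kept = total[total >= frequency].index.tolist()
print(kept)  # ['P1']: cited twice in train and once in test, above the threshold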