def gen_one_test_feature():
    """Build features for a single unassigned paper (debug helper).

    Processes only the FIRST entry of the validation unassigned list
    (note the ``break`` at the bottom of the loop) and, unlike the full
    pipeline, scores that paper against EVERY known author id rather
    than only the ids sharing the author's name.

    Output: ./testdatafeatures_one.pkl with format
    {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], ...]}}
    """
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    # aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    # Flatten every aid of every name into one global candidate pool.
    all_aids = []
    for key in name2aids:
        aids = name2aids[key]
        all_aids.extend(aids.tolist())
    all_aids = np.array(all_aids)
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]
        # FIX: the original assigned name2aids[author_name] here and
        # immediately overwrote it with all_aids; the dead assignment is
        # removed (behavior unchanged: all aids are candidates).
        candidate_aids = all_aids
        inner_dict['candidate-aids'] = candidate_aids
        # Paper-side info does not depend on aid: hoisted out of the loop.
        pid_info_dict = valid_pub[now_pid]
        data = []
        for aid in candidate_aids:
            print(aid)  # debug trace, kept on purpose in this debug helper
            new_pair = (aid, pid_with_index)
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict))
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
        break  # debug: only the first unassigned paper is processed
    save_pickle(testdatafeatures, './testdatafeatures_one.pkl')
def gen_test_feature():
    """Build the full test-feature matrix for every unassigned valid paper.

    For each 'pid-index' entry in the validation unassigned list, the
    candidate author ids are those sharing the (cleaned, index-matched)
    author name; each (aid, paper) pair is turned into a feature vector
    via get_features and extended with the author's relative paper count.

    Output pickle format:
    {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], ...]}}
    """
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))
    # Total number of assigned papers, used to normalise the per-author
    # paper count feature. (idiom: sum() replaces the manual loop)
    all_pids_len = sum(len(pids) for pids in aid2pids.values())
    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    # author_name_count = defaultdict(int)
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]
        # author_name_count[author_name] += 1
        # continue
        candidate_aids = name2aids[author_name]
        inner_dict['candidate-aids'] = candidate_aids
        # Paper-side info is the same for every candidate aid: hoisted.
        pid_info_dict = valid_pub[now_pid]
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            org_info_set = aid2orgset[aid]
            aid_venue_set = aid2venueset[aid]
            aid_keywords_set = aid2keywordsset[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict, org_info_set,
                             aid_venue_set, aid_keywords_set))
            # Extra feature: this author's share of all assigned papers.
            data[-1].append(len(aid2pids[aid]) / all_pids_len)
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
    save_pickle(
        testdatafeatures,
        os.path.join(TEST_FEATURE_DIR_V2,
                     'testdatafeatures-withsetinfo-papercount.pkl'))
def gen_test_title_abstract_vec(mission='title'):
    """Pair each candidate author's mean title/abstract embedding with the
    embedding of the unassigned paper's own title/abstract.

    Args:
        mission: 'title' or 'abstract' -- selects which per-author mean
            embedding pickle to load and which paper field to embed.

    Raises:
        ValueError: if mission is neither 'title' nor 'abstract'.

    Output pickle format:
    {pid-with-index: {candidate-aids: [...], data: [(emb0, emb1), ...]}}
    """
    if mission == 'title':
        aid2cate = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
    elif mission == 'abstract':
        aid2cate = load_pickle(
            os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
    else:
        raise ValueError('mission value error')
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()
    all_authors_name = list(name2aids.keys())
    test_cate_feature = {}
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]
        candidate_aids = name2aids[author_name]
        inner_dict['candidate-aids'] = candidate_aids
        # PERF FIX: the paper embedding does not depend on the candidate
        # aid, so it is computed once per paper instead of once per
        # candidate (the original recomputed it inside the aid loop).
        info = valid_pub[now_pid].get(mission)
        if info is None:
            emb = np.zeros(300)
        else:
            emb = texttovec.get_vec(info)
        data = [(aid2cate[aid], emb) for aid in candidate_aids]
        inner_dict['data'] = data
        test_cate_feature[pid_with_index] = inner_dict
    save_pickle(
        test_cate_feature,
        os.path.join(TEST_FEATURE_DIR_V2, 'test-%s-emb-pair.pkl' % mission))
def check_name():
    """Dump (pid-index, raw name, cleaned name, matched name) tuples for
    the problem papers so mismatched name resolutions can be inspected.

    Writes the de-duplicated list to FINAL_DIR/name.different.3.json and
    prints its length.
    """
    problem_pids = load_json(os.path.join(FINAL_DIR, 'problem.pids.3.json'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    test_pub = load_json(TEST_PUB_PATH)
    all_authors_name = list(name2aids.keys())
    # Hand-corrected names for two known-problematic records.
    manual_fixes = {
        'ToCcabLT-1': 'junliang_wang',
        'cVvvcFzj-1': 'xiaojun_liu',
    }
    name_map = []
    for pid_with_index in tqdm.tqdm(problem_pids):
        now_pid, pos = pid_with_index.split('-')
        raw_name = test_pub[now_pid]['authors'][int(pos)]['name']
        cleaned = clean_name(raw_name)
        cleaned = manual_fixes.get(pid_with_index, cleaned)
        matched = all_authors_name[get_name_index(cleaned, all_authors_name)]
        # if author_name_inlist != author_name:
        name_map.append((pid_with_index, raw_name, cleaned, matched))
    name_map = list(set(name_map))  # drop duplicate tuples
    print(len(name_map))
    save_json(name_map, os.path.join(FINAL_DIR, 'name.different.3.json'))
def get_triplet_corpus(mission='train'):
    """Rebuild the name/aid/paper index and per-author mean title vectors.

    Despite the name, this reproduces the first stage of ``preprocessing``:
    it indexes every author's papers as 'pid-index' strings, saves
    name2aids / aid2pids, then computes the mean title embedding per
    author and saves aid2titlevec.

    Args:
        mission: 'train' writes pickles under NEW_DATA_V2_DIR,
            'test' writes under NEW_DATA_DIR; any other value silently
            skips saving (no error is raised).
    """
    whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    whole_author_profile = load_json(WHOLE_AUTHOR_PROFILE_PATH)
    name2aids = {}
    aid2pids = {}
    aids = []
    names = []
    pids_with_index = []
    for aid in tqdm.tqdm(whole_author_profile):
        aids.append(aid)
        names.append(whole_author_profile[aid]['name'])
        pids = whole_author_profile[aid]['papers']
        tmp = []
        for paper in pids:
            paper_authors = whole_author_profile_pub[paper]['authors']
            author_names = [clean_name(item['name']) for item in paper_authors]
            # print(author_names)
            # Locate this author's position within the paper's author list
            # so papers are keyed as '<pid>-<author position>'.
            index = get_name_index(names[-1], author_names)
            tmp.append('%s-%d' % (paper, index))
        pids_with_index.append(tmp)
    # aids, names and pids_with_index are parallel lists.
    assert len(aids) == len(names)
    assert len(names) == len(pids_with_index)
    print('all aids num: ', len(aids))
    name_set = set(names)
    names_array = np.array(names)
    aids_array = np.array(aids)
    # A name can map to several aids (homonyms): boolean-mask selection.
    for name in name_set:
        target_aid = aids_array[names_array == name]
        name2aids[name] = target_aid
    for aid, pid in zip(aids, pids_with_index):
        aid2pids[aid] = pid
    if mission == 'train':
        save_pickle(name2aids, os.path.join(NEW_DATA_V2_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        save_pickle(name2aids, os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    texttovec = TextToVec()
    # Reload the pickle just written (round-trip keeps downstream code
    # identical whether or not this stage ran in the same process).
    if mission == 'train':
        aid2pids = load_pickle(os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    # ------------------------------------------
    # save format: aid2titlevec --> {aid: [mean value]}
    aid2titlevec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            title = whole_author_profile_pub[pid]['title']
            inner_list.append(texttovec.get_vec(title))
        if len(inner_list) == 0:
            # No papers: fall back to a zero vector (300-d, matching
            # TextToVec output size used elsewhere in this file).
            aid2titlevec[aid] = np.zeros(300)
        else:
            aid2titlevec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
    elif mission == 'test':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
def get_pid_with_index(whole_author_profile_pub, pid, name):
    """Return the '<pid>-<author position>' key for *name* in paper *pid*."""
    cleaned_names = [
        clean_name(author['name'])
        for author in whole_author_profile_pub[pid]['authors']
    ]
    position = get_name_index(name, cleaned_names)
    return '%s-%d' % (pid, position)
def preprocessing(mission='train'):
    """Build and pickle every per-author lookup table used downstream.

    Stages (each saved as its own pickle, 'train' -> NEW_DATA_V2_DIR,
    'test' -> NEW_DATA_DIR):
      name2aids / aid2pids, aid2year, aid2coauthor, aid2venue,
      aid2keywords, aid2orgset, aid2venueset, aid2keywordsset,
      aid2orgwithyear, aid2yearinfo, aid2titlevec, aid2abstractvec.

    Args:
        mission: 'train' or 'test' -- selects the input profile file and
            the output directory.

    Raises:
        ValueError: if mission is neither 'train' nor 'test'.
    """
    # os.makedirs(NEW_DATA_DIR, exist_ok=True)
    # ------------------------------------------
    # process whole_author_profile.json, add index, and save to pickle
    # save format: name2aids --> {name: [aids, ...]}, aid2pids --> {aid: [pid-index, ...]}
    os.makedirs(NEW_DATA_DIR, exist_ok=True)
    os.makedirs(NEW_DATA_V2_DIR, exist_ok=True)
    if mission == 'train':
        whole_author_profile = load_json(
            os.path.join(SPLIT_DIR, 'train_profile-last1year.json'))
    elif mission == 'test':
        whole_author_profile = load_json(WHOLE_AUTHOR_PROFILE_PATH)
    else:
        raise ValueError("check mission value")
    whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    name2aids = {}
    aid2pids = {}
    aids = []
    names = []
    pids_with_index = []
    for aid in tqdm.tqdm(whole_author_profile):
        aids.append(aid)
        names.append(whole_author_profile[aid]['name'])
        pids = whole_author_profile[aid]['papers']
        tmp = []
        for paper in pids:
            paper_authors = whole_author_profile_pub[paper]['authors']
            author_names = [clean_name(item['name']) for item in paper_authors]
            # print(author_names)
            # Position of this author inside the paper's author list;
            # papers are keyed as '<pid>-<position>' from here on.
            index = get_name_index(names[-1], author_names)
            tmp.append('%s-%d' % (paper, index))
        pids_with_index.append(tmp)
    # aids, names and pids_with_index are parallel lists.
    assert len(aids) == len(names)
    assert len(names) == len(pids_with_index)
    print('all aids num: ', len(aids))
    name_set = set(names)
    names_array = np.array(names)
    aids_array = np.array(aids)
    # A name can map to several aids (homonyms): boolean-mask selection.
    for name in name_set:
        target_aid = aids_array[names_array == name]
        name2aids[name] = target_aid
    for aid, pid in zip(aids, pids_with_index):
        aid2pids[aid] = pid
    if mission == 'train':
        save_pickle(name2aids, os.path.join(NEW_DATA_V2_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        save_pickle(name2aids, os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    # ------------------------------------------
    # save format: aid2year --> {aid: {min: xxx, max: xxx, mean: xxx, median: xxx, min_max_avg: xxx, year_list: [year, ...]}}
    # Reload the pickle just written (round-trip keeps the rest of this
    # function identical whether or not the first stage ran here).
    if mission == 'train':
        aid2pids = load_pickle(os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2year = {}
    print('Process year info ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        all_years = []
        for pid_with_index in pids:
            pid = pid_with_index.split('-')[0]
            year = whole_author_profile_pub[pid].get('year', '0')
            # Years may arrive as '' or as strings; 0 marks "unknown" and
            # implausible values (outside 1500..2100) are treated as unknown.
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            all_years.append(year)
        all_years = np.array(all_years)
        all_years = all_years[all_years != 0]  # keep valid years only
        if len(all_years) == 0:
            year_info = None  # downstream must handle missing year stats
        else:
            year_info = {
                'min': np.min(all_years),
                'max': np.max(all_years),
                'mean': np.mean(all_years),
                'min_max_avg': (np.min(all_years) + np.max(all_years)) / 2,
                'median': np.median(all_years),
                'year_list': all_years,
            }
        aid2year[aid] = year_info
    if mission == 'train':
        save_pickle(aid2year, os.path.join(NEW_DATA_V2_DIR, 'aid2year.pkl'))
    elif mission == 'test':
        save_pickle(aid2year, os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    # ------------------------------------------
    # save format: aid2coauthor --> {aid: {anuthor-name: count, ...}}
    aid2coauthor = {}
    print('aid2coauthor processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            # Remove the author himself so only co-authors are counted.
            authors_name.pop(int(index))
            for name in authors_name:
                inner_dict[name] += 1
        aid2coauthor[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2coauthor,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2coauthor.pkl'))
    elif mission == 'test':
        save_pickle(aid2coauthor,
                    os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    # ------------------------------------------
    # save format: aid2venue --> {aid: {venue-name: count ...}}
    aid2venue = {}
    print('aid2venue processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid]['venue'].lower()
            if venue != '':
                # aid2venue[aid].add(venue)
                inner_dict[venue] += 1
        aid2venue[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2venue, os.path.join(NEW_DATA_V2_DIR, 'aid2venue.pkl'))
    elif mission == 'test':
        save_pickle(aid2venue, os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    # ------------------------------------------
    # save format: aid2keywords --> {aid: {keyword: count, ...}}
    aid2keywords = {}
    print('aid2keywords processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    # aid2keywords[aid].add(keyword.lower())
                    inner_dict[keyword] += 1
        aid2keywords[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2keywords,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywords.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywords,
                    os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    # ------------------------------------------
    # save format: aid2orgset--> {aid: set{org_word, org_word, ...}}
    aid2orgset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            author = whole_author_profile_pub[pid].get('authors')[int(index)]
            org = author.get('org', '').lower().strip()
            # Bag of whitespace-separated org tokens across all papers.
            org_set = set(org.split())
            inner_set = inner_set | org_set
        aid2orgset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2orgset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2orgset.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgset, os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    # ------------------------------------------
    # save format: aid2venueset--> {aid: set{venue_word, venue_word, ...}}
    aid2venueset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid].get('venue', '').lower()
            if venue == '':
                continue
            else:
                # Hyphens are treated as word separators before tokenising.
                venue_set = set(venue.replace('-', ' ').split())
                inner_set = inner_set | venue_set
        aid2venueset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2venueset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2venueset.pkl'))
    elif mission == 'test':
        save_pickle(aid2venueset,
                    os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    # ------------------------------------------
    # save format: aid2keywordsset--> {aid: set{key_word, key_word, ...}}
    aid2keywordsset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    keyword_set = set(keyword.lower().replace('-', ' ').split())
                    inner_set = inner_set | keyword_set
        aid2keywordsset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywordsset.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))
    # ------------------------------------------
    # save format: aid2orgwithyear --> {aid: [(org, year), () ...]}
    aid2orgwithyear = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_list = []
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            auhtors = whole_author_profile_pub[pid]['authors']
            org = auhtors[int(index)].get('org', '').lower()
            year = whole_author_profile_pub[pid].get('year', '0')
            # Same year-sanitising rule as the aid2year stage above.
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            inner_list.append((org, year))
        aid2orgwithyear[aid] = inner_list
    if mission == 'train':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    # ------------------------------------------
    # save format aid2yearinfo --> {aid: {year: {
    #     orgs: [org, ....],
    #     venues: [venues, ...],
    #     keywords: [keyword, ...],
    #     coauthors: [author-name, ...],
    # }}}
    aid2yearinfo = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        inner_dict = {}
        pids = aid2pids[aid]
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            org = [authors[int(index)].get('org', '').lower()]
            # Drop the author himself; the rest are co-authors.
            authors_name.pop(int(index))
            coauthor = authors_name
            venue = [whole_author_profile_pub[pid].get('venue', '').lower()]
            keywords = whole_author_profile_pub[pid].get('keywords', [''])
            if len(keywords) == 0:
                keywords = ['']
            keywords = [keyword.lower() for keyword in keywords]
            tmp_dict = {
                'orgs': org,
                'venues': venue,
                'keywords': keywords,
                'coauthors': coauthor,
            }
            # Merge into the existing bucket for this year, else start one.
            if year in inner_dict.keys():
                for key in tmp_dict:
                    inner_dict[year][key].extend(tmp_dict[key])
            else:
                inner_dict[year] = tmp_dict
        aid2yearinfo[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2yearinfo,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    elif mission == 'test':
        save_pickle(aid2yearinfo,
                    os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    texttovec = TextToVec()
    # ------------------------------------------
    # save format: aid2titlevec --> {aid: [mean value]}
    aid2titlevec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            title = whole_author_profile_pub[pid]['title']
            inner_list.append(texttovec.get_vec(title))
        if len(inner_list) == 0:
            # No papers: zero vector (300-d, matching TextToVec output).
            aid2titlevec[aid] = np.zeros(300)
        else:
            aid2titlevec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
    elif mission == 'test':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
    # ------------------------------------------
    # save format: aid2abstractvec --> {aid: [mean value]}
    aid2abstractvec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            abstract = whole_author_profile_pub[pid].get('abstract')
            if abstract is None:
                continue  # papers without an abstract are skipped
            inner_list.append(texttovec.get_vec(abstract))
        if len(inner_list) == 0:
            aid2abstractvec[aid] = np.zeros(300)
        else:
            aid2abstractvec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2abstractvec.pkl'))
    elif mission == 'test':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
def gen_title_feature():
    """Create title-embedding pairs for every unassigned test paper and
    convert them into distances with a trained checkpoint.

    Stage 1 pairs each candidate author's mean title vector with the
    paper's title vector and pickles the result; stage 2 reloads that
    pickle, flattens all pairs in test_unass order and feeds them to
    emb_pair_to_distance with the 'tm.title.1.checkpoint.pth' model.
    """
    aid2titlevec = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()
    # Manually curated name corrections (original -> replacement); the
    # second map takes precedence over the first below.
    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]
    all_authors_name = list(name2aids.keys())
    # test_title_feature --> {pid-with-index: {candidate-aids: [...], data: [(emb0, meb1), ...]}}
    test_title_feature = {}
    print('Gen title emb pair ...')
    for pid_with_index in tqdm.tqdm(test_unass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        # Hand-corrected names for two known-problematic records.
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'
        # Resolution order: map2 override, then map1, else fuzzy index
        # lookup against the known author-name list.
        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            index = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[index]
        # A mapped entry may be one name or a list of alias names.
        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name !!!")
        inner_dict['candidate-aids'] = candidate_aids
        info = test_pub[now_pid].get('title')
        if info is None:
            emb = np.zeros(300)  # missing title: zero vector placeholder
        else:
            emb = texttovec.get_vec(info)
        data = []
        for aid in candidate_aids:
            emb_pair = (aid2titlevec[aid], emb)
            data.append(emb_pair)
        inner_dict['data'] = data
        test_title_feature[pid_with_index] = inner_dict
    save_pickle(
        test_title_feature,
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))
    print('Gen title distance ...')
    # Round-trip through the pickle just written, then flatten all pairs
    # in test_unass order for the distance model.
    test_title_emb_pair = load_pickle(
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))
    test_unass = load_json(TEST_UNASS_PATH)
    title_emb_pair = []
    for pid_with_index in tqdm.tqdm(test_unass):
        for pair in test_title_emb_pair[pid_with_index]['data']:
            title_emb_pair.append(pair)
    emb_pair_to_distance(
        'tm.title.1.checkpoint.pth', 'title', title_emb_pair,
        os.path.join(TEST_FEATURE_DIR,
                     'test-title-distance-df-name-clean-2.pkl'))
def gen_base_feature(index, multi_size):
    """Build base test features for the index-th of multi_size shards.

    The unassigned test list is split into multi_size contiguous shards;
    this worker processes test_unass[start:end] for its shard and returns
    the resulting feature dict (it does not write to disk).

    Args:
        index: shard number in [0, multi_size).
        multi_size: total number of parallel shards.

    Returns:
        {pid-with-index: {candidate-aids: [...], data: np.ndarray}}
    """
    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    # aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))
    # Manually curated name corrections (original -> replacement); the
    # second map takes precedence over the first below.
    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]
    single_range = math.ceil(len(test_unass) / multi_size)
    start = index * single_range
    end = (index + 1) * single_range if (index + 1) * single_range < len(
        test_unass) else len(test_unass)
    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    print('Gen test features ...')
    for pid_with_index in tqdm.tqdm(test_unass[start:end]):
        inner_dict = {}
        # BUG FIX: the original reused the function parameter `index` as
        # the loop-local author position and again as the name index,
        # clobbering the shard number; the locals are renamed so the
        # parameter stays intact (behavior of this run is unchanged
        # because start/end were already computed).
        now_pid, author_pos = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(author_pos)]['name']
        author_name = clean_name(author_name)
        # Hand-corrected names for two known-problematic records.
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'
        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            name_pos = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[name_pos]
        # A mapped entry may be one name or a list of alias names.
        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name ! ! !")
        inner_dict['candidate-aids'] = candidate_aids
        # Paper-side info is identical for every candidate aid: hoisted.
        pid_info_dict = test_pub[now_pid]
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            org_info_set = aid2orgset[aid]
            aid_venue_set = aid2venueset[aid]
            aid_keywords_set = aid2keywordsset[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict, org_info_set,
                             aid_venue_set, aid_keywords_set))
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
    # save_pickle(testdatafeatures, os.path.join(TEST_FEATURE_DIR, 'u6uRzaff-5.pkl'))
    return testdatafeatures