# Example #1
def gen_one_test_feature():
    """Build feature rows for a single unassigned paper (debug helper).

    Processes only the first pid in VALID_UNASS_PATH (note the ``break``)
    and scores it against *every* known author id rather than just the
    aids matching the author's name, then saves
    testdatafeatures --> {pid-with-index: {candidate-aids: [...],
    data: [[xxx], [xxx], ...]}} to ``./testdatafeatures_one.pkl``.
    """
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    aid2yearinfo = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))

    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    # Flatten every author id into one candidate pool.
    all_aids = []
    for aids in name2aids.values():
        all_aids.extend(aids.tolist())
    all_aids = np.array(all_aids)
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        # Snap the cleaned name to the closest known author name.
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]

        # Debug variant: consider ALL aids, not just name2aids[author_name].
        candidate_aids = all_aids
        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            pid_info_dict = valid_pub[now_pid]
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict))
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
        # Intentionally stop after the first paper (this is a one-shot
        # debug routine).
        break
    save_pickle(testdatafeatures, './testdatafeatures_one.pkl')
# Example #2
def gen_test_feature():
    """Build feature rows (plus a paper-count ratio) for every unassigned
    validation paper and pickle the result.

    Output layout: testdatafeatures -->
    {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], ...]}}
    """
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    # Total number of assigned papers; used to normalise per-author counts.
    total_papers = sum(len(pids) for pids in aid2pids.values())
    known_names = list(name2aids.keys())
    testdatafeatures = {}
    for pid_with_index in tqdm.tqdm(valid_nuass):
        paper_id, author_pos = pid_with_index.split('-')
        raw_name = valid_pub[paper_id]['authors'][int(author_pos)]['name']
        cleaned = clean_name(raw_name)
        # Snap the cleaned name to the closest known author name.
        matched = known_names[get_name_index(cleaned, known_names)]

        candidates = name2aids[matched]
        rows = []
        for aid in candidates:
            row = get_features((aid, pid_with_index), valid_pub[paper_id],
                               aid2coauthor[aid], aid2year[aid],
                               aid2venue[aid], aid2orgwithyear[aid],
                               aid2keywords[aid], aid2yearinfo[aid],
                               aid2orgset[aid], aid2venueset[aid],
                               aid2keywordsset[aid])
            # Extra feature: this author's share of all assigned papers.
            row.append(len(aid2pids[aid]) / total_papers)
            rows.append(row)
        testdatafeatures[pid_with_index] = {
            'candidate-aids': candidates,
            'data': np.array(rows),
        }
    save_pickle(testdatafeatures, os.path.join(TEST_FEATURE_DIR_V2, 'testdatafeatures-withsetinfo-papercount.pkl'))
# Example #3
def gen_test_title_abstract_vec(mission='title'):
    """Build (author-embedding, paper-embedding) pairs per candidate aid.

    Args:
        mission: which text field of the paper to embed, 'title' or
            'abstract'; also selects the matching per-author mean-vector
            pickle.

    Saves test_cate_feature --> {pid-with-index: {candidate-aids: [...],
    data: [(aid_emb, paper_emb), ...]}} into TEST_FEATURE_DIR_V2.

    Raises:
        ValueError: if mission is neither 'title' nor 'abstract'.
    """
    if mission == 'title':
        aid2cate = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
    elif mission == 'abstract':
        aid2cate = load_pickle(
            os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
    else:
        raise ValueError('mission value error')

    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()

    all_authors_name = list(name2aids.keys())
    test_cate_feature = {}
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]

        candidate_aids = name2aids[author_name]
        inner_dict['candidate-aids'] = candidate_aids
        # The paper's embedding is identical for every candidate aid, so
        # compute it once per paper instead of once per candidate
        # (the original re-embedded the same text inside the aid loop).
        info = valid_pub[now_pid].get(mission)
        if info is None:
            emb = np.zeros(300)
        else:
            emb = texttovec.get_vec(info)
        inner_dict['data'] = [(aid2cate[aid], emb) for aid in candidate_aids]
        test_cate_feature[pid_with_index] = inner_dict
    save_pickle(
        test_cate_feature,
        os.path.join(TEST_FEATURE_DIR_V2, 'test-%s-emb-pair.pkl' % mission))
# Example #4
def check_name():
    """Map each problem pid's author name onto the known-name list and
    dump the (pid, raw, cleaned, matched) tuples for manual review."""
    problem_pids = load_json(os.path.join(FINAL_DIR, 'problem.pids.3.json'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    test_pub = load_json(TEST_PUB_PATH)
    known_names = list(name2aids.keys())

    # Two hand-verified overrides for known-bad entries.
    overrides = {'ToCcabLT-1': 'junliang_wang', 'cVvvcFzj-1': 'xiaojun_liu'}

    mapping = []
    for pid_with_index in tqdm.tqdm(problem_pids):
        paper_id, author_pos = pid_with_index.split('-')
        raw_name = test_pub[paper_id]['authors'][int(author_pos)]['name']
        cleaned = overrides.get(pid_with_index, clean_name(raw_name))
        matched = known_names[get_name_index(cleaned, known_names)]
        mapping.append((pid_with_index, raw_name, cleaned, matched))
    mapping = list(set(mapping))
    print(len(mapping))
    save_json(mapping, os.path.join(FINAL_DIR, 'name.different.3.json'))
# Example #5
def get_triplet_corpus(mission='train'):
    """Build name->aids and aid->pids-with-index maps plus mean title vectors.

    For every author in the whole profile, resolve the author's position
    in each of their papers' author lists, producing 'pid-index' keys.
    name2aids, aid2pids and aid2titlevec are pickled into NEW_DATA_V2_DIR
    when mission == 'train' and NEW_DATA_DIR when mission == 'test'
    (any other value computes everything but saves nothing, matching the
    original behavior).
    """
    whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    whole_author_profile = load_json(WHOLE_AUTHOR_PROFILE_PATH)
    name2aids = {}
    aid2pids = {}
    aids = []
    names = []
    pids_with_index = []
    for aid in tqdm.tqdm(whole_author_profile):
        aids.append(aid)
        names.append(whole_author_profile[aid]['name'])
        tmp = []
        for paper in whole_author_profile[aid]['papers']:
            paper_authors = whole_author_profile_pub[paper]['authors']
            author_names = [clean_name(item['name']) for item in paper_authors]
            # locate this author's position within the paper's author list
            index = get_name_index(names[-1], author_names)
            tmp.append('%s-%d' % (paper, index))
        pids_with_index.append(tmp)
    assert len(aids) == len(names)
    assert len(names) == len(pids_with_index)
    print('all aids num: ', len(aids))
    names_array = np.array(names)
    aids_array = np.array(aids)
    # A name can map to several aids (homonymous authors).
    for name in set(names):
        name2aids[name] = aids_array[names_array == name]
    for aid, pid in zip(aids, pids_with_index):
        aid2pids[aid] = pid
    if mission == 'train':
        save_pickle(name2aids, os.path.join(NEW_DATA_V2_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        save_pickle(name2aids, os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))

    # The in-memory aid2pids is identical to the pickle just written, so
    # the original save-then-reload disk round-trip was dropped.
    texttovec = TextToVec()
    # ------------------------------------------
    # save format: aid2titlevec --> {aid: [mean value]}
    aid2titlevec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        inner_list = []
        for pid_with_index in aid2pids[aid]:
            pid, index = pid_with_index.split('-')
            title = whole_author_profile_pub[pid]['title']
            inner_list.append(texttovec.get_vec(title))
        if len(inner_list) == 0:
            # No papers: fall back to a zero embedding.
            aid2titlevec[aid] = np.zeros(300)
        else:
            aid2titlevec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
    elif mission == 'test':
        save_pickle(aid2titlevec, os.path.join(NEW_DATA_DIR,
                                               'aid2titlevec.pkl'))
# Example #6
 def get_pid_with_index(whole_author_profile_pub, pid, name):
     """Return '<pid>-<i>' where i is *name*'s position in the paper's
     author list (after cleaning every author name)."""
     cleaned_names = [clean_name(author['name'])
                      for author in whole_author_profile_pub[pid]['authors']]
     return '%s-%d' % (pid, get_name_index(name, cleaned_names))
# Example #7
def preprocessing(mission='train'):
    """Derive and pickle every per-author lookup table used downstream.

    Depending on *mission*:
      - 'train': profiles come from the last-1-year split and all tables
        are saved into NEW_DATA_V2_DIR;
      - 'test': profiles come from the whole author profile and tables
        are saved into NEW_DATA_DIR.

    Tables produced per author id (aid): pid list with author index, year
    statistics, co-author counts, venue counts, keyword counts,
    org/venue/keyword word-sets, (org, year) pairs, per-year aggregated
    info, and mean title/abstract embeddings.

    Raises:
        ValueError: if mission is neither 'train' nor 'test'.
    """
    # os.makedirs(NEW_DATA_DIR, exist_ok=True)

    # ------------------------------------------
    # process whole_author_profile.json, add index, and save to pickle
    # save format: name2aids --> {name: [aids, ...]}, aid2pids --> {aid: [pid-index, ...]}
    os.makedirs(NEW_DATA_DIR, exist_ok=True)
    os.makedirs(NEW_DATA_V2_DIR, exist_ok=True)
    if mission == 'train':
        whole_author_profile = load_json(
            os.path.join(SPLIT_DIR, 'train_profile-last1year.json'))
    elif mission == 'test':
        whole_author_profile = load_json(WHOLE_AUTHOR_PROFILE_PATH)
    else:
        raise ValueError("check mission value")
    whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    name2aids = {}
    aid2pids = {}
    aids = []
    names = []
    pids_with_index = []
    for aid in tqdm.tqdm(whole_author_profile):
        aids.append(aid)
        names.append(whole_author_profile[aid]['name'])
        pids = whole_author_profile[aid]['papers']
        tmp = []
        for paper in pids:
            paper_authors = whole_author_profile_pub[paper]['authors']
            author_names = [clean_name(item['name']) for item in paper_authors]
            # locate this author's position within the paper's author list
            index = get_name_index(names[-1], author_names)
            tmp.append('%s-%d' % (paper, index))
        pids_with_index.append(tmp)
    assert len(aids) == len(names)
    assert len(names) == len(pids_with_index)
    print('all aids num: ', len(aids))
    name_set = set(names)
    names_array = np.array(names)
    aids_array = np.array(aids)
    # a name can map to several aids (homonymous authors)
    for name in name_set:
        target_aid = aids_array[names_array == name]
        name2aids[name] = target_aid
    for aid, pid in zip(aids, pids_with_index):
        aid2pids[aid] = pid
    if mission == 'train':
        save_pickle(name2aids, os.path.join(NEW_DATA_V2_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        save_pickle(name2aids, os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))

    # ------------------------------------------
    # save format: aid2year --> {aid: {min: xxx, max: xxx, mean: xxx, median: xxx, min_max_avg: xxx, year_list: [year, ...]}}
    if mission == 'train':
        aid2pids = load_pickle(os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2year = {}
    print('Process year info ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        all_years = []
        for pid_with_index in pids:
            pid = pid_with_index.split('-')[0]
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            # years outside a plausible range are treated as missing (0)
            if any([year < 1500, year > 2100]):
                year = 0
            all_years.append(year)
        all_years = np.array(all_years)
        all_years = all_years[all_years != 0]
        if len(all_years) == 0:
            # no valid years at all for this author
            year_info = None
        else:
            year_info = {
                'min': np.min(all_years),
                'max': np.max(all_years),
                'mean': np.mean(all_years),
                'min_max_avg': (np.min(all_years) + np.max(all_years)) / 2,
                'median': np.median(all_years),
                'year_list': all_years,
            }
        aid2year[aid] = year_info
    if mission == 'train':
        save_pickle(aid2year, os.path.join(NEW_DATA_V2_DIR, 'aid2year.pkl'))
    elif mission == 'test':
        save_pickle(aid2year, os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))

    # ------------------------------------------
    # save format: aid2coauthor --> {aid: {author-name: count, ...}}
    aid2coauthor = {}
    print('aid2coauthor processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            # drop the author himself, keep only co-authors
            authors_name.pop(int(index))
            for name in authors_name:
                inner_dict[name] += 1
        aid2coauthor[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2coauthor,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2coauthor.pkl'))
    elif mission == 'test':
        save_pickle(aid2coauthor, os.path.join(NEW_DATA_DIR,
                                               'aid2coauthor.pkl'))

    # ------------------------------------------
    # save format: aid2venue --> {aid: {venue-name: count ...}}
    aid2venue = {}
    print('aid2venue processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid]['venue'].lower()
            if venue != '':
                inner_dict[venue] += 1
        aid2venue[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2venue, os.path.join(NEW_DATA_V2_DIR, 'aid2venue.pkl'))
    elif mission == 'test':
        save_pickle(aid2venue, os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))

    # ------------------------------------------
    # save format: aid2keywords --> {aid: {keyword: count, ...}}
    aid2keywords = {}
    print('aid2keywords processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    inner_dict[keyword] += 1
        aid2keywords[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2keywords,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywords.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywords, os.path.join(NEW_DATA_DIR,
                                               'aid2keywords.pkl'))

    # ------------------------------------------
    # save format: aid2orgset--> {aid: set{org_word, org_word, ...}}
    # bag-of-words over the author's own org strings
    aid2orgset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            author = whole_author_profile_pub[pid].get('authors')[int(index)]
            org = author.get('org', '').lower().strip()
            org_set = set(org.split())
            inner_set = inner_set | org_set
        aid2orgset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2orgset, os.path.join(NEW_DATA_V2_DIR,
                                             'aid2orgset.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgset, os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))

    # ------------------------------------------
    # save format: aid2venueset--> {aid: set{venue_word, venue_word, ...}}
    aid2venueset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid].get('venue', '').lower()
            if venue == '':
                continue
            else:
                # hyphenated words are split before set-union
                venue_set = set(venue.replace('-', ' ').split())
                inner_set = inner_set | venue_set
        aid2venueset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2venueset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2venueset.pkl'))
    elif mission == 'test':
        save_pickle(aid2venueset, os.path.join(NEW_DATA_DIR,
                                               'aid2venueset.pkl'))

    # ------------------------------------------
    # save format: aid2keywordsset--> {aid: set{key_word, key_word, ...}}
    aid2keywordsset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    keyword_set = set(keyword.lower().replace('-',
                                                              ' ').split())
                    inner_set = inner_set | keyword_set
        aid2keywordsset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywordsset.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    # ------------------------------------------
    # save format: aid2orgwithyear --> {aid: [(org, year), () ...]}
    aid2orgwithyear = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_list = []
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            auhtors = whole_author_profile_pub[pid]['authors']
            org = auhtors[int(index)].get('org', '').lower()
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            # same plausibility window as the aid2year section
            if any([year < 1500, year > 2100]):
                year = 0
            inner_list.append((org, year))
        aid2orgwithyear[aid] = inner_list
    if mission == 'train':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))

    # ------------------------------------------
    # save format aid2yearinfo --> {aid: {year: {
    #                                            orgs: [org, ....],
    #                                            venues: [venues, ...],
    #                                            keywords: [keyword, ...],
    #                                            coauthors: [author-name, ...],
    #                                            }}}
    aid2yearinfo = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        inner_dict = {}
        pids = aid2pids[aid]
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            org = [authors[int(index)].get('org', '').lower()]
            # everyone except the author himself
            authors_name.pop(int(index))
            coauthor = authors_name
            venue = [whole_author_profile_pub[pid].get('venue', '').lower()]
            keywords = whole_author_profile_pub[pid].get('keywords', [''])
            if len(keywords) == 0:
                keywords = ['']
            keywords = [keyword.lower() for keyword in keywords]
            tmp_dict = {
                'orgs': org,
                'venues': venue,
                'keywords': keywords,
                'coauthors': coauthor,
            }
            # merge into any existing record for the same year
            if year in inner_dict.keys():
                for key in tmp_dict:
                    inner_dict[year][key].extend(tmp_dict[key])
            else:
                inner_dict[year] = tmp_dict
        aid2yearinfo[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2yearinfo,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    elif mission == 'test':
        save_pickle(aid2yearinfo, os.path.join(NEW_DATA_DIR,
                                               'aid2yearinfo.pkl'))

    texttovec = TextToVec()
    # ------------------------------------------
    # save format: aid2titlevec --> {aid: [mean value]}
    aid2titlevec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            title = whole_author_profile_pub[pid]['title']
            inner_list.append(texttovec.get_vec(title))
        if len(inner_list) == 0:
            # no papers: zero embedding placeholder
            aid2titlevec[aid] = np.zeros(300)
        else:
            aid2titlevec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
    elif mission == 'test':
        save_pickle(aid2titlevec, os.path.join(NEW_DATA_DIR,
                                               'aid2titlevec.pkl'))

    # ------------------------------------------
    # save format: aid2abstractvec --> {aid: [mean value]}
    aid2abstractvec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            abstract = whole_author_profile_pub[pid].get('abstract')
            if abstract is None:
                # papers without an abstract are simply skipped
                continue
            inner_list.append(texttovec.get_vec(abstract))
        if len(inner_list) == 0:
            aid2abstractvec[aid] = np.zeros(300)
        else:
            aid2abstractvec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2abstractvec.pkl'))
    elif mission == 'test':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
# Example #8
def gen_title_feature():
    """Generate title-embedding pairs and distances for the test set.

    Phase 1: for every unassigned test paper, pair each candidate
    author's mean title embedding with the paper's title embedding and
    pickle the pairs. Phase 2: reload the pairs and convert them into
    distances with a trained checkpoint via emb_pair_to_distance.
    """
    aid2titlevec = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))

    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()

    # Manually curated name-correction tables (original -> corrected).
    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]

    all_authors_name = list(name2aids.keys())
    # test_title_feature --> {pid-with-index: {candidate-aids: [...], data: [(emb0, emb1), ...]}}
    test_title_feature = {}
    print('Gen title emb pair ...')
    for pid_with_index in tqdm.tqdm(test_unass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        # hand-verified fixes for two known-bad entries
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'

        # Correction precedence matters: name_map2 first, then name_map,
        # then nearest-name lookup as the fallback.
        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            index = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[index]

        # A corrected entry may be a single name or a list of names.
        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name !!!")

        inner_dict['candidate-aids'] = candidate_aids
        info = test_pub[now_pid].get('title')
        if info is None:
            # missing title: zero embedding placeholder
            emb = np.zeros(300)
        else:
            emb = texttovec.get_vec(info)
        data = []
        for aid in candidate_aids:
            emb_pair = (aid2titlevec[aid], emb)
            data.append(emb_pair)
        inner_dict['data'] = data
        test_title_feature[pid_with_index] = inner_dict
    save_pickle(
        test_title_feature,
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))

    print('Gen title distance ...')
    test_title_emb_pair = load_pickle(
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))
    test_unass = load_json(TEST_UNASS_PATH)
    title_emb_pair = []
    for pid_with_index in tqdm.tqdm(test_unass):
        for pair in test_title_emb_pair[pid_with_index]['data']:
            title_emb_pair.append(pair)
    emb_pair_to_distance(
        'tm.title.1.checkpoint.pth', 'title', title_emb_pair,
        os.path.join(TEST_FEATURE_DIR,
                     'test-title-distance-df-name-clean-2.pkl'))
# Example #9
def gen_base_feature(index, multi_size):
    """Compute base features for one shard of the unassigned test papers.

    The test set is split into *multi_size* contiguous shards; this
    worker processes shard number *index* (0-based).

    Returns:
        testdatafeatures --> {pid-with-index: {candidate-aids: [...],
        data: np.array of feature rows}}.
    """
    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    # Manually curated name-correction tables (original -> corrected).
    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]

    # Shard boundaries for this worker.
    single_range = math.ceil(len(test_unass) / multi_size)
    start = index * single_range
    end = min((index + 1) * single_range, len(test_unass))

    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    print('Gen test features ...')
    for pid_with_index in tqdm.tqdm(test_unass[start:end]):
        inner_dict = {}
        # Use a dedicated name for the author position so the shard
        # parameter `index` is not shadowed (the original reassigned it).
        now_pid, author_pos = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(author_pos)]['name']
        author_name = clean_name(author_name)
        # Hand-verified fixes for two known-bad entries.
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'

        # Correction precedence: name_map2 first, then name_map, then
        # nearest-name lookup as the fallback.
        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            name_index = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[name_index]

        # A corrected entry may be a single name or a list of names.
        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name ! ! !")

        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            data.append(
                get_features(new_pair, test_pub[now_pid], aid2coauthor[aid],
                             aid2year[aid], aid2venue[aid],
                             aid2orgwithyear[aid], aid2keywords[aid],
                             aid2yearinfo[aid], aid2orgset[aid],
                             aid2venueset[aid], aid2keywordsset[aid]))
        inner_dict['data'] = np.array(data)
        testdatafeatures[pid_with_index] = inner_dict
    return testdatafeatures