def stringDistance_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author):
    """Compare the author's profile strings with the strings recorded for this paper.

    The name/affiliation stored under the "paperId|authorId" key of
    ``dict_paperIdAuthorId_to_name_aff`` is compared with the matching row
    of ``Author``.  For the name pair and then the affiliation pair, three
    values are produced in order: longest-common-subsequence length,
    longest-common-substring length and Levenshtein distance.

    ``dict_coauthor`` and ``PaperAuthor`` are not used by this feature.

    Returns whatever ``util.get_feature_by_list`` builds from the six values.
    """
    author_id = AuthorIdPaperId.authorId
    lookup_key = "%s|%s" % (AuthorIdPaperId.paperId, author_id)
    paper_name = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["name"])
    paper_aff = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["affiliation"])

    # First row of Author matching the id; columns 1 and 2 are presumably
    # Name and Affiliation -- TODO confirm against the Author schema.
    author_row = list(Author[Author["Id"] == int(author_id)].values)[0]
    # A missing value stringifies to "nan"; treat it as an empty string.
    profile_name = "" if str(author_row[1]) == "nan" else str(author_row[1])
    profile_aff = "" if str(author_row[2]) == "nan" else str(author_row[2])

    features = []
    for profile_text, paper_text in ((profile_name, paper_name), (profile_aff, paper_aff)):
        features.append(len(longest_common_subsequence(profile_text, paper_text)))
        features.append(len(longest_common_substring(profile_text, paper_text)))
        features.append(Levenshtein_distance(profile_text, paper_text))
    return util.get_feature_by_list(features)
def keywords_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
               PaperAuthor, Author, Paper, dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Score the overlap between this paper's words and the author's known keywords.

    The paper's title and keyword strings are joined and tokenised with
    ``util.get_string_splited``; the tokens are then compared with
    ``dict_author_keywords[authorId]`` (a word -> weight mapping).

    Two values are produced:
      * the number of distinct tokens that are also keys of that mapping;
      * the sum of the mapping's weights over every token occurrence
        (repeated tokens are counted repeatedly).

    A paper with no row in ``Paper`` contributes a blank title and blank
    keywords instead of raising IndexError.

    Returns whatever ``util.get_feature_by_list`` builds from the two values.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # Word -> weight mapping built from the author's earlier papers.
    former_keyword_weights = dict_author_keywords[authorId]

    # Filter Paper once and read both columns from the same rows.
    paper_rows = Paper[Paper["Id"] == int(paperId)]
    titles = paper_rows["Title"].values
    keyword_values = paper_rows["Keyword"].values

    title = str(titles[0]) if len(titles) else ' '
    # Guard the keyword column like the title: the original indexed [0]
    # unconditionally, which made the title guard unreachable in practice.
    keywords = str(keyword_values[0]) if len(keyword_values) else ' '
    if keywords == "nan":
        # A missing keyword cell stringifies to "nan".
        keywords = ' '

    curr_keywords = util.get_string_splited(title + " " + keywords)

    # Number of distinct shared words.
    nums = len(set(curr_keywords) & set(former_keyword_weights.keys()))
    # Weighted score over every occurrence of a known word.
    score = sum(
        former_keyword_weights[word]
        for word in curr_keywords
        if word in former_keyword_weights
    )
    return util.get_feature_by_list([nums, score])
def journal_conference_year(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor,
                            Author, Paper, Conference, Journal):
    """Emit the paper's conference id, journal id and publication year as raw features.

    Each of the three values falls back to 0 when it is unavailable:
      * conference / journal id: no matching row in ``Paper`` or id <= 0;
      * year: no matching row in ``Paper`` or year outside 1800..2013.

    Only ``AuthorIdPaperId.paperId`` and ``Paper`` are read; the remaining
    parameters are unused here.

    Returns whatever ``util.get_feature_by_list`` builds from the three values.
    """
    paperId = int(AuthorIdPaperId.paperId)
    paper_rows = Paper[Paper['Id'] == paperId]

    def _positive_or_zero(values):
        # First value as int when present and positive, otherwise 0.
        if len(values) > 0 and int(values[0]) > 0:
            return int(values[0])
        return 0

    # The original indexed ['Year'].values[0] unconditionally, so a missing
    # paper raised IndexError before the id guards could apply.
    year_values = paper_rows['Year'].values
    paper_year = int(year_values[0]) if len(year_values) > 0 else 0

    feat_list = [
        _positive_or_zero(paper_rows['ConferenceId'].values),
        _positive_or_zero(paper_rows['JournalId'].values),
        paper_year if 1800 <= paper_year <= 2013 else 0,
    ]
    return util.get_feature_by_list(feat_list)
def stringDistance_2(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Average string-similarity features over every "##"-separated variant.

    The name and affiliation stored under the "paperId|authorId" key of
    ``dict_paperIdAuthorId_to_name_aff`` may each contain several variants
    joined by "##".  Every variant is compared with the author's profile
    value from ``Author`` and the per-variant results are averaged.

    Six values are produced (name first, then affiliation): mean
    longest-common-subsequence length, mean longest-common-substring length
    and mean Jaro-Winkler score.  Commented-out lines in the original show
    Levenshtein and Jaccard were tried in place of Jaro-Winkler.

    Returns whatever ``util.get_feature_by_list`` builds from the six values.
    """
    def _mean_scores(profile_text, joined_variants):
        # One (subsequence, substring, jaro-winkler) triple per variant.
        per_variant = [
            (
                len(longest_common_subsequence(profile_text, variant)),
                len(longest_common_substring(profile_text, variant)),
                textdistance.JaroWinkler()(profile_text, variant),
            )
            for variant in joined_variants.split("##")
        ]
        # str.split always yields at least one piece, so there are always
        # exactly three columns to average.
        return [np.mean(column) for column in zip(*per_variant)]

    author_id = AuthorIdPaperId.authorId
    lookup_key = "%s|%s" % (AuthorIdPaperId.paperId, author_id)
    paper_names = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["name"])
    paper_affs = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["affiliation"])

    # Columns 1 and 2 are presumably Name and Affiliation -- TODO confirm.
    author_row = list(Author[Author["Id"] == int(author_id)].values)[0]
    # A missing value stringifies to "nan"; treat it as an empty string.
    profile_name = "" if str(author_row[1]) == "nan" else str(author_row[1])
    profile_aff = "" if str(author_row[2]) == "nan" else str(author_row[2])

    features = []
    features += _mean_scores(profile_name, paper_names)
    features += _mean_scores(profile_aff, paper_affs)
    return util.get_feature_by_list(features)
def publication_year(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Place the paper's year inside the span of years the author has published in.

    Years are collected for every paper that ``PaperAuthor`` lists for the
    author, keeping only those found in ``Paper`` with a year in 1800..2013.

    The result is ``[0, 0, 0]`` when no such year exists; otherwise
    ``[1, paper_year - earliest, latest - paper_year]``.  The candidate
    paper's own year is used as-is (it is not range-checked).

    Returns whatever ``util.get_feature_by_list`` builds from the three values.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    # Year of the candidate paper.
    paper_year = Paper[Paper['Id'] == int(paper_id)]['Year'].values[0]

    # Ids of every paper attributed to the author.
    author_paper_ids = PaperAuthor[PaperAuthor['AuthorId'] == int(author_id)]['PaperId'].values

    known_years = []
    for pid in author_paper_ids:
        matches = Paper[Paper['Id'] == int(pid)]['Year'].values
        # Skip papers missing from Paper and years outside the accepted range.
        if matches.shape[0] and 1800 <= matches[0] <= 2013:
            known_years.append(matches[0])

    if known_years:
        feature = [1, paper_year - min(known_years), max(known_years) - paper_year]
    else:
        feature = [0, 0, 0]
    return util.get_feature_by_list(feature)
def coauthor_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
               PaperAuthor, Author, Paper, dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Count how many authors of this paper are known coauthors of the author.

    The author ids that ``PaperAuthor`` lists for the paper (as strings) are
    intersected with the keys of ``dict_coauthor[authorId]``.  The original
    comment describes that mapping as the author's top-10 coauthors; its
    construction is not visible here.

    Returns whatever ``util.get_feature_by_list`` builds from the single count.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    # Everyone PaperAuthor attributes this paper to, as string ids.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paper_id)]
    authors_on_paper = {str(listed) for listed in paper_rows["AuthorId"].values}

    frequent_coauthors = set(dict_coauthor[author_id].keys())

    shared = authors_on_paper & frequent_coauthors
    return util.get_feature_by_list([len(shared)])
def journal_count(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                  Paper, Conference, Journal):
    """Count the distinct journal ids across all papers attributed to the author.

    For every paper id that ``PaperAuthor`` lists for the author, the first
    matching ``JournalId`` in ``Paper`` is collected; papers missing from
    ``Paper`` are skipped.

    Returns whatever ``util.get_feature_by_list`` builds from the single count.
    """
    author_id = int(AuthorIdPaperId.authorId)
    author_paper_ids = PaperAuthor[PaperAuthor['AuthorId'] == int(author_id)]['PaperId'].values

    # NOTE(review): other features in this file treat id 0 as "no journal";
    # here 0 is counted like any other id -- confirm whether that is intended.
    distinct_journals = set()
    for pid in author_paper_ids:
        matches = Paper[Paper['Id'] == int(pid)]['JournalId'].values
        if len(matches) == 0:
            continue
        distinct_journals.add(int(matches[0]))

    return util.get_feature_by_list([len(distinct_journals)])
def affiliation_count(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                      Paper, Conference, Journal):
    """Count how many rows of this paper share the author's affiliation string.

    The ``PaperAuthor`` rows of the paper are scanned for the first row
    whose ``AuthorId`` matches the author.  The feature is the number of
    rows on the paper (including the author's own) whose affiliation string
    equals that row's affiliation.

    The feature is 0 when the author's affiliation is missing ("nan") or
    when the author does not appear among the paper's rows at all.

    Returns whatever ``util.get_feature_by_list`` builds from the single count.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # Filter once; both columns come from the same rows, so positions align.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]
    curr_coauthors = [str(listed) for listed in paper_rows["AuthorId"].values]
    curr_affiliations = [str(listed) for listed in paper_rows["Affiliation"].values]

    # Compare as strings on both sides.  The original compared the
    # stringified ids with the raw authorId and, when nothing matched,
    # indexed one past the end of the list (IndexError).
    try:
        position = curr_coauthors.index(str(authorId))
    except ValueError:
        return util.get_feature_by_list([0])

    affiliation = curr_affiliations[position]
    if affiliation == 'nan':
        # A missing affiliation cell stringifies to "nan".
        return util.get_feature_by_list([0])
    return util.get_feature_by_list([curr_affiliations.count(affiliation)])
def coauthor_2(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
               PaperAuthor, Author, Paper, dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Sum the coauthor weights of everyone listed on this paper.

    ``dict_coauthor[authorId]`` maps a coauthor id (string) to a numeric
    weight.  Each author id that ``PaperAuthor`` lists for the paper adds
    its weight when it is a key of that mapping; ids listed more than once
    are added more than once.

    Returns whatever ``util.get_feature_by_list`` builds from the single score.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    # Everyone PaperAuthor attributes this paper to, as string ids.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paper_id)]
    authors_on_paper = [str(listed) for listed in paper_rows["AuthorId"].values]

    # e.g. {"authorId": 100}
    coauthor_weights = dict_coauthor[author_id]

    total = sum(
        coauthor_weights[listed]
        for listed in authors_on_paper
        if listed in coauthor_weights
    )
    return util.get_feature_by_list([total])
def conference_journal_2(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
                         PaperAuthor, Author, Paper, dict_author_conference_journal, conference, journal,
                         dict_author_paperid):
    """Look up the author's stored values for this paper's conference and journal.

    The paper's conference and journal ids (as strings) index
    ``dict_author_conference_journal[authorId]["conferenceId"]`` and
    ``[...]["journalId"]``.  When both ids are "0" the feature is the larger
    of the two looked-up values; otherwise it is their sum.

    Returns whatever ``util.get_feature_by_list`` builds from the single score.
    """
    author_id = AuthorIdPaperId.authorId  # current author
    paper_id = AuthorIdPaperId.paperId    # current paper

    # Venue ids of the paper, stringified to match the dictionary keys.
    conference_key = str(Paper[Paper["Id"] == int(paper_id)]["ConferenceId"].values[0])
    journal_key = str(Paper[Paper["Id"] == int(paper_id)]["JournalId"].values[0])

    venue_history = dict_author_conference_journal[author_id]
    conference_value = venue_history["conferenceId"][conference_key]
    journal_value = venue_history["journalId"][journal_key]

    if conference_key != "0" or journal_key != "0":
        combined = conference_value + journal_value
    else:
        combined = max(conference_value, journal_value)
    return util.get_feature_by_list([combined])
def keyword(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author, Paper,
            Conference, Journal):
    """Count the words this paper shares with the author's other papers.

    Words come from a paper's title and keyword fields, split on
    whitespace and the characters ``| ; ,``, with English stop words and
    purely numeric tokens removed.  The feature is the number of distinct
    words of the candidate paper that also occur in any *other* paper that
    ``PaperAuthor`` attributes to the author.  It is 0 when the author has
    no rows in ``PaperAuthor``.

    Returns whatever ``util.get_feature_by_list`` builds from the single count.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups.
    # The original re-read the list and scanned it linearly for every word.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def get_words(paper):
        # Join title and keywords with a space so the last title word and
        # the first keyword do not fuse into one token; skip missing fields
        # so a NaN title does not contribute the token "nan".
        parts = []
        if not pd.isna(paper.Title):
            parts.append(str(paper.Title))
        if not pd.isna(paper.Keyword):
            parts.append(str(paper.Keyword))
        tokens = re.split(r'[|\s;,]', " ".join(parts))
        return [
            w for w in tokens
            if w and w not in stop_words and not w.isdigit()
        ]

    authorId = AuthorIdPaperId.authorId
    # Cast once: Paper ids are compared numerically below.  The original
    # compared paper.Id with the uncast paperId, so the candidate paper was
    # never excluded when paperId is a string.
    paperId = int(AuthorIdPaperId.paperId)

    papersOfAuthor = PaperAuthor[PaperAuthor['AuthorId'] == int(authorId)]
    kws = get_words(Paper[Paper['Id'] == paperId].iloc[0])

    if papersOfAuthor.shape[0] == 0:
        return util.get_feature_by_list([0])

    other_words = set()
    for _, row in papersOfAuthor.iterrows():
        paper = Paper[Paper['Id'] == row.PaperId]
        if paper.shape[0] == 0:
            continue
        paper = paper.iloc[0]
        # Leave out the candidate paper itself, otherwise every one of its
        # own words would trivially match.
        if int(paper.Id) != paperId:
            other_words.update(get_words(paper))

    return util.get_feature_by_list([len(other_words.intersection(kws))])
def yeardistance(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
                 PaperAuthor, Author, Paper, dict_author_conference_journal, conference, journal,
                 dict_author_paperid):
    """Flag whether the author published another paper close in time to this one.

    Rationale (from the original comment): if the author has not published
    anything for a long time around this paper's year, the paper is less
    likely to be theirs.

    The feature is ``[1]`` when some other paper in
    ``dict_author_paperid[authorId]`` was published fewer than 8 years away
    from this paper, and ``[-1]`` otherwise (including when the author has
    no other paper with a row in ``Paper``).

    Returns whatever ``util.get_feature_by_list`` builds from the single flag.
    """
    max_year_gap = 8

    authorId = AuthorIdPaperId.authorId    # current author
    paperId = int(AuthorIdPaperId.paperId)  # current paper

    # Year of the candidate paper.
    curyear = Paper[Paper["Id"] == paperId]["Year"].values[0]

    # The author's papers, minus the candidate itself: it would always be at
    # distance 0 and make the feature constant.
    other_ids = [
        int(pid) for pid in dict_author_paperid[authorId] if int(pid) != paperId
    ]
    other_years = Paper[Paper["Id"].isin(other_ids)]["Year"].values
    if len(other_years) == 0:
        return util.get_feature_by_list([-1])

    # The original took the minimum of the raw calendar years and compared
    # it with 8, leaving curyear unused; the intended quantity is the gap
    # in years to the nearest other paper.
    nearest_gap = np.min(np.abs(np.asarray(other_years, dtype=float) - float(curyear)))

    feat_list = [1] if nearest_gap < max_year_gap else [-1]
    return util.get_feature_by_list(feat_list)
def conference_journal_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
                         PaperAuthor, Author, Paper, dict_author_conference_journal, conferences, journals,
                         dict_author_paperid):
    """Sum the author's past venue values for venues on the same website as this paper's venue.

    The home page of the paper's conference (or journal) is compared, via
    ``in_thesame_major_website``, with the home page of every venue key in
    ``dict_author_conference_journal[authorId]``; the stored values of the
    matching venues are summed.

    Result:
      * ``[0]`` when both the conference id and the journal id are "0";
      * the conference total when only the conference id is non-"0";
      * the journal total whenever the journal id is non-"0" (it overrides
        the conference total when both are set, as in the original).

    Returns whatever ``util.get_feature_by_list`` builds from the single value.
    """
    authorId = AuthorIdPaperId.authorId  # current author
    paperId = AuthorIdPaperId.paperId    # current paper

    def _homepage(venues, venue_id, missing):
        # Home page of the venue as a string, or `missing` when the venue
        # table has no such id.  `.values` is never None, so the original
        # `!= None` test was an element-wise array comparison whose truth
        # value is ambiguous (or an error) for empty and multi-row results.
        pages = venues[venues["Id"] == int(venue_id)]["HomePage"].values
        return str(pages[0]) if len(pages) > 0 else missing

    def _same_site_total(venues, current_id, history):
        # The distinct placeholders "0" (current) and "1" (past) are the
        # original's sentinels for a venue with no home page.
        current_page = _homepage(venues, current_id, "0")
        total = 0
        for past_id in history:
            past_page = _homepage(venues, past_id, "1")
            if in_thesame_major_website(past_page, current_page):
                total += history[past_id]
        return total

    # Venue ids of the paper, stringified for comparison with "0".
    paper_rows = Paper[Paper["Id"] == int(paperId)]
    conferenceId = str(paper_rows["ConferenceId"].values[0])
    journalId = str(paper_rows["JournalId"].values[0])

    # Default covers the case where both ids are "0".
    feat_list = [0]
    if conferenceId != "0":
        feat_list = [_same_site_total(
            conferences, conferenceId,
            dict_author_conference_journal[authorId]["conferenceId"])]
    if journalId != "0":
        feat_list = [_same_site_total(
            journals, journalId,
            dict_author_conference_journal[authorId]["journalId"])]
    return util.get_feature_by_list(feat_list)
# 我简单地把coauthor和当前aid作者和合作次数作为这个coauthor出现的得分。 def key(AuthorIdPaperId, dict_coauthor,dict_key, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author): authorId = AuthorIdPaperId.authorId paperId = AuthorIdPaperId.paperId # 从PaperAuthor中,根据paperId找coauthor。 curr_key = list(map(str, list(AuthorKeywords[AuthorKeywords["PaperId"] == int(paperId)]["AuthorId"].values))) # top_key = dict_key[AuthorId].keys() # 简单计算top 10 coauthor出现的个数 nums = len(set(curr_key) & set(top_key)) return util.get_feature_by_list([nums]) # 1. 简单计算top 10 coauthor出现的个数 def coauthor_1(AuthorIdPaperId, dict_coauthor,dict_key, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author): authorId = AuthorIdPaperId.authorId paperId = AuthorIdPaperId.paperId # 从PaperAuthor中,根据paperId找coauthor。 curr_coauthors = list(map(str, list(PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]["AuthorId"].values))) # top_coauthors = dict_coauthor[authorId].keys() # 简单计算top 10 coauthor出现的个数 nums = len(set(curr_coauthors) & set(top_coauthors))