def run(self):
    """Fetch this thread's CDBLP author record and dump it to disk as JSON.

    Reads ``self.author_name`` and ``self.id``. The record returned by
    ``CDBLPAuthor(...).get_author()`` is serialised with ``json.dumps`` and
    written to ``authors/<id>-<stripped name>-entry.data``; a completion
    message is printed afterwards.
    """
    name = self.author_name.strip()
    author = CDBLPAuthor(name)

    # Fetch and serialise before opening the file so that a failure in
    # get_author()/json.dumps does not leave an empty or truncated file.
    payload = json.dumps(author.get_author())

    path = 'authors/{}-{}-entry.data'.format(self.id, name)
    # The context manager closes the handle even if write() raises.
    with open(path, 'w') as chunk:
        chunk.write(payload)

    print(self.author_name + ' %d is done.' % self.id)
def run(self):
    """Fetch this thread's CDBLP author record and dump it to disk as JSON.

    Reads ``self.author_name`` and ``self.id``. The record returned by
    ``CDBLPAuthor(...).get_author()`` is serialised with ``json.dumps`` and
    written to ``authors/<id>-<stripped name>-entry.data``; a completion
    message is printed afterwards.
    """
    name = self.author_name.strip()
    author = CDBLPAuthor(name)

    # Fetch and serialise before opening the file so that a failure in
    # get_author()/json.dumps does not leave an empty or truncated file.
    payload = json.dumps(author.get_author())

    path = 'authors/{}-{}-entry.data'.format(self.id, name)
    # The context manager closes the handle even if write() raises.
    with open(path, 'w') as chunk:
        chunk.write(payload)

    print(self.author_name + ' %d is done.' % self.id)
def get_dblp_author_from_zh(author_name):
    """Match a CDBLP author to the most likely DBLP author record.

    The name components from ``CDBLPAuthor.getEnglishName`` are turned into
    a DBLP ``urlpt`` of the form ``<initial>/<last_name>:<first_name>``.
    Every homonym linked from that DBLP page becomes a candidate (or the
    ``urlpt`` itself when the page lists no homonyms). The candidate sharing
    the most coauthor names with the CDBLP record is selected; on a tie the
    candidate visited last wins.

    Args:
        author_name: The author's name as accepted by ``CDBLPAuthor``.

    Returns:
        A dict with two keys: ``'cdblp'`` holds ``get_author()`` of the
        CDBLP author; ``'dblp'`` holds ``get_author()`` of the selected DBLP
        candidate, or ``{}`` when there is no candidate or no candidate
        shares a coauthor with the CDBLP record.
    """
    author_cdblp = CDBLPAuthor(author_name)
    author_name_comp = CDBLPAuthor.getEnglishName(author_name)
    last_name = author_name_comp['last_name']
    urlpt = '{}/{}:{}'.format(last_name[0].lower(), last_name,
                              author_name_comp['first_name'])

    res = urlopen(DBLPQuery.get_dblp_url(urlpt))
    dom = BeautifulSoup(res)
    # Each homonym link's href is trimmed by 3 leading and 5 trailing
    # characters to obtain a urlpt (presumably a '../' prefix and a '.html'
    # suffix -- TODO confirm against the DBLP markup).
    candidate_urlpts = {
        cu_tag.find('a')['href'][3:-5]
        for cu_tag in dom.find_all('li', 'homonym')
    }
    if not candidate_urlpts:
        candidate_urlpts.add(urlpt)

    candidate_authors = []
    for cu in candidate_urlpts:
        candidate_authors.append(DBLPAuthor(cu))
        print(cu)

    # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
    # NOTE(review): candidate_urlpts always holds at least one entry at this
    # point, so this fallback (and the empty-candidate return below) looks
    # unreachable as written -- confirm the intended trigger.
    if not candidate_authors and len(author_name) == 3:
        dash_name = author_name_comp['full_name_dash']
        res = urlopen('{}/search/author?xauthor={}'.format(
            DBLPQuery.base_url, quote(dash_name)))
        dom = BeautifulSoup(res)
        candidate_authors = [
            DBLPAuthor(author_tag['urlpt'])
            for author_tag in dom.find_all('author')
            if author_tag.string == dash_name
        ]

    if not candidate_authors:
        return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

    coauthors_set_cdblp = {
        a['full_name'] for a in author_cdblp.get_coauthors()
    }

    # Track the running maximum. Previously the maximum was frozen at the
    # first candidate's overlap, so a better later candidate could be
    # overwritten by a worse one, and a first candidate with no overlap
    # forced an empty result even when another candidate matched.
    target_author = candidate_authors[0]
    coauthor_count_max = -1
    for candidate in candidate_authors:
        coauthor_overlap = coauthors_set_cdblp.intersection(
            candidate.get_coauthors())
        if len(coauthor_overlap) >= coauthor_count_max:
            coauthor_count_max = len(coauthor_overlap)
            target_author = candidate

    if coauthor_count_max == 0:
        # No candidate shares a coauthor: treat as "no reliable match".
        return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

    return {
        'cdblp': author_cdblp.get_author(),
        'dblp': target_author.get_author(),
    }
def author_distinct(cached_list, cached_set, author_name):
    """Resolve an author name to its CDBLP and DBLP records.

    Three cases are handled, in this order:

    1. ``author_name`` is in ``cached_set``: the matching entry of
       ``cached_list`` (by ``'full_name'`` against the stripped name, or by
       ``'zh'``) supplies the key into the ``author-entries-cache.data``
       cache and the DBLP ``urlpt``.
    2. The first character's code point lies strictly between 0x3400 and
       0x2b6f8: the name is treated as a Chinese CDBLP name, DBLP is
       searched by the romanised name, and the hit sharing the most
       coauthor names with the CDBLP record is chosen (last one on a tie).
    3. Otherwise: DBLP is searched for the name verbatim and the first
       exact hit is used.

    Args:
        cached_list: Iterable of dicts with at least the keys
            ``'full_name'``, ``'zh'``, ``'last_name'`` and ``'first_name'``.
        cached_set: Container of names known to be on file.
        author_name: The name to resolve.

    Returns:
        A dict with keys ``'cdblp'`` and ``'dblp'``. Either value is ``{}``
        when no record could be resolved for that source; an empty
        ``author_name`` yields ``{'cdblp': {}, 'dblp': {}}``.
    """

    def search_dblp(name):
        """Return DBLPAuthor objects for search hits whose text equals name."""
        res = urlopen('{}/search/author?xauthor={}'.format(
            DBLPQuery.base_url, quote(name)))
        dom = BeautifulSoup(res)
        return [
            DBLPAuthor(author_tag['urlpt'])
            for author_tag in dom.find_all('author')
            if author_tag.string == name
        ]

    # An empty name has no first character to classify; report "nothing
    # found" instead of failing on author_name[0].
    if not author_name:
        return {'cdblp': {}, 'dblp': {}}

    if author_name in cached_set:
        d = DBLPQuery.get_cache('author-entries-cache.data')
        print('This is a CDBLP author w/ a English name on file.')
        stripped_name = author_name.strip()
        entry = next(
            (comp for comp in cached_list
             if comp['full_name'] == stripped_name
             or comp['zh'] == author_name),
            None,
        )
        if entry is None:
            # No list entry matches: do not fall through and build a DBLP
            # record from whichever entry happened to be iterated last.
            return {'cdblp': d.get('', {}), 'dblp': {}}
        urlpt = '{}/{}:{}'.format(entry['last_name'][0].lower(),
                                  entry['last_name'], entry['first_name'])
        return {
            'cdblp': d.get(entry['zh'], {}),
            'dblp': DBLPAuthor(urlpt).get_author(),
        }

    if 0x3400 < ord(author_name[0]) < 0x2b6f8:
        print('This is a CDBLP author w/ a Chinese name.')
        author_cdblp = CDBLPAuthor(author_name)
        author_name_comp = CDBLPAuthor.getEnglishName(author_name)

        candidate_authors = search_dblp(author_name_comp['full_name'])
        # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
        if not candidate_authors and len(author_name) == 3:
            candidate_authors = search_dblp(
                author_name_comp['full_name_dash'])

        if not candidate_authors:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        coauthors_set_cdblp = {
            a['full_name'] for a in author_cdblp.get_coauthors()
        }
        # Track the running maximum; comparing every candidate against the
        # first candidate's overlap let a worse later candidate replace a
        # better earlier one.
        target_author = candidate_authors[0]
        coauthor_count_max = -1
        for candidate in candidate_authors:
            coauthor_overlap = coauthors_set_cdblp.intersection(
                candidate.get_coauthors())
            if len(coauthor_overlap) >= coauthor_count_max:
                coauthor_count_max = len(coauthor_overlap)
                target_author = candidate

        return {
            'cdblp': author_cdblp.get_author(),
            'dblp': target_author.get_author(),
        }

    print('This is a non-CDBLP author.')
    candidates = search_dblp(author_name)
    if not candidates:
        # Same "not found" shape as the other branches rather than an
        # IndexError on candidates[0].
        return {'cdblp': {}, 'dblp': {}}
    return {'cdblp': {}, 'dblp': candidates[0].get_author()}