Exemplo n.º 1
0
 def run(self):
     """Fetch this thread's CDBLP author entry and dump it to a JSON file.

     The entry is written to ``authors/<id>-<author_name>-entry.data``
     (author name stripped of surrounding whitespace); a completion message
     is printed afterwards.
     """
     name = self.author_name.strip()
     author = CDBLPAuthor(name)
     # Serialize before opening the file so that a failure in get_author()
     # cannot leave a truncated, still-open file behind.
     payload = json.dumps(author.get_author())
     path = 'authors/{}-{}-entry.data'.format(self.id, name)
     with open(path, 'w') as chunk:
         chunk.write(payload)
     print(self.author_name + ' %d is done.' % self.id)
Exemplo n.º 2
0
 def run(self):
     """Download one CDBLP author record and store it as JSON on disk.

     Output file: ``authors/<id>-<stripped author name>-entry.data``.
     Prints ``<author_name> <id> is done.`` once the file has been written.
     """
     stripped_name = self.author_name.strip()
     record = CDBLPAuthor(stripped_name).get_author()
     # The record is fetched and encoded before the file is opened, and the
     # ``with`` block guarantees the handle is closed even if writing fails.
     encoded = json.dumps(record)
     with open(
             'authors/{}-{}-entry.data'.format(self.id, stripped_name),
             'w') as chunk:
         chunk.write(encoded)
     print(self.author_name + ' %d is done.' % self.id)
Exemplo n.º 3
0
def get_match(author_name):
    """Rank DBLP candidate pages for a CDBLP author name.

    Steps:
      1. Build the DBLP ``urlpt`` (``<initial>/<last_name>:<first_name>``)
         from the name components returned by ``CDBLPAuthor.getEnglishName``.
      2. Fetch the DBLP page for that urlpt and collect one candidate per
         ``<li class="homonym">`` entry; when there are none, the urlpt
         itself is the only candidate (affiliation 'Default University').
      3. Start one ``ThreadMatch`` per candidate and wait for all of them.
      4. Read the scores from ``Data.result`` (presumably filled in by the
         ThreadMatch workers -- confirm against ThreadMatch).

    Returns:
        dict with ``'author'`` (the ``CDBLPAuthor`` instance) and
        ``'result'``: a list of ``{'urlpt', 'aff', 'rank'}`` dicts for every
        score above 0.1, sorted by rank in descending order.
    """
    Data.clear()

    author_cdblp = CDBLPAuthor(author_name)
    name_comp = CDBLPAuthor.getEnglishName(author_name)

    urlpt = '{}/{}:{}'.format(name_comp['last_name'][0].lower(),
                              name_comp['last_name'],
                              name_comp['first_name'])

    # Parse inside the ``with`` so the HTTP response is always closed.
    with urlopen(DBLPQuery.get_dblp_url(urlpt)) as res:
        dom = BeautifulSoup(res)

    # Maps candidate urlpt -> affiliation text; its keys are the candidates.
    author_affiliation = {}
    for cu_tag in dom.find_all('li', 'homonym'):
        link = cu_tag.find('a')
        # The slice drops the first 3 and last 5 characters of the href.
        author_affiliation[link['href'][3:-5]] = link.next_sibling.string

    if not author_affiliation:
        author_affiliation[urlpt] = 'Default University'

    workers = []
    for cu in author_affiliation:
        worker = ThreadMatch(cu, author_cdblp)
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    result = [
        {'urlpt': k, 'aff': author_affiliation[k], 'rank': v}
        for k, v in Data.result.items()
        if v > 0.1
    ]
    result.sort(key=lambda i: i['rank'], reverse=True)
    return {'author': author_cdblp, 'result': result}
Exemplo n.º 4
0
    def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
        """Find authors who appear in both the C-DBLP and the DBLP venue.

        C-DBLP publications come from the 'cdblp-pub-cache.data' cache, keyed
        by ``cdblp_venue['title']``; DBLP publications are fetched from the
        DBLP search API for ``dblp_venue['title']``.

        Args:
            cached_list: unused here.
            cached_set: unused here.
            cdblp_venue: mapping with a 'title' key (C-DBLP venue).
            dblp_venue: mapping with a 'title' key (DBLP venue).

        Returns:
            ``None`` when the C-DBLP venue is not in the cache; otherwise a
            dict mapping the matched DBLP-style author name to
            ``{'zh': <original name>, 'count': <number of C-DBLP names that
            mapped to it>}``.
        """
        cache = DBLPQuery.get_cache('cdblp-pub-cache.data')
        cdblp_title = cdblp_venue.get('title')

        if cdblp_title not in cache:
            print('This C-DBLP venue is not on file.')
            return None

        url = 'http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower())
        # Read inside the ``with`` so the HTTP response is always closed.
        with urlopen(url) as res:
            raw_json = res.read().decode('utf-8')

        # fix titles as { "Title ..." }
        fixed_json = re.compile(r'({\s*)(".+")(\s*})').sub(lambda m: m.group(2), raw_json)

        # get publications
        cdblp_pubs = cache.get(cdblp_title)
        dblp_pubs = json.loads(fixed_json)

        # The cache entry is a two-level mapping whose leaves are lists of
        # publications, each carrying an 'authors' iterable.
        cdblp_authors = set()
        for outer in cdblp_pubs.values():
            for pubs in outer.values():
                for pub in pubs:
                    cdblp_authors.update(pub.get('authors'))

        dblp_authors = set()
        # ``or []``: tolerate a response without a 'hit' entry (presumably
        # what the API returns for zero hits -- confirm) instead of failing
        # while iterating over None.
        for pub in dblp_pubs.get('result').get('hits').get('hit') or []:
            try:
                for author in pub.get('info').get('authors').get('author'):
                    dblp_authors.add(author)
            except AttributeError:
                print('PublicationException: %s' % pub.get('@id'))

        pinyin = PinYin()
        pinyin.load_word()

        authors = {}
        for author in cdblp_authors:
            name_comp = CDBLPAuthor.get_english_name(author, pinyin)
            if name_comp['full_name'] in dblp_authors:
                key = name_comp['full_name']
            # Three-character names may be listed in DBLP in dashed form.
            # The original tested membership in ``authors`` here, which could
            # never be true, so dashed names were silently never matched.
            elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
                key = name_comp['full_name_dash']
            else:
                continue

            entry = authors.setdefault(key, {'zh': name_comp['zh'], 'count': 0})
            entry['zh'] = name_comp['zh']
            entry['count'] += 1

        return authors
Exemplo n.º 5
0
def get_match(author_name):
    """Score every DBLP homonym candidate against a CDBLP author.

    The DBLP page for the author's urlpt
    (``<initial>/<last_name>:<first_name>``, built from
    ``CDBLPAuthor.getEnglishName``) is fetched and each
    ``<li class="homonym">`` entry becomes a candidate. If the page lists
    none, the urlpt itself is used with the affiliation
    'Default University'. One ``ThreadMatch`` is started per candidate and
    all are joined before ``Data.result`` is read (presumably the workers
    store their scores there -- confirm against ThreadMatch).

    Returns:
        ``{'author': <CDBLPAuthor>, 'result': [...]}`` where each result
        item is ``{'urlpt', 'aff', 'rank'}``; only ranks above 0.1 are kept
        and the list is sorted from highest to lowest rank.
    """
    Data.clear()

    author_cdblp = CDBLPAuthor(author_name)
    author_name_comp = CDBLPAuthor.getEnglishName(author_name)

    last_name = author_name_comp['last_name']
    urlpt = '{}/{}:{}'.format(last_name[0].lower(), last_name, author_name_comp['first_name'])

    candidate_urlpts = set()
    author_affiliation = dict()

    # The context manager closes the HTTP response once parsing is done.
    with urlopen(DBLPQuery.get_dblp_url(urlpt)) as res:
        dom = BeautifulSoup(res)

    for cu_tag in dom.find_all('li', 'homonym'):
        anchor = cu_tag.find('a')
        # The slice drops the first 3 and last 5 characters of the href.
        cu = anchor['href'][3:-5]
        candidate_urlpts.add(cu)
        author_affiliation[cu] = anchor.next_sibling.string

    if not candidate_urlpts:
        candidate_urlpts.add(urlpt)
        author_affiliation[urlpt] = 'Default University'

    threads = []
    for cu in candidate_urlpts:
        t = ThreadMatch(cu, author_cdblp)
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    result = [
        {'urlpt': k, 'aff': author_affiliation[k], 'rank': v}
        for k, v in Data.result.items()
        if v > 0.1
    ]

    result.sort(key=lambda i: i['rank'], reverse=True)
    return {'author': author_cdblp, 'result': result}
Exemplo n.º 6
0
    def get_sample_users():
        """Scrape the sample user list and cache it as JSON.

        Fetches http://easyscholar.ruc.edu.cn/moreuser.html, takes every
        element whose ``href`` starts with ``homepage/`` and has a
        ``<strong>`` child, and converts the first content node of that child
        with ``CDBLPAuthor.getEnglishName``.

        Returns:
            The list of converted names; the same list is written as JSON to
            'author-cache.data'.
        """
        # NOTE(review): ``piy`` is not used below. The calls are kept in case
        # constructing/loading PinYin has side effects that getEnglishName
        # relies on -- confirm and remove if not.
        piy = PinYin()
        piy.load_word()

        # Parse inside the ``with`` so the HTTP response is always closed.
        with urlopen('http://easyscholar.ruc.edu.cn/moreuser.html') as res:
            dom = BeautifulSoup(res)

        author_list = []
        for author_tag in dom.find_all(href=re.compile('^homepage/')):
            strong = author_tag.findChild('strong')
            if strong:
                author_list.append(CDBLPAuthor.getEnglishName(strong.contents[0]))

        # The cache file is opened only after scraping succeeded, so a failed
        # fetch no longer truncates an existing cache or leaks the handle.
        with open('author-cache.data', 'w') as cache:
            cache.write(json.dumps(author_list))
        return author_list
Exemplo n.º 7
0
    def get_coauthored_publications_by_authors(cached_list, cached_set, author1_name, author2_name):
        """
        Return the publications of author 1 that also list author 2.

        Author 1's publications are loaded with
        ``DBLPQuery.get_publications_by_author`` and author 2 is resolved with
        ``DBLPQuery.author_distinct``. A publication is kept when one of
        author 2's known names appears among its authors:

        * 'cdblp' publications: the CDBLP 'zh' name among the listed authors,
          or the DBLP name among their ``getEnglishName`` full names;
        * 'dblp' publications: the DBLP name or the CDBLP 'full_name' among
          the listed authors' 'name' values.

        The result has the shape ``{'cdblp': [...], 'dblp': [...]}``.
        """
        pub1 = DBLPQuery.get_publications_by_author(cached_list, cached_set, author1_name)
        author2 = DBLPQuery.author_distinct(cached_list, cached_set, author2_name)

        def listed_in_cdblp_pub(pub):
            # Both name sets are built up front, as the original did.
            names_zh = set(pub.get('authors', []))
            names_en = {CDBLPAuthor.getEnglishName(name)['full_name'] for name in names_zh}
            if author2.get('cdblp', {}).get('author_name', {}).get('zh') in names_zh:
                return True
            return author2.get('dblp', {}).get('author_name') in names_en

        def listed_in_dblp_pub(pub):
            names = {entry.get('name') for entry in pub.get('authors', [])}
            if author2.get('dblp', {}).get('author_name') in names:
                return True
            return author2.get('cdblp', {}).get('author_name', {}).get('full_name') in names

        return {
            'cdblp': [pub for pub in pub1.get('cdblp', []) if listed_in_cdblp_pub(pub)],
            'dblp': [pub for pub in pub1.get('dblp', []) if listed_in_dblp_pub(pub)],
        }
Exemplo n.º 8
0
    def get_dblp_author_from_zh(author_name):
        """Resolve a CDBLP (Chinese-name) author to a DBLP author record.

        Candidates are the ``<li class="homonym">`` entries on the DBLP page
        for the author's urlpt, or the urlpt itself when the page lists none.
        The candidate sharing the most coauthor names with the CDBLP author
        is selected.

        Returns:
            ``{'cdblp': <CDBLP author record>, 'dblp': <DBLP author record>}``.
            ``'dblp'`` is ``{}`` when there is no candidate at all or when no
            candidate shares a single coauthor with the CDBLP author.
        """
        author_cdblp = CDBLPAuthor(author_name)
        author_name_comp = CDBLPAuthor.getEnglishName(author_name)

        urlpt = '{}/{}:{}'.format(author_name_comp['last_name'][0].lower(), author_name_comp['last_name'], author_name_comp['first_name'])

        # Parse inside the ``with`` so the HTTP response is always closed.
        with urlopen(DBLPQuery.get_dblp_url(urlpt)) as res:
            dom = BeautifulSoup(res)

        # The slice drops the first 3 and last 5 characters of each href.
        candidate_urlpts = {cu_tag.find('a')['href'][3:-5] for cu_tag in dom.find_all('li', 'homonym')}
        if not candidate_urlpts:
            candidate_urlpts.add(urlpt)

        candidate_authors = []
        for cu in candidate_urlpts:
            candidate_authors.append(DBLPAuthor(cu))
            print(cu)

        # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
        # NOTE(review): candidate_urlpts is never empty at this point, so
        # candidate_authors is never empty and this fallback cannot run as
        # written. Kept unchanged pending a decision on the intended trigger.
        if not candidate_authors and len(author_name) == 3:
            dashed_name = author_name_comp['full_name_dash']
            with urlopen('{}/search/author?xauthor={}'.format(DBLPQuery.base_url, quote(dashed_name))) as res:
                dom = BeautifulSoup(res)

            for author_tag in dom.find_all('author'):
                if author_tag.string == dashed_name:
                    candidate_authors.append(DBLPAuthor(author_tag['urlpt']))

        if not candidate_authors:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        coauthors_set_cdblp = set(map(lambda a: a['full_name'], author_cdblp.get_coauthors()))

        # Track the running maximum. The original compared every candidate
        # against the FIRST candidate's overlap only, so it could pick a
        # non-maximal candidate and could report "no overlap" although a
        # later candidate did share coauthors.
        target_author = None
        coauthor_count_max = -1
        for candidate in candidate_authors:
            shared = coauthors_set_cdblp.intersection(candidate.get_coauthors())
            # ``>=`` keeps the original tie-break: a later candidate wins.
            if len(shared) >= coauthor_count_max:
                target_author = candidate
                coauthor_count_max = len(shared)

        if coauthor_count_max == 0:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        return {'cdblp': author_cdblp.get_author(), 'dblp': target_author.get_author()}
Exemplo n.º 9
0
    def author_distinct(cached_list, cached_set, author_name):
        """Resolve an author name to its CDBLP and DBLP records.

        Three cases, tried in order:

        1. ``author_name`` is in ``cached_set`` and an entry of
           ``cached_list`` matches it (by 'full_name' or 'zh'): the CDBLP
           record comes from the 'author-entries-cache.data' cache and the
           DBLP record from the urlpt built out of the cached name parts.
        2. The first character's code point lies strictly between 0x3400 and
           0x2b6f8 (treated as a Chinese name): DBLP is searched for the
           exact full name, then -- for three-character names -- the dashed
           full name; among the hits, the one sharing the most coauthor
           names with the CDBLP author is chosen.
        3. Otherwise the name is searched in DBLP as-is and the first exact
           hit is used.

        Args:
            cached_list: iterable of name dicts with 'full_name', 'zh',
                'last_name' and 'first_name' keys.
            cached_set: container of names known to be cached.
            author_name: the name to resolve (non-empty string).

        Returns:
            ``{'cdblp': <record or {}>, 'dblp': <record or {}>}``.
        """

        def search_exact(name):
            # DBLP author-search hits whose text equals ``name`` exactly.
            with urlopen('{}/search/author?xauthor={}'.format(DBLPQuery.base_url, quote(name))) as res:
                dom = BeautifulSoup(res)
            return [DBLPAuthor(tag['urlpt']) for tag in dom.find_all('author') if tag.string == name]

        trial = author_name[0]

        if author_name in cached_set:
            stripped_name = author_name.strip()
            cached_comp = next(
                (comp for comp in cached_list
                 if comp['full_name'] == stripped_name or comp['zh'] == author_name),
                None)
            # Only answer from the cache when an entry really matched. The
            # original used the loop variable after the loop, so a miss
            # silently returned the LAST cached author (or raised NameError
            # on an empty list). On a miss we now fall through to the live
            # lookups below.
            if cached_comp is not None:
                d = DBLPQuery.get_cache('author-entries-cache.data')
                print('This is a CDBLP author w/ a English name on file.')
                return {
                    'cdblp': d.get(cached_comp['zh'], {}),
                    'dblp': DBLPAuthor('{}/{}:{}'.format(cached_comp['last_name'][0].lower(), cached_comp['last_name'], cached_comp['first_name'])).get_author()
                }

        if not 0x3400 < ord(trial) < 0x2b6f8:
            print('This is a non-CDBLP author.')
            candidates = search_exact(author_name)
            # No exact hit: report "unknown" like the Chinese-name path does
            # instead of raising IndexError.
            if not candidates:
                return {'cdblp': {}, 'dblp': {}}
            return {'cdblp': {}, 'dblp': candidates[0].get_author()}

        print('This is a CDBLP author w/ a Chinese name.')

        author_cdblp = CDBLPAuthor(author_name)
        author_name_comp = CDBLPAuthor.getEnglishName(author_name)

        candidate_authors = search_exact(author_name_comp['full_name'])

        # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
        if not candidate_authors and len(author_name) == 3:
            candidate_authors = search_exact(author_name_comp['full_name_dash'])

        if not candidate_authors:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        coauthors_set_cdblp = set(map(lambda a: a['full_name'], author_cdblp.get_coauthors()))

        # Track the running maximum; the original compared every candidate
        # against the first candidate's overlap only, so the selected author
        # was not necessarily the best match.
        target_author = None
        coauthor_count_max = -1
        for candidate in candidate_authors:
            shared = coauthors_set_cdblp.intersection(candidate.get_coauthors())
            # ``>=`` keeps the original tie-break: a later candidate wins.
            if len(shared) >= coauthor_count_max:
                target_author = candidate
                coauthor_count_max = len(shared)

        return {'cdblp': author_cdblp.get_author(), 'dblp': target_author.get_author()}
Exemplo n.º 10
0
 def run(self):
     """Run ``CDBLPAuthor.parallel_get`` for this thread's journal and link,
     then print a completion message."""
     journal = self.journal
     CDBLPAuthor.parallel_get(journal, self.link)
     print(journal + ' is done.')
Exemplo n.º 11
0
 def run(self):
     """Process one journal via ``CDBLPAuthor.parallel_get`` and report it."""
     journal, link = self.journal, self.link
     CDBLPAuthor.parallel_get(journal, link)
     done_message = journal + ' is done.'
     print(done_message)