Пример #1
0
    def write_db(self):
        """Create and save one ``Paper`` for every entry in ``self.entry_list``.

        Each entry is treated as a mapping. Only the keys that are present
        are copied onto the new ``Paper``; ``year``, ``volume`` and ``number``
        are converted with ``int`` first, so a non-numeric value in one of
        those fields raises ``ValueError``.

        Uses ``print(...)`` and the ``in`` operator instead of the ``print``
        statement and ``dict.has_key`` so the method runs on both Python 2
        and Python 3.
        """
        # (entry key, Paper attribute, converter). A converter of None means
        # the value is copied unchanged. Order matches the original
        # assignment order.
        field_map = (
            ("id", "id", None),
            ("type", "type", None),
            ("title", "title", None),
            ("author", "authors", None),
            ("year", "year", int),
            ("journal", "journal", None),
            ("booktitle", "book_title", None),
            ("publisher", "publisher", None),
            ("institution", "institution", None),
            ("volume", "volume", int),
            ("number", "number", int),
            ("pages", "pages", None),
            ("url", "url", None),
            ("doi", "doi", None),
            ("isbn", "isbn", None),
        )

        print("len of entry list " + str(len(self.entry_list)))

        for entry in self.entry_list:
            paper = Paper()
            for key, attr, convert in field_map:
                if key in entry:
                    value = entry[key]
                    setattr(paper, attr, convert(value) if convert else value)

            paper.save()
Пример #2
0
def _fetch_until_success(session, url, headers):
    """GET ``url`` with ``session``, retrying every 3 seconds until it succeeds.

    Any ``Exception`` (connection error, HTTP error status from
    ``raise_for_status``) is printed and the request is retried without limit.
    ``KeyboardInterrupt`` is not an ``Exception`` subclass, so it propagates
    and lets the user abort the loop.

    Returns the response with its encoding forced to UTF-8.
    """
    # NOTE(review): no timeout is passed and there is no retry cap, so a
    # permanently failing URL keeps this loop running forever.
    while True:
        try:
            response = session.get(url, headers=headers)
            response.raise_for_status()
        except Exception as e:
            time.sleep(3.0)
            print(e)
        else:
            response.encoding = 'utf-8'
            return response


def get_references_citations_by_id(profile_id):
    """Crawl a Bing Academic paper page and collect the papers citing it.

    Args:
        profile_id: Either the paper's profile id, or a dict holding it under
            the ``'profile_id'`` key.

    Returns:
        ``[]`` when a dict was passed, ``MONGO`` is truthy and the id is
        already present in ``data_collection``; ``-1`` when no profile id is
        available; otherwise ``paper.citations`` (the results of
        ``get_profile_id`` that carry a truthy ``'title'``).

    The scraped paper is saved via ``paper.save()`` even if parsing fails
    part-way: parsing errors are printed, not raised.
    """
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
        if MONGO:
            # NOTE(review): if data_collection is a PyMongo collection,
            # Cursor.count() is gone in PyMongo 4 -- confirm the version used.
            if data_collection.find({"id": profile_id}).count() > 0:
                # This record has already been crawled.
                return []
    print('func2')
    if not profile_id:
        return -1
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    response = _fetch_until_success(
        session,
        'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
        .format(profile_id),
        headers)
    page_text = response.text

    # Both values default to None so that a page without them no longer
    # triggers a NameError that aborts the rest of the parsing.
    ig = None
    result = re.search(r'IG:"(.*?)"', page_text)
    if result:
        ig = result.group(1)
    citation_num = None
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        page_text)
    if result:
        citation_num = result.group(1)

    html = etree.HTML(page_text)

    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        if citation_num is not None:
            paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
            page_text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        result = re.search(
            r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
            page_text)
        if result:
            paper.publish_year = result.group(1)

        base_url = 'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'

        count = 9
        citation_links = []
        if ig is not None and citation_num:
            # Page through the infinite-scroll endpoint; the paging
            # arithmetic is kept exactly as in the original implementation.
            for i in range(1, int(citation_num) // count):
                ajax_url = base_url.format(id=profile_id,
                                           first=i * (count + 1),
                                           count=count + 1,
                                           ig=ig,
                                           num=i,
                                           rt='2')
                ajax_response = _fetch_until_success(session, ajax_url,
                                                     headers)
                ajax_html = etree.HTML(ajax_response.text)
                citation_links.extend(
                    ajax_html.xpath('//a[@target="_blank"]/@href'))
        else:
            print('IG token or citation count not found; '
                  'skipping citation pages')
        print('number of citation_links', len(citation_links), 'citation_num',
              citation_num)
        for i, citation_link in enumerate(citation_links):
            # Use a separate name so the profile_id argument is not clobbered.
            citation_profile = get_profile_id(citation_link)
            if citation_profile.get('title', False):
                paper.citations.append(citation_profile)
            print('get_profile_id: {}/{}\r'.format(i + 1,
                                                   len(citation_links)),
                  end='')
        print('\nnumber of ids:', len(paper.citations))
    except Exception as e:
        # Best effort: report the parsing problem and still save what we have.
        print(e)
    paper.save()
    # Citations are not crawled recursively by this function.
    return paper.citations