def write_db(self):
    """Store every entry of ``self.entry_list`` as a ``Paper`` record.

    For each entry a fresh ``Paper`` is created, every recognised field that
    is present in the entry is copied onto it, and ``paper.save()`` is
    called.  Fields missing from an entry are simply not assigned.

    ``year``, ``volume`` and ``number`` are converted with ``int()``; a value
    that ``int()`` rejects raises ``ValueError``/``TypeError`` and aborts the
    loop, exactly as a failed conversion always did here.
    """
    # (entry key, Paper attribute, optional converter) in assignment order.
    # ``booktitle`` and ``author`` are the only keys whose attribute name
    # differs from the entry key.
    field_map = (
        ("id", "id", None),
        ("type", "type", None),
        ("title", "title", None),
        ("author", "authors", None),
        ("year", "year", int),
        ("journal", "journal", None),
        ("booktitle", "book_title", None),
        ("publisher", "publisher", None),
        ("institution", "institution", None),
        ("volume", "volume", int),
        ("number", "number", int),
        ("pages", "pages", None),
        ("url", "url", None),
        ("doi", "doi", None),
        ("isbn", "isbn", None),
    )

    # Parenthesised single-argument form prints identically on Python 2
    # (print statement) and Python 3 (print function).
    print("len of entry list " + str(len(self.entry_list)))

    for entry in self.entry_list:
        paper = Paper()
        for key, attr, convert in field_map:
            # ``in`` replaces dict.has_key(), which no longer exists in
            # Python 3.
            if key in entry:
                value = entry[key]
                if convert is not None:
                    value = convert(value)
                setattr(paper, attr, value)
        paper.save()
def _get_with_retry(session, url, headers, delay=3.0):
    """Fetch *url* through *session*, retrying until the request succeeds.

    Any exception other than ``KeyboardInterrupt`` (including the HTTP error
    raised by ``raise_for_status()``) is printed and the request is retried
    after sleeping *delay* seconds.  There is no retry limit, so this blocks
    for as long as the request keeps failing.

    Returns the response with its encoding forced to UTF-8.
    """
    while True:
        try:
            response = session.get(url, headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            return response
        except KeyboardInterrupt:
            # Bare ``raise`` keeps the original exception and traceback.
            raise
        except Exception as e:
            time.sleep(delay)
            print(e)


def _collect_citation_links(session, headers, profile_id, ig, citation_num):
    """Page through the ajax paper listing and return all hrefs found."""
    base_url = (
        'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}'
        '&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}'
        '&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'
    )
    count = 9
    citation_links = []
    # NOTE(review): the number of pages is derived from ``count`` (9) while
    # every request asks for ``count + 1`` (10) items, and offset 0 is never
    # requested by this loop.  Kept as-is; confirm against the endpoint's
    # actual paging behaviour before changing it.
    for i in range(1, int(citation_num) // count):
        ajax_url = base_url.format(
            id=profile_id,
            first=i * (count + 1),
            count=count + 1,
            ig=ig,
            num=i,
            rt='2',
        )
        response = _get_with_retry(session, ajax_url, headers)
        html = etree.HTML(response.text)
        citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))
    return citation_links


def get_references_citations_by_id(profile_id):
    """Scrape one Bing Academic profile page and the papers listed for it.

    The profile page for *profile_id* is downloaded, its title, DOI,
    authors, abstract, publication year and citation count are copied onto a
    ``Paper``, the paged ajax listing is walked to collect links, and each
    link is resolved with ``get_profile_id``.  Resolved entries that carry a
    truthy ``'title'`` are appended to ``paper.citations``.  The paper is
    saved whether or not scraping completed.

    Args:
        profile_id: the profile id, or a dict holding it under the
            ``'profile_id'`` key.

    Returns:
        ``[]`` when ``MONGO`` is truthy and a record with this id already
        exists in ``data_collection``; ``-1`` when no profile id was given;
        otherwise ``paper.citations``.

    Errors raised while parsing or crawling (other than
    ``KeyboardInterrupt``) are printed and scraping stops at that point;
    whatever was collected so far is still saved and returned.
    """
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
    if MONGO:
        # NOTE(review): if ``data_collection`` is a PyMongo collection,
        # Cursor.count() is gone in PyMongo 4 (count_documents() replaces
        # it) -- confirm the driver version in use.
        if data_collection.find({"id": profile_id}).count() > 0:
            # This record has already been crawled.
            return []
    print('func2')
    if not profile_id:
        return -1

    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/77.0.3865.75 Safari/537.36'
        ),
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    profile_url = (
        'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
        .format(profile_id)
    )

    # The context manager closes the session (and its pooled connections)
    # on every exit path, including KeyboardInterrupt.
    with requests.Session() as session:
        response = _get_with_retry(session, profile_url, headers)
        page_text = response.text

        # Both values are optional on the page; keep them as None instead of
        # leaving the names unbound when the pattern is absent.
        ig = None
        ig_match = re.search(r'IG:"(.*?)"', page_text)
        if ig_match:
            ig = ig_match.group(1)

        citation_num = None
        citation_match = re.search(
            r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
            page_text)
        if citation_match:
            citation_num = citation_match.group(1)

        html = etree.HTML(page_text)
        paper = Paper(save2mongo=MONGO)
        try:
            paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
            paper.id = profile_id
            if citation_num is not None:
                paper.citation_num = citation_num

            doi_match = re.search(
                r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
                page_text)
            if doi_match:
                paper.doi = doi_match.group(1)

            paper.authors = html.xpath(
                '//div[@class="aca_desc b_snippet"]/span//a/text()')
            paper.abstract = html.xpath(
                '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]

            year_match = re.search(
                r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
                page_text)
            if year_match:
                paper.publish_year = year_match.group(1)

            if ig is None or citation_num is None:
                # Without both values the paging URL cannot be built; the
                # metadata gathered above is still saved below.
                print('IG token or citation count not found; '
                      'skipping citation crawl')
            else:
                citation_links = _collect_citation_links(
                    session, headers, profile_id, ig, citation_num)
                print('number of citation_links', len(citation_links),
                      'citation_num', citation_num)
                for i, citation_link in enumerate(citation_links):
                    # Separate name so the ``profile_id`` argument is not
                    # clobbered by the per-link lookup result.
                    cited_profile = get_profile_id(citation_link)
                    if cited_profile.get('title', False):
                        paper.citations.append(cited_profile)
                    print('get_profile_id: {}/{}\r'.format(
                        i + 1, len(citation_links)), end='')
                print('\nnumber of ids:', len(paper.citations))
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Best effort: report the problem and keep what was scraped.
            print(e)

    paper.save()
    # NOTE: a recursive crawl over ``paper.citations`` used to be sketched
    # here as commented-out code; it is not performed.
    return paper.citations