import copy
import re
import time


def handle_second_page(url, attrs):
    # Fetch the second-level page.
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the DOI link.
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # No DOI link found; fall back to the link located via '@'.
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    # Build a list of parent anchors; test for emptiness, not None
    # (the original map object could never be None).
    links = [tmp.find_parent('a') for tmp in raw_links]
    if not links:
        logger.info('handle_second_page: no electronic edition link found: ' + str(url))
    for raw_url in links:
        # handle_third_page expects an iterable of URLs, so wrap the single
        # href in a list; it returns the metadata dict of the last paper.
        paper_dict = handle_third_page([raw_url.get('href')], attrs)
        tmp = raw_url.find_parent('li', class_='drop-down')
        if tmp is not None:
            temp = tmp.find_next_sibling('li', class_='drop-down')
            if temp is not None:
                raw_ris = temp.select_one(
                    'div[class="body"] > ul:nth-of-type(1) > li:nth-of-type(2) > a')
                if raw_ris is not None:
                    download_paper_info(raw_ris.get('href'), root_dir, paper_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
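
# The helpers used by these handlers are defined elsewhere in the project.
# A minimal sketch of get_html_text, assuming static pages are fetched with
# requests and parsed with BeautifulSoup (the User-Agent and timeout values
# are illustrative, not the project's actual settings):
import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    # Return the parsed page, or None on any network/HTTP failure, matching
    # the "if soup is None" / "if page_content is None" checks above.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'},
                                timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException:
        return None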

def handle_third_page(urls, attrs):
    # Collect each paper's descriptive metadata (title, authors, abstract,
    # keywords, references) and persist it.
    data_dict = None
    for url in urls:
        soup = get_html_str(get_phantomjs_page(url))
        if soup is None:
            logger.info('failed to fetch level-3 page: ' + str(url))
            continue  # skip this URL instead of aborting the whole batch
        link = soup.find('link', attrs={'rel': 'canonical'})
        if link:
            link = link.get('href')
        else:
            logger.info('handle_third_page: no canonical redirect link found: ' + str(url))
            continue
        soup = get_html_str(get_phantomjs_page(link))
        if soup is None:
            logger.info('failed to fetch canonical page: ' + str(link))
            continue
        data_dict = copy.deepcopy(attrs)  # deep copy so the caller's attrs stay intact
        data_dict['url'] = link  # store the paper's real link address
        h1 = soup.find('h1', class_='svTitle')
        if h1:
            data_dict['title'] = h1.get_text().strip()
        ul = soup.find('ul', class_='authorGroup noCollab svAuthor')
        if ul:
            a_list = ul.find_all_next('a', class_='authorName svAuthor')
            authors_dict = dict()
            for a in a_list:
                affiliation_dict = dict()
                affiliation_dict['affiliation'] = ''
                affiliation_dict['affiliation_name'] = ''
                affiliation_dict['affiliation_country'] = ''
                author_name = a.get_text().strip()
                author_name = re.sub(r'[._$]', ' ', author_name)
                authors_dict[author_name] = affiliation_dict
            data_dict['author'] = authors_dict
        h2 = soup.find('h2', text=re.compile(r'Abstract'))
        if h2:
            p = h2.find_next_sibling('p')
            if p:  # guard against a missing abstract paragraph
                data_dict['abstract'] = p.get_text()
        h2 = soup.find('h2', text=re.compile(r'Keywords'))
        if h2:
            ul = h2.find_next_sibling('ul')
            if ul:
                keywords = list()
                for keyword in ul.find_all_next('li', class_='svKeywords'):
                    keywords.append(keyword.get_text().strip())
                data_dict['keywords'] = keywords
        h2 = soup.find('h2', text=re.compile(r'References'))
        if h2:
            references = list()
            for li in h2.find_all_next('li', class_='title'):
                references.append(li.get_text().strip())
            data_dict['reference'] = references
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
    return data_dict  # metadata of the last paper processed (or None)
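
# The dynamic pages are rendered through get_phantomjs_page and parsed with
# get_html_str. A minimal sketch, assuming Selenium with the (long-deprecated)
# PhantomJS driver; the project's actual driver, waits, and options are not
# shown here and are assumptions:
from selenium import webdriver


def get_phantomjs_page(url):
    # Render the page in a headless browser and return its HTML source,
    # or None if the page cannot be loaded.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        return driver.page_source
    except Exception:
        return None
    finally:
        driver.quit()


def get_html_str(html_str):
    # Parse an HTML string into a BeautifulSoup document; None passes through.
    if html_str is None:
        return None
    return BeautifulSoup(html_str, 'html.parser')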

def handle_first_page(url, attrs):
    # Fetch the first-level page.
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('failed to fetch level-1 page: ' + str(url))
        return None
    # find_all() returns a (possibly empty) list, never None.
    raw_links = page_content.find_all('a', text='[contents]')  # conference proceedings
    if not raw_links:
        raw_links = page_content.find_all('a', text=re.compile(r'Volume'))  # journals
    links = [raw_link.get('href') for raw_link in raw_links]
    for url in links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))

def handle_first_page(url, attrs):
    # Fetch the first-level page (journal index variant).
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('failed to fetch level-1 page: ' + str(url))
        return None
    raw_links = list()
    a_list = page_content.select(
        'a[href^="http://dblp.uni-trier.de/db/journals/"]')
    for a in a_list:
        temp = a.get('href')
        # Skip the bare journal index URL; keep only individual journal links.
        if temp != 'http://dblp.uni-trier.de/db/journals/':
            raw_links.append(temp)
    for url in raw_links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
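
# Every crawl loop above sleeps for a random interval between requests as a
# politeness delay. A one-line sketch of get_random_uniform, assuming it
# simply wraps random.uniform:
import random


def get_random_uniform(begin, end):
    # Random delay, in seconds, drawn uniformly from [begin, end].
    return random.uniform(begin, end)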

def run_ieee_update():
    while True:
        hour = int(time.strftime('%H'))
        if START_HOUR <= hour <= END_HOUR:
            init_dir(log_dir)
            init_dir(root_dir)
            try:
                logger.warning('update_ieee started normally!')
                update_ieee(ieee_updates_url)
            except Exception:
                logger.exception('update_ieee stopped abnormally!')
            else:
                # This round of updates is done; sleep for 0.5h to 1.0h.
                sleep_time = get_random_uniform(begin=30 * 60, end=1 * 60 * 60)
                logger.warning('update_ieee stopped normally! Sleeping for '
                               'about {:.2f} minutes...'.format(sleep_time / 60))
                time.sleep(sleep_time)
        else:
            # Outside the working window; sleep an hour before rechecking.
            time.sleep(1 * 60 * 60)
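
# run_ieee_update prepares its log and output directories with init_dir.
# A minimal sketch, assuming it only needs to create a directory tree when
# it does not yet exist:
import os


def init_dir(dir_path):
    # Create dir_path (including parents); do nothing if it already exists.
    os.makedirs(dir_path, exist_ok=True)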

def handle_second_page(urls):
    links = list()

    def collect_result_links(content):
        # Pull every article-abstract link out of one results page.
        ul = content.find('ul', class_='results')
        if ul is not None:
            for div in ul.find_all_next('div', class_='txt'):
                temp = div.find_next('a', class_='art-abs-url')
                if temp is not None:
                    links.append('http://ieeexplore.ieee.org' + temp.get('href'))

    for url in urls:
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('failed to fetch level-2 page: ' + str(url))
            continue  # keep the links collected so far instead of bailing out
        collect_result_links(page_content)
        # Find the pagination widget, read the total page count, and request
        # every remaining page of results.
        pagination = page_content.find('div', class_='pagination')
        if pagination is not None:
            a_list = pagination.select('a[aria-label^="Pagination Page"]')
            if a_list:
                page_number = int(a_list[-1].get_text().strip())
                for number in range(2, page_number + 1):
                    tmp_url = url + '&pageNumber=' + str(number)
                    page_content = get_html_str(get_phantomjs_page(tmp_url))
                    if page_content is None:
                        logger.info('failed to fetch level-2 page: ' + str(tmp_url))
                        break  # give up on the remaining pages of this result set
                    collect_result_links(page_content)
        else:
            logger.info('handle_second_page: no pagination found: ' + str(url))
        time.sleep(get_random_uniform(begin=5.0, end=10.0))
    # Process every collected level-3 (paper detail) link.
    handle_third_page(links)

def handle_third_page(urls):
    for url in urls:
        data_dict = dict()
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('failed to fetch paper detail page: ' + url)
            continue
        # Paper URL.
        data_dict['url'] = url
        # Paper type.
        data_dict['category'] = 'conference'
        # Date of the IEEE update batch.
        data_dict['update_time'] = time.strftime('%Y%m%d', time.localtime())
        # Time the paper was crawled.
        data_dict['spider_time'] = time.strftime('%Y.%m.%d %H:%M:%S',
                                                 time.localtime())
        # Paper title (slice off the fixed-length site suffix of <title>).
        if page_content.title is not None:
            data_dict['title'] = page_content.title.string[:-22].strip()
        # Abstract.
        abstract = page_content.find('div', class_='abstract-text ng-binding')
        if abstract is not None:
            data_dict['abstract'] = abstract.get_text()
        # Publication date.
        date = page_content.find('strong', text='Date of Publication:')
        if date is not None:
            div = date.find_parent('div')
            if div is not None:
                date = re.split(r':', div.get_text())[-1].strip()
                data_dict['publication_date'] = date
        # Keywords.
        ul = page_content.find('ul', class_='doc-all-keywords-list')
        if ul is not None:
            keywords = list()
            for span in ul.find_all_next('span'):
                temp = span.find_next('a', class_='ng-binding')
                if temp is not None:
                    keywords.append(temp.get_text().strip())
            data_dict['keywords'] = keywords
        # Authors.
        h2 = page_content.find('h2', text='Authors')
        if h2 is not None:
            div = h2.find_next_sibling('div', class_='ng-scope')
            if div is not None:
                temp = div.select(
                    'a[href^="/search/searchresult.jsp?searchWithin="]')
                if temp:  # select() returns a list, so test for emptiness
                    authors_dict = dict()  # author name -> affiliation info
                    for a in temp:
                        affiliation_dict = dict()
                        span = a.find_next('span', class_='ng-binding')
                        if span is not None:
                            author_name = span.get_text().strip()
                            author_name = re.sub(r'[._$]', ' ', author_name)
                            tmp = a.parent.find_next_sibling(
                                'div', class_='ng-binding')
                            if tmp is not None:
                                affiliation = tmp.get_text().strip()
                                data_list = re.split(r',', affiliation)
                                affiliation_dict['affiliation'] = affiliation
                                if data_list:  # re.split never returns None
                                    affiliation_dict['affiliation_country'] = \
                                        data_list[-1]
                            authors_dict[author_name] = affiliation_dict
                    data_dict['author'] = authors_dict
        # References.
        page_content = get_html_str(
            get_phantomjs_page(url + 'references?ctx=references'))
        if page_content is not None:
            h2 = page_content.find('h2', text='References')
            if h2 is not None:
                references = list()
                for div in h2.find_next_siblings(
                        'div', class_='reference-container ng-scope'):
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        references.append(div_temp.get_text().strip())
                data_dict['references'] = references
        # Citations.
        page_content = get_html_str(get_phantomjs_page(
            url + 'citations?anchor=anchor-paper-citations-ieee&ctx=citations'))
        if page_content is not None:
            citations = list()
            # Cited in Papers - IEEE
            h2 = page_content.find('h2',
                                   text=re.compile(r'Cited in Papers - IEEE'))
            if h2 is not None:
                for div in h2.find_next_siblings('div', class_='ng-scope'):
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            # Cited in Papers - Other Publishers
            h2 = page_content.find(
                'h2', text=re.compile(r'Cited in Papers - Other Publishers'))
            if h2 is not None:
                for div in h2.find_next_siblings('div', class_='ng-scope'):
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            data_dict['citations'] = citations
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=1.0, end=20.0))
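
# Every handler finishes by persisting data_dict through write_to_database.
# A minimal sketch, assuming MongoDB via pymongo; the connection string and
# the database/collection names are illustrative assumptions:
from pymongo import MongoClient

_client = MongoClient('mongodb://localhost:27017/')
_papers = _client['paper_spider']['papers']


def write_to_database(data_dict):
    # Upsert keyed on the paper URL so a re-crawled paper updates its
    # existing record instead of inserting a duplicate.
    _papers.replace_one({'url': data_dict['url']}, data_dict, upsert=True)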