Example #1
 def crawl_volume_page(self,
                       volume_item,
                       AllItemsPageParser,
                       JournalArticle,
                       use_tor=False,
                       check_pdf_url=True):
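     # AllItemsPageParser and JournalArticle are passed in as classes: the
     # parser is instantiated on the fetched HTML below, and the article
     # class is forwarded to crawl_articles.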
     volume_link = self.handle_volume_link_for_multi_results(volume_item[0])
     volume_db_id = volume_item[1]
     if use_tor:
         # Used when pdf_url can never be fetched directly; route through the remote server to speed things up.
         html_source = request_with_proxy(volume_link).text
     else:
         html_source = request_with_random_ua(volume_link).text
     parser = AllItemsPageParser(html_source)
     try:
         sections = parser.sections
     except Exception as e:
         print('[Error] JournalSpider: page invalid: {}\n'
               '    error_url: {}'.format(str(e), volume_link))
         return False
     try:
         volume_year = parser.volume_year
         print('Volume_year:{}'.format(volume_year))
     except AttributeError:
         volume_year = None
     print('\nPage Url: %s ' % volume_link)
     if self.crawl_articles(sections, volume_year, volume_db_id,
                            JournalArticle, check_pdf_url) is False:
         return False
     if sections and not self.debug:
         self.mark_volume_ok(volume_db_id)
         return True
     return False
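
Every example in this listing calls `request_with_proxy` (and Example #1 also `request_with_random_ua`), but neither helper is shown here. The sketch below is only an assumption of what they might look like, inferred from the call sites above: the parameter names match the calls, while the User-Agent list, the proxy address, and the fact that `no_proxy_test` and `use_self_pool` are left uninterpreted are placeholders, not the original implementation.

import random
import time

import requests

# Small User-Agent pool; the real project presumably keeps a longer list.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

def request_with_random_ua(url, timeout=15):
    # Plain GET with a randomly chosen User-Agent header.
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=timeout)

def request_with_proxy(url, timeout=15, gap_time=0,
                       no_proxy_test=False, use_self_pool=False):
    # Optional pause between requests, then GET through a proxy.
    # proxy_url is a placeholder; no_proxy_test and use_self_pool are accepted
    # only so the signature matches the call sites above.
    if gap_time:
        time.sleep(gap_time)
    proxy_url = 'http://127.0.0.1:8118'  # placeholder proxy address
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=timeout,
                        proxies={'http': proxy_url, 'https': proxy_url})
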
Example #2
 def generate_htmls(self):
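     # Fetch self.url five times and save each response body as <i>.html in self.folder.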
     for i in range(5):
         print(i)
         resp = request_with_proxy(self.url, no_proxy_test=True)
         with open(os.path.join(self.folder, '{}.html'.format(i)),
                   'wb') as html_file:
             html_file.write(resp.content)
 def get_google_item_by_search(self):
     search_url = (
         'https://scholar.google.com'
         '/scholar?q={}&btnG=&hl=en&lr=lang_en&as_sdt=0%2C5'
     ).format(self.ArticleObj.title.strip())
     #print('searching {}...'.format(search_url))
     req = request_with_proxy(
         url=search_url,
         timeout=10,
         gap_time=random.choice(range(0, 3)),
         # use_self_pool=random.choice([True, False])
         use_self_pool=False,
     )
     parser = PageParser(html_source=req.text)
     sections = parser.sections
     if len(sections) != 1:
         if parser.robot_error:
             raise ConnectionError(
                 'Robot Error: {}'.format(parser.robot_error))
         raise LookupError(
             'Locate Article Error: multiple or no results, len: {}'
             '\n{}\n'.format(len(sections), search_url)
         )
     return GoogleArticle(sec=sections[0])
Example #4
 def __init__(self,
              from_web=True,
              url=None,
              no_proxy_test=False,
              file_name=None):
     '''
         Set from_web = False when testing the parser against a local file.
     '''
     if from_web and url:
         #print("from web")
         self.html = request_with_proxy(url,
                                        no_proxy_test=no_proxy_test).text
     else:
         print("from local file")
         with open(file_name, 'rb') as f:
             self.html = f.read()
     self.soup = BeautifulSoup(self.html, 'lxml')
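
Example #4 shows only the constructor body, so the class name below is a placeholder; a minimal usage sketch of the two code paths (local file vs. live page through the proxy helper):

# Parse a saved page without touching the network (class name is hypothetical).
parser = SomePageParser(from_web=False, file_name='saved_volume.html')

# Parse a live page via request_with_proxy, skipping the proxy test.
parser = SomePageParser(url='https://example.org/volume/1', no_proxy_test=True)
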
Example #5
 def text(self):
     for i in range(1, 10):
         url = self.url
         print(
             'Bibtex:\n\tattempt {}: trying to get bibtex for article_id = {}\n\turl:{}'
             .format(i, self.article_id, url))
         bibtex_response = request_with_proxy(url)
         print('Bibtex:\n\tbibtex site status code: {}'.format(
             bibtex_response.status_code))
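         # A requests.Response is truthy only for status codes below 400.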
         if bibtex_response:
             bibtex = bibtex_response.text
             print(
                 'Bibtex:\n\t[SUCCESS] got new bibtex for article {}:\n{}'
                 .format(self.article_id, bibtex))
             return bibtex
         time.sleep(random.randint(1, 4))
     return None
Example #6
 def crawl(self, unfinished_item):
     article_id = unfinished_item[0]
     google_id = unfinished_item[1]
     url = 'https://scholar.google.com/scholar?q=info:{}:scholar.google.com/&output=cite&scirp=0&hl=en'.format(
         google_id)
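     # The output=cite endpoint returns Google Scholar's citation pop-up, which links to the BibTeX export.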
     print("BibtexSpider:\n\tid: {0} google_id: {1} \n\turl:{2}".format(
         article_id, google_id, url))
     for i in range(1, 10):
         print(
             'BibtexSpider:\n\tattempt {}: opening the first page for article_id = {}...'
             .format(i, article_id))
         response = request_with_proxy(url)
         if response.status_code == 200:
             print("BibtexSpider:\n\tFirst page ok!Response 200...")
             Bibtex(soup=BeautifulSoup(response.text, "lxml"),
                    article_id=article_id).save_to_db()
             return
         else:
             print('BibtexSpider:\n\tFirst page visit error: Response {}'.
                   format(response.status_code))
         time.sleep(random.randint(1, 4))