import time

import requests
from lxml import etree

# Assumption: MySQL is this project's own thin DB wrapper exposing
# connect() / execute() / close(); adjust the import path to your module.
from mysql_helper import MySQL


class Spider(object):
    def __init__(self):
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.90 Safari/537.36',
            'Sec-Fetch-Mode': 'no-cors',
            'Host': 'arxiv.org'
        }
        self._sess = requests.Session()
        self._sleep_time = 5
        self._mysql = MySQL()

    def _get_detail(self, url):
        # Retry until the page is fetched; on a network error, rebuild
        # the session, back off, and try again.
        while True:
            try:
                content = self._sess.get(url, headers=self._headers).content
                break
            except Exception as e:
                print(e)
                self._sess.close()
                self._sess = requests.Session()
                time.sleep(self._sleep_time)
        html = etree.HTML(content)
        # An IndexError from the xpath lookups below means the abstract
        # page does not exist; the caller relies on it to detect the end
        # of the ID range.
        title = html.xpath('//h1[@class="title mathjax"]/text()')[0].strip()
        # authors = ','.join(html.xpath('//div[@class="authors"]/a/text()')).strip()
        abstract = html.xpath(
            '//blockquote[@class="abstract mathjax"]/text()')[0].strip()
        subjects = html.xpath('string(//td[@class="tablecell subjects"])').strip()
        arxiv = url.split('/')[-1]
        print(arxiv)
        return (arxiv, title, abstract, subjects)

    def crawl_arxiv_n(self, begin, stop):
        self._mysql.connect()
        index_error_count = 0
        for i in range(begin, stop + 1):
            try:
                result = self._get_detail(
                    'https://arxiv.org/abs/1709.{:05d}'.format(i))
                index_error_count = 0
                self._mysql.execute(
                    'INSERT IGNORE INTO `rec_arxiv_paper` '
                    '(`arxiv`, `title`, `abstract`, `subjects`) VALUES '
                    '(%s, %s, %s, %s)', result)
                time.sleep(self._sleep_time // 5)
                # Refresh the session every 150 requests. (The original
                # `if i % 150:` did the opposite: it rebuilt the session
                # on every iteration that was NOT a multiple of 150.)
                if i % 150 == 0:
                    self._sess.close()
                    self._sess = requests.Session()
            except IndexError:
                # More than five consecutive missing IDs: assume we have
                # run past the last paper of the month and stop.
                index_error_count += 1
                if index_error_count > 5:
                    break
        self._mysql.close()
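# Minimal usage sketch, assuming the MySQL helper is already configured and
# the `rec_arxiv_paper` table exists with columns matching the INSERT above.
# The ID range (1 to 100, i.e. arXiv 1709.00001 through 1709.00100) is an
# illustrative choice, not part of the original code.
if __name__ == '__main__':
    spider = Spider()
    spider.crawl_arxiv_n(1, 100)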