def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        # The request failed; back off briefly and count the attempt.
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            # Drop the proxy that just failed before retrying.
            failed = Proxy.objects.get(address=proxy)
            if failed:
                failed.delete()
        except DoesNotExist:
            pass
        return save_search_result(p, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # This proxy has been banned; switch to a different one.
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(p, queue, retry)
    articles = results.find_all('div', lambda cls: cls and 'wx-rb' in cls)
    for article in articles:
        save_article(article)
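The find_all('div', <callable>) call above leans on BeautifulSoup treating a non-dict second argument as a CSS-class filter. A minimal, self-contained illustration follows; the markup is made up purely for the example, and the cls-and guard protects against tags that carry no class attribute.

from bs4 import BeautifulSoup

html = '<div class="wx-rb item">a</div><div class="news">b</div>'  # made-up markup
soup = BeautifulSoup(html, 'lxml')
# The callable receives each tag's class value; guard against tags without one.
hits = soup.find_all('div', lambda cls: cls and 'wx-rb' in cls)
print([tag.text for tag in hits])  # expected: ['a']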
def fetch(url):
    s = requests.Session()
    s.headers.update({'user-agent': get_user_agent()})
    proxies = {
        'http': Proxy.get_random()['address'],
    }
    html_text = s.get(url, timeout=TIMEOUT, proxies=proxies).text
    js_url = gen_js_url(url)
    try:
        js_data = s.get(js_url, timeout=TIMEOUT, proxies=proxies).json()
    except JSONDecodeError:
        raise RequestException()
    return html_text, js_data
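For illustration only, a caller might wrap this fetch in a small retry loop; fetch_with_retry, the retry count, and the error message are assumptions rather than part of the original code.

from requests.exceptions import RequestException

def fetch_with_retry(url, max_retries=3):
    # Hypothetical wrapper: try fetch() a few times before giving up.
    for _ in range(max_retries):
        try:
            return fetch(url)
        except RequestException:
            continue
    raise RequestException('gave up fetching {}'.format(url))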
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            # Drop the proxy that just failed before retrying.
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # This proxy has been banned; switch to a different one.
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print('retry too much!')
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    articles = results.find_all('div', lambda cls: cls and 'wx-rb' in cls)
    for article in articles:
        save_article(article)
    page_container = soup.find(id='pagebar_container')
    # u'下一页' is the "Next page" link text on the result page.
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
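A minimal sketch of how these greenlets might be driven, assuming gevent with monkey-patching and the put_new_page helper used above; the pool size, seed page, and idle timeout are illustrative choices, not the original scheduler.

from gevent import monkey
monkey.patch_all()  # let requests-based I/O yield to other greenlets

from gevent.pool import Pool
from gevent.queue import Queue, Empty

pool = Pool(10)                  # illustrative concurrency limit
page_queue = Queue()
put_new_page(1, page_queue)      # seed with the first result page

while True:
    try:
        page = page_queue.get(timeout=10)  # stop once no new pages show up
    except Empty:
        break
    pool.spawn(save_search_result, page, page_queue)
pool.join()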
async def fetch(url, retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)
    js_url = gen_js_url(url)
    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    html_text = await resp.text()
                async with session.get(js_url, headers=headers) as resp:
                    js_data = await resp.json()
    except Exception:
        # Any failure counts as one retry; give up after five attempts.
        retry += 1
        if retry > 5:
            raise CrawlerError()
        await asyncio.sleep(1)
        return await fetch(url, retry=retry)
    return html_text, js_data
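A hedged sketch of an asynchronous consumer built on this fetch; it mirrors the gevent save_search_result above, but the coroutine itself, its use of an asyncio.Queue, and the requeue-on-failure behaviour are assumptions for illustration.

async def save_search_result(page, queue):
    # Illustrative async counterpart of the gevent consumer above.
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        html_text, js_data = await fetch(url)  # js_data is ignored in this sketch
    except CrawlerError:
        await queue.put(page)  # hand the page back for a later attempt
        return
    soup = BeautifulSoup(html_text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        await queue.put(page)
        return
    for article in results.find_all('div', lambda cls: cls and 'wx-rb' in cls):
        save_article(article)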
async def fetch(retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)
    url = 'http://httpbin.org/ip'
    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    return await resp.json()
    except (ProxyConnectionError, TimeoutError):
        try:
            # The proxy did not respond; remove it from the pool.
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        retry += 1
        if retry > 5:
            raise TimeoutError()
        await asyncio.sleep(1)
        return await fetch(retry=retry)
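Driving the proxy check might look like the following, using the Python 3.5-era asyncio API that matches aiohttp.ProxyConnector above; the batch size of ten is an arbitrary illustrative value.

import asyncio

loop = asyncio.get_event_loop()
# Validate a batch of random proxies concurrently; each fetch() picks its own proxy.
tasks = [fetch() for _ in range(10)]
results = loop.run_until_complete(asyncio.gather(*tasks, return_exceptions=True))
for result in results:
    print(result)  # the httpbin.org/ip payload, or the exception that ended the check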