# Assumed imports: AsyncSession comes from requests_threads; Response and the
# module-level logger are defined elsewhere in the project.
import requests
from requests_threads import AsyncSession
from twisted.internet import defer, reactor


class Downloader:
    def __init__(self, crawler):
        logger.debug(f"New Downloader with spider {crawler.spider.name}")
        self.session = AsyncSession(n=100)
        self.crawler = crawler

    def download(self, request):
        logger.debug(f"DOWNLOADER downloading {request.method} {request.url}")
        if request.method == "GET":
            d = self.session.get(
                request.url,
                # params=request.params,
                allow_redirects=request.allow_redirects)
        elif request.method == "POST":
            d = self.session.post(
                request.url,
                # params=request.params,
                data=request.body,
                allow_redirects=request.allow_redirects)
        else:
            # Unknown HTTP method: fail the Deferred asynchronously.
            d = defer.Deferred()
            reactor.callLater(
                0, d.errback,
                ValueError(f"Undefined method found: {request.method}"))
            return d
        return d.addCallback(self.send_response, request)

    def start(self):
        logger.debug("Starting Downloader")
        for c in self.crawler.spider.preload_cookies:
            logger.debug(
                f"Downloader adding preloaded cookie: {c['name']} for {c['domain']}")
            cookie = requests.cookies.create_cookie(
                domain=c['domain'],
                name=c['name'],
                value=c['value'])
            self.session.cookies.set_cookie(cookie)

    @staticmethod
    def send_response(response, request):
        # Wrap the requests response in the project's own Response object.
        return Response(
            request=request,
            status=(response.status_code, response.reason),
            body=response.content,
            cookies=response.cookies,
            encoding=response.encoding,
            headers=response.headers,
            meta=request.meta)

    @staticmethod
    def is_busy():
        return False

    @staticmethod
    def close():
        logger.debug("Close Downloader")
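# A minimal, hypothetical driver for the Downloader above. make_crawler() and
# make_request() are placeholders for whatever builds the real crawler and
# Request objects in the original project; they are not part of this code.
from twisted.internet import reactor

downloader = Downloader(make_crawler())
downloader.start()  # preload cookies before issuing any requests

d = downloader.download(make_request())  # returns a Deferred
d.addCallback(lambda response: print(response.status, len(response.body)))
d.addErrback(lambda failure: print('download failed:', failure.getErrorMessage()))
d.addBoth(lambda _: reactor.stop())
reactor.run()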
# Assumed imports: AsyncSession from requests_threads, BeautifulSoup from bs4,
# and the requests exception classes; URL, RESULTS_PER_PAGE and scrape_page()
# are defined elsewhere in the original script.
import sys

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
from requests_threads import AsyncSession


def main(reactor):
    with open('event_info.txt', 'w+') as f:
        session = AsyncSession(n=RESULTS_PER_PAGE)
        i = 0
        while True:
            print('%d events written...' % i)
            responses = []
            # Fetch one listing page synchronously to discover the event URLs.
            try:
                r = requests.get(
                    URL.format(results_per_page=RESULTS_PER_PAGE, page_start=i),
                    timeout=30)
            except ReadTimeout:
                print('timed out paging through URLs')
                sys.exit(1)
            if r.status_code != 200:
                print('non-200 status code %d: i=%d' % (r.status_code, i))
                sys.exit(1)
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                results = soup.find(id='result_body').find(
                    class_='list04').find_all('li')
            except AttributeError:
                print('invalid page')
                sys.exit(1)
            if not results:
                break
            # Kick off the detail-page requests concurrently via the session.
            for li in results:
                url = 'http://www.nihon-kankou.or.jp/%s' % li.find(
                    'a', href=True)['href']
                responses.append(session.get(url, timeout=30))
            # Wait for each Deferred and scrape the resulting page.
            for idx, r in enumerate(responses):
                try:
                    r = yield r
                except ConnectionError as e:
                    print('failed on request number: %d; '
                          'please lower RESULTS_PER_PAGE' % idx)
                    raise e
                except ReadTimeout:
                    print('timed out requesting event page')
                    sys.exit(1)
                try:
                    scrape_page(r, f)
                except Exception:
                    print('error with url: %s' % r.url)
                    raise
            i += RESULTS_PER_PAGE
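# A sketch of how this generator would typically be wired up (the original
# script presumably does the equivalent): decorating it with inlineCallbacks
# makes each `yield` wait on the Deferred returned by session.get(), and
# twisted.internet.task.react() runs the reactor for the lifetime of the call.
from twisted.internet.defer import inlineCallbacks
from twisted.internet.task import react

react(inlineCallbacks(main))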
# Assumed imports: AsyncSession from requests_threads; args, work_unit,
# async_worker_helper, metadata_collection and text_collection come from the
# surrounding module.
import time

from requests_threads import AsyncSession


def main(reactor):
    t1 = time.time()
    responses = []
    session = AsyncSession(n=args.num_threads)
    # Issue all requests up front, keeping the URL alongside each Deferred so
    # failures can be reported against the right record.
    for record in work_unit.to_records('dict'):
        responses.append((record['url'], session.get(record['url'], verify=False)))
    for url, response in responses:
        try:
            r = yield response
            async_worker_helper.process(r)
        except Exception:
            print('issue with ' + url)
    return (metadata_collection, text_collection)
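# Hypothetical runner showing how the (metadata_collection, text_collection)
# return value can be collected: under inlineCallbacks, a plain `return` from
# the generator becomes the result the Deferred fires with.
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks

def on_done(result):
    metadata, text = result
    print('collected %d metadata records' % len(metadata))
    reactor.stop()

d = inlineCallbacks(main)(reactor)
d.addCallback(on_done)
d.addErrback(lambda failure: (failure.printTraceback(), reactor.stop()))
reactor.run()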