Example #1
class Downloader:
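    # Dispatches HTTP requests through AsyncSession so each download returns a
    # Deferred instead of blocking the Twisted reactor.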
    def __init__(self, crawler):
        logger.debug(f"New Downloader with spider {crawler.spider.name}")
        self.session = AsyncSession(n=100)
        self.crawler = crawler

    def download(self, request):
        logger.debug(f"DOWNLOADER downloading {request.method} {request.url}")
        if request.method == "GET":
            d = self.session.get(
                request.url,
                #params=request.params,
                allow_redirects=request.allow_redirects)
        elif request.method == "POST":
            d = self.session.post(
                request.url,
                #params=request.params,
                data=request.body,
                allow_redirects=request.allow_redirects)
        else:
            d = defer.Deferred()
            reactor.callLater(0, d.errback, ValueError(f"Undefined method found: {request.method}"))
            return d            
        return d.addCallback(self.send_response, request)

    def start(self):
        logger.debug("Starting Downloader")
        for c in self.crawler.spider.preload_cookies:
            logger.debug(f"Downloader adding preloading cookie: {c['name']} for {c['domain']}")
            cookie = requests.cookies.create_cookie(
                domain=c['domain'],
                name=c['name'],
                value=c['value'])
            self.session.cookies.set_cookie(cookie)

    @staticmethod
    def send_response(response, request):
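        # Repackage the finished requests-style response as the crawler's own Response object.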
        return Response(
            request=request,
            status=(
                response.status_code,
                response.reason),
            body=response.content,
            cookies=response.cookies,
            encoding=response.encoding,
            headers=response.headers,
            meta=request.meta
        )

    @staticmethod
    def is_busy():
        return False

    @staticmethod
    def close():
        logger.debug("Close Downloader")
Example #2
def main(reactor):
    with open('event_info.txt', 'w+') as f:
        session = AsyncSession(n=RESULTS_PER_PAGE)
        i = 0
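        # Each pass fetches one listing page synchronously with requests, then
        # fetches every linked event page concurrently through the AsyncSession.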
        while True:
            print('%d events written...' % i)
            responses = []
            try:
                r = requests.get(URL.format(results_per_page=RESULTS_PER_PAGE,
                                            page_start=i),
                                 timeout=30)
            except ReadTimeout as e:
                print('timed out paging through URLs')
                sys.exit(1)
            if r.status_code != 200:
                print('non 200 status code %d: i=%d' % (r.status_code, i))
                sys.exit(1)
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                results = soup.find(id='result_body').find(
                    class_='list04').find_all('li')
            except AttributeError:
                print('invalid page')
                sys.exit(1)
            if not results:
                break
            for li in results:
                url = 'http://www.nihon-kankou.or.jp/%s' % li.find(
                    'a', href=True)['href']
                responses.append(session.get(url, timeout=30))
            for idx, r in enumerate(responses):
                try:
                    r = yield r
                except ConnectionError as e:
                    print(
                        'failed on request number: %d; please lower RESULTS_PER_PAGE'
                        % idx)
                    raise e
                except ReadTimeout as e:
                    print('timed out requesting event page')
                    sys.exit(1)
                try:
                    scrape_page(r, f)
                except Exception:
                    print('error with url: %s' % r.url)
                    raise
            i += RESULTS_PER_PAGE
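The excerpt assumes module-level setup that is not shown: the imports, the URL template and RESULTS_PER_PAGE constants, and a scrape_page() helper. Because main(reactor) yields Deferreds, it is presumably decorated with @inlineCallbacks and driven by Twisted's react(). A self-contained sketch of that pattern, assuming the AsyncSession in use is requests_threads.AsyncSession and with httpbin.org standing in for the real site:

from requests_threads import AsyncSession
from twisted.internet.defer import inlineCallbacks
from twisted.internet.task import react

session = AsyncSession(n=10)

@inlineCallbacks
def main(reactor):
    # Fire off all requests first; each call returns a Deferred immediately.
    responses = [session.get('http://httpbin.org/get') for _ in range(10)]
    # Then wait for the results one by one.
    for d in responses:
        r = yield d
        print(r.status_code)

react(main)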
Example #3
def main(reactor):
    t1 = time.time()
    responses = []

    session = AsyncSession(n=args.num_threads)
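    # session.get() returns a Deferred immediately; the requests run concurrently
    # and each yield in the second loop waits for one of them to finish.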

    records = list(work_unit.to_records('dict'))
    for record in records:
        responses.append(session.get(record['url'], verify=False))

    # Pair each response with its record so a failure reports the right URL.
    for record, response in zip(records, responses):
        try:
            r = yield response
            async_worker_helper.process(r)
        except Exception:
            print('issue with ' + record['url'])

    # metadata_collection and text_collection are presumably populated by
    # async_worker_helper.process(); neither is defined in this excerpt.
    return (metadata_collection, text_collection)
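The same scaffolding is assumed here: args, work_unit, async_worker_helper, metadata_collection and text_collection all come from the surrounding project, and the yield-based main(reactor) again suggests @inlineCallbacks plus react(). A hypothetical driver with the body elided:

from requests_threads import AsyncSession
from twisted.internet.defer import inlineCallbacks
from twisted.internet.task import react

@inlineCallbacks
def main(reactor):
    ...  # body as in the example above

react(main)

Note that react() stops the reactor and exits the process once main's Deferred fires; the returned (metadata_collection, text_collection) tuple is not handed back to the caller, which suggests those collections are shared module-level state filled in by async_worker_helper.process().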