Example #1
    def get(self, url, user_agent=None, cookies=None, respect_robots=False, cache=False):
        logger.log_info('GET %s' % url)
        # Lazily connect to the cache backend on first use
        if cache and not self.cache_db:
            self.cache_connect()

        # Honor robots.txt only when both the global setting and the per-call flag ask for it
        if config.respect_robots and respect_robots and not robots.can_fetch(url):
            return None
        if cache:
            cache_result = self.cache_db.get(str(url))
            if cache_result:
                return pickle.loads(cache_result)
            else:
                logger.log_info('URL[%s] not found in cache' % url)
        headers = dict()
        if config.user_agent:
            headers['User-Agent'] = config.user_agent
        if user_agent:
            headers['User-Agent'] = user_agent
        res = requests.get(url, headers=headers, cookies=cookies)
        if res:
            # A requests.Response is truthy only for status codes below 400
            # Close the underlying connection instead of reusing it
            res.connection.close()
            logger.log_info('GET[%s] HTTP(%s)' % (url,res.status_code))
            lite_res = LiteResponse()
            lite_res.content = res.content
            lite_res.cookies = requests.utils.dict_from_cookiejar(res.cookies)
            lite_res.headers = res.headers
            lite_res.status_code = res.status_code
            if self.cache_db:
                pickled = pickle.dumps(lite_res)
                self.cache_db.put(str(url), pickled)
                logger.log_info('Put content at URL %s into cache' % url)
            return lite_res
        else:
            logger.log_info('Request for URL %s failed with HTTP(%s)' % (url, res.status_code))
        return None
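
The cache layer pickles a LiteResponse rather than the live requests.Response, which holds an open connection and is not reliably picklable. The class itself is not shown in this example; a minimal sketch of what it could look like, assuming it only needs the four fields populated above (the name matches the code, everything else is hypothetical):

import pickle

class LiteResponse(object):
    """Hypothetical picklable stand-in for the fields used by get()."""
    def __init__(self):
        self.content = None      # raw response body (bytes)
        self.cookies = {}        # cookies as a plain dict
        self.headers = {}        # response headers
        self.status_code = None  # HTTP status code

# Unlike a live requests.Response, this round-trips cleanly through pickle
lite = LiteResponse()
lite.status_code = 200
assert pickle.loads(pickle.dumps(lite)).status_code == 200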
Example #2
    def run(self):
        with self.get_job(start_offset=self.crawl_delay) as job:
            # Defined up front so the URL can be included in the exception log below
            url = None
            try:
                if not job:
                    return
                url = job.get_url()
                #TODO: make respecting robots.txt configurable per job
                resp_args = {
                    'cache': self.cache_requests,
                    # Default to respecting robots.txt when the config value is unset;
                    # an explicit False in config should not be silently overridden
                    'respect_robots': config.respect_robots if config.respect_robots is not None else True,
                }
                cookies = job.get_cookies()
                if cookies:
                    resp_args['cookies'] = cookies
                resp = job.get(url,**resp_args)
                if not resp:
                    logger.log_info('Failed to get url: ' + url)
                    return StrategyResult.PASSED
                if resp.status_code != 200:
                    logger.log_info('[WARNING] Job with URL %s failed with code(%s)' % \
                        (url, resp.status_code))
                    return StrategyResult.PASSED

                if self.can_scrape(url, job):
                    #maybe this can be put together into the same step
                    items = self.get_scraped_items(resp, job)
                    for item in items:
                        self.on_item_scraped(item, job)

                content_type = resp.headers.get('content-type')
                sitemap_links = None
                # Treat the response as a sitemap only if it looks like XML
                # and actually parses as one
                if (content_type in ('text/xml', 'application/xml') or url.endswith('.xml')) \
                        and sitemap.is_sitemap(resp.content):
                    sitemap_links = sitemap.get_links(resp.content)


                follow_filter = job.get_follow_filter()
                if sitemap_links:
                    # Sitemap entries expose their URL via .loc
                    links = [link.loc for link in sitemap_links]
                elif follow_filter:
                    links = linkparser.parse_links(
                        resp.content,
                        url,
                        restrict_to_host=True,
                        selector_filter=follow_filter
                    )
                else:
                    links = linkparser.parse_links(
                        resp.content,
                        url,
                        restrict_to_host=True
                    )
                if not follow_filter:
                    # Without a selector filter, fall back to the job's allowed-links rules
                    links = [link for link in links
                             if linkparser.can_follow(link, job.get_allowed_links())]
                for link in links:
                    job.add_url(link)

                if resp.cookies:
                    cookies = resp.cookies
                    # LiteResponse stores cookies as a plain dict, but convert
                    # defensively if a raw cookie jar comes through
                    if isinstance(cookies, RequestsCookieJar):
                        cookies = requests.utils.dict_from_cookiejar(cookies)
                    job.set_cookies(cookies)

            except Exception as e:
                logger.log_error('[%s] Exception caught %s' % (url, e))
                return StrategyResult.PASSED
            return StrategyResult.SUCCESS
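
run() reports its outcome through StrategyResult, which is also not part of this example. A minimal sketch, assuming only the two members referenced above are needed (values and comments are placeholders, not the project's actual definition):

from enum import Enum

class StrategyResult(Enum):
    SUCCESS = 1  # the job's URL was fetched and processed
    PASSED = 2   # the job was skipped or failed and was not completed here

# Callers can branch on the result of run(), e.g.:
result = StrategyResult.PASSED
if result is not StrategyResult.SUCCESS:
    print('job not completed')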