Example #1
    def process_response(self, request, response, spider):
        """Only allow HTTP response types that match the given regular
        expressions whitelist. Each spider must define a whitelist
        iterable containing regular expressions whose content types
        the spider wishes to download.
        """

        whitelist = getattr(spider, "whitelist", None)
        if not whitelist:
            return response

        content_type = response.headers.get('content-type', None)
        if not content_type:
            logging.info(
                "spider {}: ignored: {} does not contain a content-type header"
                .format(spider.name, response.url))
            raise IgnoreRequest()

        if self.is_content_type_okay(whitelist, content_type):
            return response

        logging.info(
            "spider {}: ignored: {} has type {}, which was not whitelisted".
            format(spider.name, response.url, content_type))
        raise IgnoreRequest()
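The is_content_type_okay helper referenced above is not shown in this example. A minimal sketch of how the spider-side whitelist and that check could look, assuming the spider stores compiled patterns and the middleware decodes the header bytes before matching (class names and the PDF pattern are illustrative assumptions, not part of the original):

import re

import scrapy


class PdfSpider(scrapy.Spider):
    """Hypothetical spider that only wants PDF and HTML responses."""
    name = "pdf"
    whitelist = [re.compile(r"application/pdf"), re.compile(r"text/html")]


class ContentTypeFilterMiddleware:
    """Hypothetical middleware class that the process_response above belongs to."""

    def is_content_type_okay(self, whitelist, content_type):
        # response.headers values are bytes in Scrapy; decode before matching.
        text = content_type.decode("utf-8", errors="replace")
        return any(pattern.search(text) for pattern in whitelist)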
Example #2
 def process_request(self, request, spider):
     if hasattr(
             request, 'meta'
     ) and 'webdriver' in request.meta and request.meta['webdriver'].get(
             'name', '') == 'selenium_grid':
         driver = self.get_driver()
         if driver:
             try:
                 meta = request.meta['webdriver']
                 action = meta.get('module', None)
                 if not action:
                     raise IgnoreRequest(
                         'selenium grid request must have "module" item in meta'
                     )
                 m = importlib.import_module('zeus_actions.' + action)
                 f = getattr(m, 'act', None)
                 if f is None or not callable(f):
                     raise IgnoreRequest(
                         'module %s must implement "act" method' % action)
                 driver.get(request.url)
                 f(driver)
                 body = driver.page_source
                 return HtmlResponse(url=request.url,
                                     body=body,
                                     request=request,
                                     encoding='utf-8')
             finally:
                 # driver.close()
                 pass
Example #3
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # if request.url.rstrip() in self.bing_archive:
        #     raise IgnoreRequest()

        for word in self.word_to_ignore:
            if word.lower() in request.url.lower():
                raise IgnoreRequest()

        for ext in self.extensions_to_ignore:
            if request.url.lower().endswith(ext):
                raise IgnoreRequest()
        # with open(self.bing_archive_path,'a') as f:
        #     f.write(request.url+"\n")
        self.visited_urls.append(request.url)
        return None
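As the comment block above notes, process_request only runs for middlewares that are enabled. A minimal sketch of the corresponding settings.py entry, assuming a hypothetical myproject.middlewares.UrlFilterMiddleware path:

# settings.py (module path and priority value are placeholders)
DOWNLOADER_MIDDLEWARES = {
    # Lower numbers run closer to the engine, higher numbers closer to the downloader.
    "myproject.middlewares.UrlFilterMiddleware": 543,
}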
Example #4
 def process_response(self, request, response, spider):
     """
     要考虑两种情况,一是被封,二是ip 失效
     :param request:
     :param response:
     :param spider:
     :return:
     """
     proxy_str = request.meta['proxy']
     proxy = ProxyItem.parse(proxy_str)
     # There is only one middleware instance, so under concurrency self.proxy would be inaccurate; get the proxy from the request instead
     code, _ = douyin.parse_result(response.body.decode())
     if code == 1:
         proxy_manager.success(proxy)
     elif code == 2:
         proxy_manager.banned(proxy)
         if douyin_spider.ANONYMOUS:
             # If anonymous, ignore and continue; if not anonymous, return the response for handling
             raise IgnoreRequest()
         else:
             return response
     else:
         proxy_manager.fail(proxy)
         raise IgnoreRequest()
     return response
Example #5
 def process_request(self, request, spider):  # todo pylint:disable=unused-argument
     """Process incoming request."""
     parsed_uri = urlparse(request.url)
     domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
     if '.onion' in domain:
         if domain[-7:-1] != '.onion':
             msg = 'Ignoring request %s, not .onion domain.' % domain
             logging.info(msg)
             raise IgnoreRequest()  # Not .onion domain
         # Drop connections to the old onion v2 addresses and other invalid domains
         if len(
                 domain.split('.')[-2].replace('http://', '').replace(
                     'https://', '')) != 56:
             msg = 'Ignoring request %s, not v3 onion domain.' % domain
             logging.info(msg)
             raise IgnoreRequest()  # Not a valid onion v3 address
         # List of proxies available
         if parsed_uri.scheme == "https":  # For those few HTTPS onion websites
             tor_proxy_list = settings.get('HTTPS_PROXY_TOR_PROXIES')
         else:  # Plain text HTTP without TLS
             tor_proxy_list = settings.get('HTTP_PROXY_TOR_PROXIES')
         # Always select the same proxy for the same onion domain
         # This will keep only one underlying Tor circuit to the onion service
         # Onion addresses form a uniform distribution
         # Therefore this address can be used as a seed for random
         hash = '{uri.netloc}'.format(uri=parsed_uri).replace(".onion", "")
         random.seed(hash)  # A seed for randomness is the onion domain
         # Always select the same proxy for the same onion address
         request.meta['proxy'] = random.choice(tor_proxy_list)
     elif ".i2p" in domain and ".i2p." not in domain:
         if parsed_uri.scheme == "https":
             request.meta['proxy'] = settings.get('HTTPS_PROXY_I2P')
         else:
             request.meta['proxy'] = settings.get('HTTP_PROXY_I2P')
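This middleware reads its proxy lists from the crawler settings. A minimal sketch of what those settings might contain, with placeholder addresses (any local HTTP proxy fronting Tor or I2P would do):

# settings.py (addresses and ports are placeholders)
HTTP_PROXY_TOR_PROXIES = ["http://127.0.0.1:8118", "http://127.0.0.1:8119"]
HTTPS_PROXY_TOR_PROXIES = ["http://127.0.0.1:8118"]
HTTP_PROXY_I2P = "http://127.0.0.1:4444"
HTTPS_PROXY_I2P = "http://127.0.0.1:4444"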
Example #6
    def process_request(self, request, spider):
        # Get meta prices for comparison
        price = request.meta.get('prop_price', False)
        discount_price = request.meta.get('discount_price', False)

        if price and discount_price:  # It's a discounted item
            # Format prices
            price = self.Utils.format_price(price)
            discount_price = self.Utils.format_price(discount_price)

            # Check price
            if price == 0 or discount_price == 0:
                raise IgnoreRequest('isRequestItem_price_null')

            if price == discount_price:
                raise IgnoreRequest('isRequestItem_prices_egal')

            # Check if exist
            if (request.url, str(discount_price), str(price)) in self.items:
                self.DB.update(request.url)
                raise IgnoreRequest('already exist')

            else:
                # Remove it in case
                self.DB.delete_one(request.url)

        elif price or discount_price:  # Only one of the two prices is present
            raise IgnoreRequest('not_discount_item')
Example #7
 def process_request(self, request, spider):
     path = urlparse(request.url).path or '/'
     for allow_path in spider.allow_path:
         if not match(escape(allow_path), path):
             raise IgnoreRequest("outside scope %s")
     for deny_path in spider.deny_path:
         if match(escape(deny_path), path):
             raise IgnoreRequest("outside scope %s")
Example #8
 def process_exception(self, request, exception, spider):
     # Use a proxy when an exception (e.g. a timeout) occurs
     print("\nException occurred, retrying with a proxy...\n")
     if isinstance(exception, pymysql.DatabaseError):
         raise IgnoreRequest("Database error, not handled")
     if isinstance(exception, (HttpError, DNSLookupError, TimeoutError)):
         lines = self.get_proxies()
         current_proxy = lines[random.randint(0, len(lines) - 1)].strip()
         # Attach the proxy to the current request
         print("Switching proxy to {}".format(current_proxy))
         raise IgnoreRequest("Max retries exceeded, {} skipped".format(request.url))
Example #9
 def process_exception(self, request, exception, spider):
     # Log when there are no new homes
     province, city = request.meta['data']
     if isinstance(exception, DNSLookupError) and isinstance(spider, XinFangSpider) and '/loupan' in request.url:
         spider.logger.error(f'{province}-{city} has no new homes...')
         raise IgnoreRequest()
         # return TextResponse(url=request.url, body='no new homes'.encode())
     # Log when there are no second-hand homes
     elif isinstance(exception, DNSLookupError) and spider.name == 'erShouFang' and '/ershoufang' in request.url:
         spider.logger.error(f'{province}-{city} has no second-hand homes...')
         raise IgnoreRequest()
     elif isinstance(exception, DNSLookupError) and spider.name == 'zuFang' and '/zufang' in request.url:
         spider.logger.error(f'{province}-{city} has no rental listings...')
         raise IgnoreRequest()
Example #10
 def process_response(self, request, response, spider):
     """Process response hook."""
     allowed_domains = getattr(spider, 'allowed_domains', None)
     blacklist_urls = getattr(spider, 'blacklist_urls', [])
     url = response.url
     # Offsite check
     if not is_url_in_domains(url, allowed_domains):
         raise IgnoreRequest(request)
     # Blacklist check
     for bad_url in blacklist_urls:
         if (isinstance(bad_url, str) and url == bad_url) \
         or bad_url.search(url):
             raise IgnoreRequest(request)
     return response
Example #11
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

        # Drop requests for "other-make"
        if 'other-make' in request.url:
            raise IgnoreRequest('Other make ignored.')

        # Drop duplicate Request
        if request.url in self.requests:
            raise IgnoreRequest('Dropping duplicate {0}'.format(request.url))

        # Otherwise, add URL to the seen set
        else:
            self.requests.add(request.url)
Example #12
 def process_response(self, request, response, spider):
     text = response.text
     if text.startswith("<script>") and "__jsl_clearance" in text:
         if '__jsl_clearance' in request.cookies:
             spider.log("Calculate __jsl_clearance value wrong, Ignore this Request", level=WARNING)
             raise IgnoreRequest()
         try:
             key, value = get_anti_spider_clearance(text.strip()).split("=", 1)
             clearance = {key: value}
         except Exception:
             spider.log("Failed to calculate __jsl_clearance, ignoring this request", level=WARNING)
             raise IgnoreRequest()
         else:
             return request.replace(dont_filter=True, cookies=clearance)
     return response
Example #13
    def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" %
                                    request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse
Example #14
 def process_response(self, request, response, spider):
     if 'x-ignore-response' in request.url:
         raise IgnoreRequest()
     elif 'x-error-response' in request.url:
         _ = 1 / 0
     else:
         return response
Example #15
	def process_request(self, request, spider):
		digest = hash(request.url)
		if digest in self.visited:
			raise IgnoreRequest("Duplicated url %s"%(request.url))
		else:
			self.visited.append(digest)
			return None
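Note that hash() of a string is salted per process in Python 3, so the digests above are not stable across runs, and membership tests on a list are linear. A sketch of an equivalent middleware using a stable digest and a set (the class name is an assumption):

import hashlib

from scrapy.exceptions import IgnoreRequest


class StableDedupMiddleware:
    def __init__(self):
        self.visited = set()

    def process_request(self, request, spider):
        # sha1 of the URL is stable across runs; set membership is O(1).
        digest = hashlib.sha1(request.url.encode("utf-8")).hexdigest()
        if digest in self.visited:
            raise IgnoreRequest("Duplicated url %s" % request.url)
        self.visited.add(digest)
        return None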
Example #16
    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug(
                "Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                {
                    'reason': reason,
                    'redirected': redirected,
                    'request': request
                },
                extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request},
                         extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")
Example #17
    def parse_category(self, response: HtmlResponse) -> HtmlResponse:
        """
            List category and traverse product pages.
        """
        products_query = response.css(
            "section#bc-sf-filter-products > div.product-grid-item")
        if not products_query:
            raise IgnoreRequest('Product items not found')
        self.logger.info(
            f'parse product_categories len: {len(products_query)}')

        for pdp in products_query.css('div.product-grid-item'):
            item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
            item_loader.add_css('product_name',
                                'div.product-text > p.title::text')
            item_loader.add_css('product_brand',
                                'div.product-text > h2.vendor.h5::text')
            # get regular product price through OR (,).
            item_loader.add_css(
                'product_price',
                'div.product-text p.price s::text , span[itemprop="price"]::text'
            )
            item_loader.add_css(
                'product_sale_price',
                'div.product-text p.sale span[itemprop="price"]::text')
            if 'href' in pdp.css('a').attrib:
                product_url = pdp.css('a').attrib['href']
                yield response.follow(product_url,
                                      callback=self.product_page,
                                      meta={'item': item_loader.load_item()})
Example #18
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        headers = {
            'Host': 'twitter.com',
            # 'User-Agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        try:
            content = requests.get(url=request.url, headers=headers).text
        except Exception as e:
            raise IgnoreRequest(e)
        response = HtmlResponse(url=request.url,
                                body=content,
                                request=request,
                                encoding='utf-8')
        return response
Example #19
 def test_process_spider_exception(self):
     assert self.instance.counters == {'all': 0, 'error': 0}
     self.instance.save_response = mock.Mock()
     # all conditions are true
     self.instance.on_error_enabled = True
     self.instance.process_spider_exception('err-response', Exception(),
                                            self.spider)
     assert self.instance.counters == {'all': 0, 'error': 1}
     # on_error flag is disabled, skipping
     self.instance.on_error_enabled = False
     self.instance.process_spider_exception('err-response', Exception(),
                                            self.spider)
     assert self.instance.counters == {'all': 0, 'error': 1}
     # exceeded error limit
     self.instance.on_error_enabled = True
     self.instance.counters['error'] = 11
     self.instance.process_spider_exception('err-response', Exception(),
                                            self.spider)
     assert self.instance.counters == {'all': 0, 'error': 11}
     # skip IgnoreRequest
     self.instance.limits['error'] = 12
     self.instance.process_spider_exception('err-response', IgnoreRequest(),
                                            self.spider)
     assert self.instance.counters == {'all': 0, 'error': 11}
     # all conditions are true again
     self.instance.limits['all'] = 12
     self.instance.process_spider_exception('err-response', Exception(),
                                            self.spider)
     assert self.instance.counters == {'all': 0, 'error': 12}
Example #20
    def _redirect(self, redirected, request, spider, reason):

        reason = response_status_message(reason)
        redirects = request.meta.get('redirect_times', 0) + 1

        if redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                                               [request.url]
            redirected.meta['priority'] = redirected.meta[
                'priority'] + self.priority_adjust
            self.logger.debug("Redirecting %s to %s from %s for %s times " %
                              (reason, redirected.url, request.url,
                               redirected.meta.get("redirect_times")))
            return redirected
        else:
            self.logger.info("Discarding %s: max redirections reached" %
                             request.url)
            # Record the failing url in the error message, not the original url
            # request.meta["url"] = request.url

            if request.meta.get("callback") == "parse":
                # For a failed category page, add 1 to the total count
                self.crawler.stats.inc_total_pages(
                    crawlid=request.meta['crawlid'])
                self.logger.error(
                    " in redicrect request error to failed pages url:%s, exception:%s, meta:%s"
                    % (request.url, reason, request.meta))

            raise IgnoreRequest("max redirections reached:%s" % reason)
Example #21
 def process_request_2(self, rp, request, spider):
     if rp is not None and not rp.can_fetch(to_native_str(self._useragent),
                                            request.url):
         logger.debug("Forbidden by robots.txt: %(request)s",
                      {'request': request},
                      extra={'spider': spider})
         raise IgnoreRequest()
Example #22
    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault("redirect_ttl", self.max_redirect_times)
        redirects = request.meta.get("redirect_times", 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta["redirect_times"] = redirects
            redirected.meta["redirect_ttl"] = ttl - 1
            redirected.meta["redirect_urls"] = request.meta.get(
                "redirect_urls", []) + [request.url]
            redirected.meta["redirect_reasons"] = request.meta.get(
                "redirect_reasons", []) + [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug(
                "Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                {
                    "reason": reason,
                    "redirected": redirected,
                    "request": request
                },
                extra={"spider": spider},
            )
            return redirected
        else:
            logger.debug(
                "Discarding %(request)s: max redirections reached",
                {"request": request},
                extra={"spider": spider},
            )
            raise IgnoreRequest("max redirections reached")
Example #23
 def process_request(self, request, spider):
     if spider.use_selenium():
         driver = SeleniumDriver(
             request.meta.get('proxy', None),
             request.meta['site_settings'].headless
         ).driver
         cookies = request.meta['site_settings'].cookies
         if len(cookies) >= 1:
             # Selenium can only add cookies to
             # the domain that it is already on
             driver.get(request.url)
             driver.delete_all_cookies()
             for cookie in cookies:
                 driver.add_cookie(cookie)
         try:
             driver.get(request.url)
         except TimeoutException:
             raise IgnoreRequest()
         self._wait_for_page(driver, spider, request)
         return HtmlResponse(
             driver.current_url,
             body=driver.page_source,
             encoding='UTF-8',
             request=request
         )
Example #24
 def process_response(self, request, response, spider):
     # If we hit an ignorable error (eg. "payment required") then ignore the request
     if response.status in self.ignore_http_codes:
         raise IgnoreRequest(
             "Skipping page which returned a status code that we ignore.")
     # If we hit a 'service unavailable' error then increase the delay
     if response.status in self.delay_http_codes:
         self.delay_interval += self.delay_increment
         spider.logger.info(
             "Too many requests - server returned an error (code {}). "
             "Adding {:.2f}s delay to future requests. "
             "Current delay interval is {:.2f}s".format(
                 response.status, self.delay_increment,
                 self.delay_interval))
         self.num_responses = 0
     # If we manage to hit 'num_responses_threshold' responses in a row
     # without problems then reduce the delay
     else:
         self.num_responses += 1
         if self.delay_interval and self.num_responses >= self.num_responses_threshold:
             self.delay_interval = max(
                 self.delay_interval - self.delay_increment, 0)
             spider.logger.info(
                 "Made {} requests without a server error. "
                 "Reducing delay for future requests by {:.2f}s. "
                 "Current delay interval is {:.2f}s".format(
                     self.num_responses, self.delay_increment,
                     self.delay_interval))
             self.num_responses = 0
     # Wait if the delay is non-zero
     if self.delay_interval:
         time.sleep(self.delay_interval)
     return super().process_response(request, response, spider)
Example #25
    def process_request(self, request, spider):
        # print(request.url)

        if len(self.crawled_urls) % 100 == 0:
            print("Crawled sets: ", len(self.crawled_urls))

        if request.url in self.crawled_urls:
            print("Duplicate request", request.url)
            raise IgnoreRequest()

        elif any(x in request.url for x in self.deny_url_contains):
            raise IgnoreRequest()

        else:
            self.crawled_urls.add(request.url)
            return None
Example #26
 def process_request(self, request, spider):
     if request.meta.get('captcha_request', False):
         return
     if self.paused:
         self.queue.append((request, spider))
         raise IgnoreRequest('Crawling paused, because CAPTCHA is '
                             'being solved')
Example #27
    def process_response(self, request, response, spider):
        # Handle the downloaded response
        # Retry all 3xx responses except 304

        http_code = response.status
        if http_code // 100 == 2:
            global count
            count = 0
            return response

        if http_code // 100 == 3 and http_code != 304:
            # Get the redirected url
            # url = response.headers['location']
            # domain = urlparse.urlparse(url).netloc
            # Check whether the redirected url's domain is in allowed_domains
            # if domain in spider.allowed_domains:
            #     return Request(url=url, meta=request.meta)
            # else:
            if count == 1:
                sendMessage_warning()
            # print count
            count += 1
            # Send the request back to the downloader
            return request.replace(dont_filter=True)
        if http_code // 100 == 4:
            # Note that 403 is not a response error; it means access is forbidden
            raise IgnoreRequest(u'404')

        if http_code // 100 == 5:
            return request.replace(dont_filter=True)
Example #28
 def process_request(self, request, spider):
     url = request.url
     key = hashlib.md5(url.encode('utf-8')).hexdigest()
     info = self.mongo.find_one({'key': key})
     if info:
         logger.warning("ignore repeat url: %s" % request.url)
         raise IgnoreRequest("ignore repeat url: %s" % request.url)
Example #29
    def process_request(self, request, spider: CollectAnnotableUrlsSpider):
        parsed_url = urlparse(request.url)
        domain = parsed_url.netloc
        domain_without_www = domain.replace('www.', '')

        if domain_without_www in self.limit_per_domain:
            max_visits_domain = self.limit_per_domain[
                domain_without_www] * MAX_VISITED_REQUIRED_MATCHED_RATIO
            limit_by_visits_reached = self.counter_visited.get(
                domain_without_www, 0) >= max_visits_domain
            limit_by_matches_reached = self.counter_matched.get(
                domain_without_www,
                0) >= self.limit_per_domain[domain_without_www]

            if limit_by_visits_reached or limit_by_matches_reached:
                spider.logger.info(
                    f'process_request() filtering request for domain {domain_without_www}: '
                    f'visited: {self.counter_visited[domain_without_www]}, '
                    f'matched: {self.counter_matched[domain_without_www]}')
                raise IgnoreRequest()
        spider.logger.info(
            f'process_request() accepting request for domain {domain_without_www}: '
            f'visited: {self.counter_visited[domain_without_www]}, '
            f'matched: {self.counter_matched[domain_without_www]}')

        return None
Example #30
 def process_request(self, request, spider):
     if not request.url:
         return None
     channel_id = request.meta.get('channel_id', 0)
     if is_dup_detail(request.url, spider.name, channel_id):
         raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" %
                             (spider.name, request.url))
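In all of the examples above, an IgnoreRequest that no middleware handles is passed to the request's errback when one is set, and is otherwise dropped silently. A minimal sketch of a spider that logs such drops (the spider name and URL are placeholders):

import scrapy
from scrapy.exceptions import IgnoreRequest


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request("https://example.com/",
                             callback=self.parse,
                             errback=self.on_error)

    def parse(self, response):
        self.logger.info("Got %s", response.url)

    def on_error(self, failure):
        # failure wraps whatever exception was raised while handling the request.
        if failure.check(IgnoreRequest):
            self.logger.info("Request was dropped by a downloader middleware")
        else:
            self.logger.error(repr(failure))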