Example #1
    def __init__(
        self,
        url: str,
        callback: Optional[Callable] = None,
        method: str = "GET",
        headers: Optional[dict] = None,
        body: Optional[Union[bytes, str]] = None,
        cookies: Optional[Union[dict, List[dict]]] = None,
        meta: Optional[dict] = None,
        encoding: str = "utf-8",
        priority: int = 0,
        dont_filter: bool = False,
        errback: Optional[Callable] = None,
        flags: Optional[List[str]] = None,
        cb_kwargs: Optional[dict] = None,
    ) -> None:
        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        if not isinstance(priority, int):
            raise TypeError(f"Request priority not an integer: {priority!r}")
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError(
                f'callback must be a callable, got {type(callback).__name__}')
        if errback is not None and not callable(errback):
            raise TypeError(
                f'errback must be a callable, got {type(errback).__name__}')
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)
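
For reference, a minimal usage sketch (assuming a standard Scrapy install) of what this constructor does with the `headers` argument: the plain dict is wrapped in the case-insensitive Headers class, with values stored as bytes.

    from scrapy import Request

    req = Request("https://example.com", headers={"User-Agent": "demo"})
    print(type(req.headers))               # scrapy.http.headers.Headers
    print(req.headers.get("user-agent"))   # b'demo' -- lookup is case-insensitive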
Example #2
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None,
                 flags=None,
                 is_need_proxy=False):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        self.is_need_proxy = is_need_proxy
        assert isinstance(
            priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' %
                            type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' %
                            type(errback).__name__)
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)
Example #3
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None,
                 flags=None,
                 cb_kwargs=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()  # request method
        self._set_url(url)  # set the URL
        self._set_body(body)  # set the body
        assert isinstance(
            priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority  # priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' %
                            type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' %
                            type(errback).__name__)
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback  # callback function, used very often
        self.errback = errback  # error callback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)  # build the headers object
        self.dont_filter = dont_filter  # flag controlling duplicate filtering

        self._meta = dict(meta) if meta else None  # extra metadata
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)
Example #4
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), \
            "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)
Example #5
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None,
                 flags=None,
                 cb_kwargs=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        if not isinstance(priority, int):
            raise TypeError(f"Request priority not an integer: {priority!r}")
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError(
                f'callback must be a callable, got {type(callback).__name__}')
        if errback is not None and not callable(errback):
            raise TypeError(
                f'errback must be a callable, got {type(errback).__name__}')
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)
Example #6
    async def _download_request_with_page(self, request: Request,
                                          spider: Spider,
                                          page: Page) -> Response:
        start_time = time()
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("playwright_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if isinstance(pc, PageCoroutine):
                method = getattr(page, pc.method)
                pc.result = await method(*pc.args, **pc.kwargs)
                await page.wait_for_load_state(
                    timeout=self.default_navigation_timeout)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        if request.meta.get("playwright_include_page"):
            request.meta["playwright_page"] = page
        else:
            await page.close()
            self.stats.inc_value("playwright/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["playwright"],
        )
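
A hedged sketch of a request that exercises the meta keys read by the handler above; the `PageCoroutine` import path follows older scrapy-playwright releases (newer ones rename it `PageMethod`), and the `"playwright"` flag is an assumption not shown in the snippet.

    import scrapy
    from scrapy_playwright.page import PageCoroutine  # older scrapy-playwright name; newer releases use PageMethod

    class PlaywrightExampleSpider(scrapy.Spider):
        name = "playwright_example"

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com",
                meta={
                    "playwright": True,  # assumption: routes the request through the Playwright handler
                    "playwright_page_coroutines": [
                        # each entry is awaited on the Page by the handler above
                        PageCoroutine("wait_for_selector", "div.content"),
                    ],
                    "playwright_include_page": False,  # let the handler close the page
                },
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info("rendered %d bytes", len(response.body))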
Example #7
    def start_requests(self):
        urls = ["https://www.thestar.com.my/tag/covid-19+watch"]

        # Register the proxy handler before navigating so it applies to the
        # page load, then return the rendered HTML.
        script = """
            function main(splash, args)
                splash:on_request(function(request)
                    request:set_proxy{
                        host = "120.50.56.137",
                        port = 40553,
                        username = "",
                        password = "",
                        type = "socks4"
                    }
                end)
                assert(splash:wait(2.5))
                assert(splash:go(args.url))
                assert(splash:wait(5.5))
                return splash:html()
            end
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
        }

        splash_args = {
            'wait': 5,
            'lua_source': script,
            #             'proxy': "socks4://120.50.56.137:40553"
        }

        for url in urls:
            yield SplashRequest(url=url,
                                callback=self.parse_links,
                                endpoint='execute',
                                args=splash_args,
                                headers=Headers(headers))
Example #8
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(
            priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        # errback can only be specified when a callback is given
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        # use the given cookies, or an empty dict if none were provided

        self.headers = Headers(headers or {}, encoding=encoding)
        # use the given headers, or an empty dict otherwise

        self.dont_filter = dont_filter
        # flag controlling whether the request goes through the duplicate filter

        self._meta = dict(meta) if meta else None
Example #9
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # listing-page requests need a Referer header
        if 'www.lagou.com/jobs/positionAjax.json' in request.url:
            custom_headers = spider.settings.attributes[
                'INDEX_REQUEST_HEADERS'].value
        else:
            custom_headers = spider.settings.attributes[
                'DETAIL_REQUEST_HEADERS'].value

        request_headers = Headers()
        for key in custom_headers:
            request_headers[key] = custom_headers[key]
        request.headers = request_headers
        return None
Example #10
 def process_request(self, request, spider):
     request.headers = Headers(HEADERS)
     request.cookies = parse_cookie(COOKIE)
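
For a downloader middleware like this to run, it has to be enabled in the project settings; a hedged sketch, where the module path, class name, and priority are placeholders:

    # settings.py -- assumption: the middleware above lives in myproject/middlewares.py
    DOWNLOADER_MIDDLEWARES = {
        "myproject.middlewares.CustomHeadersMiddleware": 543,
    }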
Example #11
class BerrybenkaSpider(scrapy.Spider):
    name = "berrybenka"
    allowed_domains = ["berrybenka.com"]

    MAIN_URL = 'http://berrybenka.com/'

    start_urls = ['http://berrybenka.com/clothing/tops/women/']

    headers = Headers({'Content-Type': 'application/json'})
    body = {"wait": 0.5}

    page_index = 0

    items_per_page = 48

    def start_requests(self):
        for url in self.start_urls:
            self.body['url'] = url
            yield scrapy.Request(RENDER_HTML_URL,
                                 callback=self.parse,
                                 method="POST",
                                 body=json.dumps(self.body, sort_keys=True),
                                 headers=self.headers)

    def parse(self, response):
        """
        Vertical crawl
        """

        detail_links = response.xpath(
            '//a[@class="catalog-img"]/@href').extract()

        for link in detail_links:
            yield scrapy.Request(url=link, callback=self.parse_item)
        """
        Horizontal crawl
        Mulai dari /0
        Cek apakh ada link 'Next'
        Kalo tidak ada maka berhenti
        kalo ada maka tambahkan 48 jadi /sebelumnya+48 untuk url baru yg di scrape
        """
        next_page = response.xpath('//li[@class="next right"]')
        if not next_page:
            return

        self.page_index += 1

        for url in self.start_urls:
            self.body['url'] = url + str(self.page_index * self.items_per_page)

            print(self.body['url'])

            yield scrapy.Request(RENDER_HTML_URL,
                                 callback=self.parse,
                                 method="POST",
                                 body=json.dumps(self.body, sort_keys=True),
                                 headers=self.headers,
                                 dont_filter=True)

    def parse_item(self, response):
        item = BerrybenkaItem()
        item['name'] = response.xpath(
            '//div[@class="prod-spec-title"]/h1/text()').extract()
        item['brand'] = response.xpath(
            '//div[@class="prod-spec-title"]/h2/a/text()').extract()
        item['description'] = response.xpath(
            '//p[@id="product_description"]/text()').extract()
        item['price'] = response.xpath(
            '//div[@class="prod-spec-title"]/p/text()').extract()
        item['url'] = response.url

        images = [
            response.xpath(
                '//div[@class="detail-photo left"]/div[@class="big-photo left"]/a/img/@src'
            ).extract()
        ]

        item['image_urls'] = images + response.xpath(
            '//div[@class="detail-photo left"]/div[@class="small-photo left"]/ul/li/a/img/@src'
        ).extract()

        return item
Example #12
 def start_requests(self):
     for url in self.start_urls:
         body = json.dumps({"url": url, "wait": 0.5, "js_enabled": False })
         headers = Headers({'Content-Type': 'application/json'})
         yield scrapy.Request(RENDER_HTML_URL, self.parse, method="POST",
                              body=body, headers=headers)
Example #13
 def __init__(self, user_agent):
     self.user_agent = user_agent
     self.headers = Headers()
Example #14
    def process_request(self, request, spider):
        splash_options = request.meta.get('splash')
        if not splash_options:
            return

        if request.meta.get("_splash_processed"):
            # don't process the same request more than once
            return

        if request.method != 'GET':
            logger.warn(
                "Currently only GET requests are supported by SplashMiddleware;"
                " %(request)s will be handled without Splash",
                {'request': request},
                extra={'spider': spider})
            return request

        meta = request.meta
        meta['_splash_processed'] = splash_options

        slot_policy = splash_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, meta, slot_policy)

        args = splash_options.setdefault('args', {})
        args.setdefault('url', request.url)
        body = json.dumps(args, ensure_ascii=False)

        if 'timeout' in args:
            # User requested a Splash timeout explicitly.
            #
            # We can't catch a case when user requested `download_timeout`
            # explicitly because a default value for `download_timeout`
            # is set by DownloadTimeoutMiddleware.
            #
            # As user requested Splash timeout explicitly, we shouldn't change
            # it. Another reason not to change the requested Splash timeout is
            # because it may cause a validation error on the remote end.
            #
            # But we can change Scrapy `download_timeout`: increase
            # it when it's too small. Decreasing `download_timeout` is not
            # safe.

            # no timeout means infinite timeout
            timeout_current = meta.get('download_timeout', 1e6)
            timeout_expected = float(
                args['timeout']) + self.splash_extra_timeout

            if timeout_expected > timeout_current:
                meta['download_timeout'] = timeout_expected

        endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
        splash_base_url = splash_options.get('splash_url',
                                             self.splash_base_url)
        splash_url = urljoin(splash_base_url, endpoint)

        req_rep = request.replace(
            url=splash_url,
            method='POST',
            body=body,

            # FIXME: original HTTP headers (including cookies)
            # are not respected.
            headers=Headers({'Content-Type': 'application/json'}),
        )

        self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
        return req_rep
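
A hedged sketch of the kind of request this middleware rewrites: a GET request carrying a `splash` dict in `meta` gets re-targeted at the Splash HTTP API as a JSON POST (mirroring the scrapy-splash convention the snippet follows).

    import scrapy

    class SplashExampleSpider(scrapy.Spider):
        name = "splash_example"

        def start_requests(self):
            # The middleware above serializes meta['splash']['args'] to JSON and
            # POSTs it to <splash_url>/<endpoint>; 'url' defaults to request.url.
            yield scrapy.Request(
                "https://example.com",
                meta={"splash": {"endpoint": "render.html", "args": {"wait": 0.5}}},
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info("got %d bytes from Splash", len(response.body))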
Example #15
    def parse(self, response):
        # first hit the search home page
        if response.url == 'https://s.taobao.com':
            # "鼠标 无线"的搜索链接
            url = "https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF"

            body = json.dumps({'url': url, 'wait': 0.5})
            headers = Headers(
                {'Content-Type': 'application/json; charset=utf-8'})

            yield Request(settings['SPLASH_RENDER_URL'],
                          self.parse,
                          method='POST',
                          body=body,
                          headers=headers)

        else:
            sel = Selector(response)

            # doesn't work
            all = sel.xpath('//div[@class="item  "]/div[2]')

            #file = codecs.open('page_'+str(self.iii)+'.htm', 'wb', encoding='utf-8')
            #file.write(response.body.decode('unicode_escape'))
            #self.iii += 1

            #print all

            for one in all:
                item = TaobabkItem()

                goods_price = one.xpath(
                    'div[1]/div[1]/strong/text()').extract()

                #print goods_price
                goods_sale_num = one.xpath(
                    'div[1]/div[@class="deal-cnt"]/text()').extract()

                #print goods_sale_num
                # extract the digits
                if len(goods_sale_num) > 0:
                    goods_sale_num = "".join(
                        [s for s in goods_sale_num[0] if s.isdigit()])

                goods_name = one.xpath('div[2]/a/text()').extract()

                shop_name = one.xpath(
                    'div[3]/div[@class="shop"]/a/span[2]/text()').extract()
                shop_address = one.xpath(
                    'div[3]/div[@class="location"]/text()').extract()

                item['goods_price'] = goods_price
                item['goods_sale_num'] = goods_sale_num
                item['goods_name'] = [gn.encode('utf-8') for gn in goods_name]
                item['shop_name'] = [sn.encode('utf-8') for sn in shop_name]
                item['shop_address'] = [
                    sa.encode('utf-8') for sa in shop_address
                ]

                yield item

            next_page_urls = [
                'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=2&ntoffset=2&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=44',
                'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-1&ntoffset=-1&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=88',
                'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-4&ntoffset=-4&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=132',
                'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-7&ntoffset=-7&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=176'
            ]

            for next_page_url in next_page_urls:
                body = json.dumps({'url': next_page_url, 'wait': 0.5})
                headers = Headers(
                    {'Content-Type': 'application/json; charset=utf-8'})

                yield Request(settings['SPLASH_RENDER_URL'],
                              self.parse,
                              method='POST',
                              body=body,
                              headers=headers)
Example #16
    def parse_view(self, response):
        if response.status == 200:
            try:
                print('Responded with 200 view')
                unitAtts = {}
                unitAtts['url'] = response.url

                # Details
                detailsTable = response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/div/div[3]/div[1]/dl')
                detailNames = detailsTable.css('dt::text').extract()
                detailValues = detailsTable.css('dd::text').extract()

                for index in range(len(detailNames)):
                    unitAtts[detailNames[index].strip(
                    )] = detailValues[index].strip()

                # Rent & Fees
                feesTable = response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/div/div[3]/div[3]/div[2]/dl'
                )
                feesNames = feesTable.css('dt::text').extract()
                feesValues = feesTable.css('dd::text').extract()

                for index in range(len(feesNames)):
                    unitAtts[feesNames[index].encode(
                        'ascii', 'ignore').strip().encode(
                            'utf-8')] = feesValues[index].encode(
                                'ascii',
                                'ignore').strip().lstrip(u'\xa5').replace(
                                    ',', '').encode('utf-8')

                # Directions
                gmap = response.css('div.rej-map-container')
                lat = gmap.css("::attr('data-lat')").extract_first()
                lng = gmap.css("::attr('data-lng')").extract_first()
                address = (gmap.css("::attr('data-address')").extract_first()
                           or '').encode('utf-8')

                directions1 = self.directions1.encode('utf-8')
                directions2 = self.directions2.encode('utf-8')
                directions = u''

                print("data lat={}, long={}".format(lat, lng))
                if lat == '' or lng == '' or float(lat) == 0.0 or float(
                        lng) == 0.0:
                    directions = directions1 + address + directions2
                else:
                    directions = directions1 + lat + ',' + lng + directions2

                unitAtts['directions'] = directions
                callbackFn = self.parse_map(unitAtts)

                RENDER_HTML_URL = 'http://localhost:8050/render.html'
                body = json.dumps({
                    'url': directions,
                    'wait': 2
                },
                                  sort_keys=True)
                headers = Headers({
                    'Content-Type':
                    'application/json',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language':
                    'en',
                    'Referer':
                    response.url,
                    'User-Agent':
                    'Scrapy/1.5.0 (+https://scrapy.org)'
                })

                print("Calling maps. view={}, map={}".format(
                    unitAtts['url'], directions))
                yield scrapy.Request(RENDER_HTML_URL,
                                     callback=callbackFn,
                                     method="POST",
                                     body=body,
                                     headers=headers,
                                     dont_filter=True)
            except:
                print("Unexpected error view={}".format(sys.exc_info()))
                raise
        else:
            print('Not ok request view:{}'.format(response.status))
Example #17
    def parse(self, response):
        all_cookies = response.headers.getlist('Set-Cookie')
        global total_results
        global total_pages
        # total_results=''
        # total_pages=''
        if len(all_cookies) > 4:
            guardian_endpoint = response.headers.getlist(
                'Set-Cookie')[0].split(";")[0].split("=")[1]
            asp_net_sessionid = response.headers.getlist(
                'Set-Cookie')[1].split(";")[0].split("=")[1]
            loggedid = response.headers.getlist('Set-Cookie')[4].split(
                ";")[0].split("=")[1]
            cfsApplyFilters = response.headers.getlist('Set-Cookie')[1].split(
                ";")[0].split("=")[1]
            initials = response.headers.getlist('Set-Cookie')[2].split(
                ";")[0].split("=")[1]
            firstName = response.headers.getlist('Set-Cookie')[3].split(
                ";")[0].split("=")[1]
            global cookies
            if len(all_cookies) > 5:
                serverid = response.headers.getlist('Set-Cookie')[5].split(
                    ";")[0].split("=")[1]
                cookies = {
                    "ASP.NET_SessionId": asp_net_sessionid,
                    "cfsApplyFilters": cfsApplyFilters,
                    "Initials": initials,
                    "FirstName": firstName,
                    "LoggedIn": loggedid,
                    "SERVERID": serverid,
                    "GuardianEndpoint": guardian_endpoint
                }
            else:
                cookies = {
                    "ASP.NET_SessionId": asp_net_sessionid,
                    "cfsApplyFilters": cfsApplyFilters,
                    "Initials": initials,
                    "FirstName": firstName,
                    "LoggedIn": loggedid,
                    "GuardianEndpoint": guardian_endpoint
                }

            if response.css('ul.pagination-cfs li div  span ::text'):
                total_pages = response.css(
                    'ul.pagination-cfs li div  span ::text').extract_first()
                total_pages = total_pages.replace("of ", "")
                total_pages = total_pages.replace(",", "")
                total_pages = int(total_pages) + 1
            else:
                total_pages = 0

            if response.css('div h2#searchResultCount ::text'):
                total_results = response.css(
                    'div h2#searchResultCount ::text').extract_first()
                total_results = total_results.replace("Results", "")
                total_results = total_results.replace(",", "")
                total_results = total_results.strip()
            else:
                total_results = ''

        if self.color and self.get_color == 1:
            for res in response.css('li.vehicle-list'):
                # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
                # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
                href = res.css('::attr(data-detailsurl)').extract_first()
                if href:
                    yield scrapy.Request(response.urljoin(href),
                                         callback=self.product_details)
        if self.color:
            self.get_color = 1
            url = "https://www.carsforsale.com/search/filtercolor"
            if int(self.page_number) > 1:
                body = json.dumps({
                    "IsChecked": True,
                    "Value": self.color,
                    "PageNumber": str(self.page_number)
                })
            else:
                body = json.dumps({"IsChecked": True, "Value": self.color})
            headers = Headers({'Content-Type': 'application/json'})
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 method='POST',
                                 body=body,
                                 headers=headers,
                                 cookies=cookies)
        if int(self.page_number) > 1 and self.pages_number == 1:
            for res in response.css('li.vehicle-list'):
                # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
                # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
                href = res.css('::attr(data-detailsurl)').extract_first()
                if href:
                    yield scrapy.Request(response.urljoin(href),
                                         callback=self.product_details)
        if int(self.page_number) > 1 and self.get_color == 0:
            self.pages_number = 1
            url = "https://www.carsforsale.com/search/gotopage"
            body = json.dumps({"PageNumber": str(self.page_number)})
            headers = Headers({'Content-Type': 'application/json'})
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 method='POST',
                                 body=body,
                                 headers=headers,
                                 cookies=cookies)
        if self.get_color == 0 and self.pages_number == 0:
            for res in response.css('li.vehicle-list'):
                # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
                # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
                href = res.css('::attr(data-detailsurl)').extract_first()
                if href:
                    yield scrapy.Request(response.urljoin(href),
                                         callback=self.product_details)
Example #18
 def headers(self):
     global authorization_token
     return Headers({
         'Authorization': 'Bearer {}'.format(authorization_token),
         'x-guest-token': guest_token
         }, encoding=self.encoding)
Example #19
 def set_headers(self, headers):
     self.headers = Headers(headers or {}, encoding=self._encoding)
Example #20
    def process_request(self, request, spider):
        if 'prerender' not in request.meta:
            return

        if request.method not in {'GET', 'POST'}:
            logger.warning(
                "Currently only GET and POST requests are supported by "
                "PrerenderMiddleware; %(request)s will be handled without Prerender",
                {'request': request},
                extra={'spider': spider})
            return request

        if request.meta.get("_prerender_processed"):
            # don't process the same request more than once
            return

        prerender_options = request.meta['prerender']
        request.meta['_prerender_processed'] = True

        slot_policy = prerender_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, request.meta, slot_policy)

        args = prerender_options.setdefault('args', {})

        if '_replaced_args' in prerender_options:
            # restore arguments before sending request to the downloader
            load_args = {}
            save_args = []
            local_arg_fingerprints = {}
            for name in prerender_options['_replaced_args']:
                fp = args[name]
                # Use remote Prerender argument cache: if Prerender key
                # for a value is known then don't send the value to Prerender;
                # if it is unknown then try to save the value on server using
                # ``save_args``.
                if fp in self._remote_keys:
                    load_args[name] = self._remote_keys[fp]
                    del args[name]
                else:
                    save_args.append(name)
                    args[name] = self._argument_values[fp]

                local_arg_fingerprints[name] = fp

            if load_args:
                args['load_args'] = load_args
            if save_args:
                args['save_args'] = save_args
            prerender_options[
                '_local_arg_fingerprints'] = local_arg_fingerprints

            del prerender_options['_replaced_args']  # ??

        args.setdefault('url', request.url)
        if request.method == 'POST':
            args.setdefault('http_method', request.method)
            # XXX: non-UTF8 request bodies are not supported now
            args.setdefault('body', request.body.decode('utf8'))

        if not prerender_options.get('dont_send_headers'):
            headers = scrapy_headers_to_unicode_dict(request.headers)
            if headers:
                args.setdefault('headers', headers)

        body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)
        # print(body)

        if 'timeout' in args:
            # User requested a Prerender timeout explicitly.
            #
            # We can't catch a case when user requested `download_timeout`
            # explicitly because a default value for `download_timeout`
            # is set by DownloadTimeoutMiddleware.
            #
            # As user requested Prerender timeout explicitly, we shouldn't change
            # it. Another reason not to change the requested Prerender timeout is
            # because it may cause a validation error on the remote end.
            #
            # But we can change Scrapy `download_timeout`: increase
            # it when it's too small. Decreasing `download_timeout` is not
            # safe.

            timeout_requested = float(args['timeout'])
            timeout_expected = timeout_requested + self.prerender_extra_timeout

            # no timeout means infinite timeout
            timeout_current = request.meta.get('download_timeout', 1e6)

            if timeout_expected > timeout_current:
                request.meta['download_timeout'] = timeout_expected

        endpoint = prerender_options.setdefault('endpoint',
                                                self.default_endpoint)
        prerender_base_url = prerender_options.get('prerender_url',
                                                   self.prerender_base_url)
        prerender_url = urljoin(prerender_base_url, endpoint)

        headers = Headers({'Content-Type': 'application/json'})
        headers.update(prerender_options.get('prerender_headers', {}))
        new_request = request.replace(url=prerender_url,
                                      method='POST',
                                      body=body,
                                      headers=headers,
                                      priority=request.priority +
                                      self.rescheduling_priority_adjust)
        self.crawler.stats.inc_value('prerender/%s/request_count' % endpoint)
        return new_request
Example #21
    def process_request(self, request, spider):
        splash_options = request.meta.get('splash')
        if not splash_options:
            return

        if request.meta.get("_splash_processed"):
            # don't process the same request more than once
            return

        if request.method != 'GET':
            logger.warn(
                "Currently only GET requests are supported by SplashMiddleware;"
                " %(request)s will be handled without Splash",
                {'request': request},
                extra={'spider': spider}
            )
            return request

        meta = request.meta
        meta['_splash_processed'] = splash_options

        slot_policy = splash_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, meta, slot_policy)

        args = splash_options.setdefault('args', {})
        args.setdefault('url', request.url)

        proxy = meta.get('proxy')
        crawlera_proxy = proxy and _crawlera_proxy_re.match(proxy)
        if proxy:
            del meta['proxy']
            if crawlera_proxy:
                self._check_crawlera_settings(splash_options)

                # prevent the crawlera middleware from processing the splash request
                meta['dont_proxy'] = True

                crawlera_settings = args.setdefault('crawlera', {})
                crawlera_headers = crawlera_settings.setdefault('headers', Headers())
                for name in request.headers.keys():
                    if name.startswith('Proxy-') or name.startswith('X-Crawlera-'):
                        # Use header for every request instead of just the first one.
                        crawlera_headers[name] = request.headers.pop(name)

                crawlera_settings['host'] = crawlera_proxy.group(1)
                crawlera_settings['port'] = int(crawlera_proxy.group(2))
                args['lua_source'] = self._get_crawlera_script()
            else:
                # Pass proxy as a parameter to splash. Note that passing a
                # proxy url here is only available on splash >= 1.8
                if "://" not in proxy:
                    # Support for host:port without protocol
                    proxy = "http://" + proxy

                args['proxy'] = proxy


        body = json.dumps(args, ensure_ascii=False)

        if 'timeout' in args:
            # User requested a Splash timeout explicitly.
            #
            # We can't catch a case when user requested `download_timeout`
            # explicitly because a default value for `download_timeout`
            # is set by DownloadTimeoutMiddleware.
            #
            # As user requested Splash timeout explicitly, we shouldn't change
            # it. Another reason not to change the requested Splash timeout is
            # because it may cause a validation error on the remote end.
            #
            # But we can change Scrapy `download_timeout`: increase
            # it when it's too small. Decreasing `download_timeout` is not
            # safe.

            # no timeout means infinite timeout
            timeout_current = meta.get('download_timeout', 1e6)
            timeout_expected = float(args['timeout']) + self.splash_extra_timeout

            if timeout_expected > timeout_current:
                meta['download_timeout'] = timeout_expected

        if crawlera_proxy:
            endpoint = "execute"
        else:
            endpoint = splash_options.setdefault('endpoint', self.default_endpoint)

        splash_base_url = splash_options.get('splash_url', self.splash_base_url)
        splash_url = urljoin(splash_base_url, endpoint)

        req_rep = request.replace(
            url=splash_url,
            method='POST',
            body=body,

            # FIXME: original HTTP headers (including cookies)
            # are not respected.
            headers=Headers({'Content-Type': 'application/json'}),
        )

        self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
        return req_rep
Example #22
 def process_request(self, request, spider):
     headers = MockHeaders()
     headers = headers.get_headers(host='www.xmrc.com.cn')
     request.headers = Headers(headers)
Example #23
    def __init__(
        self,
        stream_id: int,
        request: Request,
        protocol: "H2ClientProtocol",
        download_maxsize: int = 0,
        download_warnsize: int = 0,
    ) -> None:
        """
        Arguments:
            stream_id -- Unique identifier for the stream within a single HTTP/2 connection
            request -- The HTTP request associated to the stream
            protocol -- Parent H2ClientProtocol instance
        """
        self.stream_id: int = stream_id
        self._request: Request = request
        self._protocol: "H2ClientProtocol" = protocol

        self._download_maxsize = self._request.meta.get(
            'download_maxsize', download_maxsize)
        self._download_warnsize = self._request.meta.get(
            'download_warnsize', download_warnsize)

        # Metadata of an HTTP/2 connection stream
        # initialized when stream is instantiated
        self.metadata: Dict = {
            'request_content_length':
            0 if self._request.body is None else len(self._request.body),

            # Flag to keep track whether the stream has initiated the request
            'request_sent':
            False,

            # Flag to track whether we have logged about exceeding download warnsize
            'reached_warnsize':
            False,

            # Each time we send a data frame, this value is decreased by the amount sent.
            'remaining_content_length':
            0 if self._request.body is None else len(self._request.body),

            # Flag to keep track whether the client (self) has closed this stream
            'stream_closed_local':
            False,

            # Flag to keep track whether the server has closed the stream
            'stream_closed_server':
            False,
        }

        # Private variable used to build the response
        # this response is then converted to appropriate Response class
        # passed to the response deferred callback
        self._response: Dict = {
            # Data received frame by frame from the server is appended
            # and passed to the response Deferred when completely received.
            'body': BytesIO(),

            # The amount of data received that counts against the
            # flow control window
            'flow_controlled_size': 0,

            # Headers received after sending the request
            'headers': Headers({}),
        }

        def _cancel(_) -> None:
            # Close this stream as gracefully as possible
            # If the associated request is initiated we reset this stream
            # else we directly call close() method
            if self.metadata['request_sent']:
                self.reset_stream(StreamCloseReason.CANCELLED)
            else:
                self.close(StreamCloseReason.CANCELLED)

        self._deferred_response = Deferred(_cancel)
Example #24
from scrapy.http.headers import Headers
import json

# Headers is a case-insensitive mapping; keys and values are stored as bytes.
h = Headers({"X-Foo": "bar"})
print(h)           # internal storage: bytes keys mapped to lists of bytes values
print(h.values())  # values come back as bytes, not str
# print(json.dumps(h, indent=3))  # fails: bytes keys/values are not JSON-serializable
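
If a JSON-friendly view is needed, a hedged workaround (Headers.to_unicode_dict() is available in current Scrapy releases) is to convert the headers to str first:

# Convert the bytes-based Headers to a plain str dict before serializing.
print(json.dumps(dict(h.to_unicode_dict()), indent=3))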