def __init__(
    self,
    url: str,
    callback: Optional[Callable] = None,
    method: str = "GET",
    headers: Optional[dict] = None,
    body: Optional[Union[bytes, str]] = None,
    cookies: Optional[Union[dict, List[dict]]] = None,
    meta: Optional[dict] = None,
    encoding: str = "utf-8",
    priority: int = 0,
    dont_filter: bool = False,
    errback: Optional[Callable] = None,
    flags: Optional[List[str]] = None,
    cb_kwargs: Optional[dict] = None,
) -> None:
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    if not isinstance(priority, int):
        raise TypeError(f"Request priority not an integer: {priority!r}")
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
    if errback is not None and not callable(errback):
        raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
    self.flags = [] if flags is None else list(flags)
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None, is_need_proxy=False):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    self.is_need_proxy = is_need_proxy
    assert isinstance(priority, int), \
        "Request priority not an integer: %r" % priority
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError('callback must be a callable, got %s'
                        % type(callback).__name__)
    if errback is not None and not callable(errback):
        raise TypeError('errback must be a callable, got %s'
                        % type(errback).__name__)
    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self.flags = [] if flags is None else list(flags)
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None, cb_kwargs=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()  # request method
    self._set_url(url)  # set the URL
    self._set_body(body)  # set the body
    assert isinstance(priority, int), \
        "Request priority not an integer: %r" % priority
    self.priority = priority  # priority

    if callback is not None and not callable(callback):
        raise TypeError('callback must be a callable, got %s'
                        % type(callback).__name__)
    if errback is not None and not callable(errback):
        raise TypeError('errback must be a callable, got %s'
                        % type(errback).__name__)
    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback  # callback function, used very frequently
    self.errback = errback  # error callback function

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)  # build the headers
    self.dont_filter = dont_filter  # whether to skip duplicate filtering

    self._meta = dict(meta) if meta else None  # extra metadata
    self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
    self.flags = [] if flags is None else list(flags)
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    assert isinstance(priority, int), \
        "Request priority not an integer: %r" % priority
    self.priority = priority

    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self.flags = [] if flags is None else list(flags)
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None, cb_kwargs=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    if not isinstance(priority, int):
        raise TypeError(f"Request priority not an integer: {priority!r}")
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
    if errback is not None and not callable(errback):
        raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
    self.flags = [] if flags is None else list(flags)
async def _download_request_with_page(self, request: Request, spider: Spider,
                                      page: Page) -> Response:
    start_time = time()
    response = await page.goto(request.url)

    page_coroutines = request.meta.get("playwright_page_coroutines") or ()
    if isinstance(page_coroutines, dict):
        page_coroutines = page_coroutines.values()
    for pc in page_coroutines:
        if isinstance(pc, PageCoroutine):
            method = getattr(page, pc.method)
            pc.result = await method(*pc.args, **pc.kwargs)
            await page.wait_for_load_state(timeout=self.default_navigation_timeout)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - start_time

    if request.meta.get("playwright_include_page"):
        request.meta["playwright_page"] = page
    else:
        await page.close()
        self.stats.inc_value("playwright/page_count/closed")

    headers = Headers(response.headers)
    headers.pop("Content-Encoding", None)
    respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
    return respcls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["playwright"],
    )
def start_requests(self):
    urls = ["https://www.thestar.com.my/tag/covid-19+watch"]
    # Register the proxy hook before navigating so it applies to the page load.
    script = """
    function main(splash, args)
        splash:on_request(function(request)
            request:set_proxy{
                host = "120.50.56.137",
                port = 40553,
                username = "",
                password = "",
                type = "socks4"
            }
        end)
        assert(splash:wait(2.5))
        assert(splash:go(args.url))
        assert(splash:wait(5.5))
        return splash:html()
    end
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
    }
    splash_args = {
        'wait': 5,
        'lua_source': script,
        # 'proxy': "socks4://120.50.56.137:40553"
    }
    for url in urls:
        yield SplashRequest(url=url, callback=self.parse_links, endpoint='execute',
                            args=splash_args, headers=Headers(headers))
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    assert isinstance(priority, int), \
        "Request priority not an integer: %r" % priority
    self.priority = priority

    # errback can only be used when a callback is specified
    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    # cookies: use the given cookies if provided, otherwise an empty dict
    self.cookies = cookies or {}
    # use the given headers if provided, otherwise an empty dict
    self.headers = Headers(headers or {}, encoding=encoding)
    # flag that controls duplicate filtering
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called

    # Requests for the listing page need a Referer header
    if 'www.lagou.com/jobs/positionAjax.json' in request.url:
        custom_headers = spider.settings.attributes['INDEX_REQUEST_HEADERS'].value
    else:
        custom_headers = spider.settings.attributes['DETAIL_REQUEST_HEADERS'].value

    request_headers = Headers()
    for key in custom_headers:
        request_headers[key] = custom_headers[key]
    request.headers = request_headers
    return None
def process_request(self, request, spider):
    request.headers = Headers(HEADERS)
    request.cookies = parse_cookie(COOKIE)
class BerrybenkaSpider(scrapy.Spider):
    name = "berrybenka"
    allowed_domains = ["berrybenka.com"]
    MAIN_URL = 'http://berrybenka.com/'
    start_urls = ['http://berrybenka.com/clothing/tops/women/']
    headers = Headers({'Content-Type': 'application/json'})
    body = {"wait": 0.5}
    page_index = 0
    items_per_page = 48

    def start_requests(self):
        for url in self.start_urls:
            self.body['url'] = url
            yield scrapy.Request(RENDER_HTML_URL, callback=self.parse,
                                 method="POST",
                                 body=json.dumps(self.body, sort_keys=True),
                                 headers=self.headers)

    def parse(self, response):
        """Vertical crawl"""
        detail_links = response.xpath('//a[@class="catalog-img"]/@href').extract()
        for link in detail_links:
            yield scrapy.Request(url=link, callback=self.parse_item)

        """
        Horizontal crawl: start from /0 and check whether a 'Next' link exists.
        If there is none, stop; otherwise add 48 to the previous offset
        (/previous+48) to build the next URL to scrape.
        """
        next_page = response.xpath('//li[@class="next right"]')
        self.page_index += 1
        for url in self.start_urls:
            self.body['url'] = url + str(self.page_index * self.items_per_page)
            print(self.body['url'])
            yield scrapy.Request(RENDER_HTML_URL, callback=self.parse,
                                 method="POST",
                                 body=json.dumps(self.body, sort_keys=True),
                                 headers=self.headers, dont_filter=True)

    def parse_item(self, response):
        item = BerrybenkaItem()
        item['name'] = response.xpath(
            '//div[@class="prod-spec-title"]/h1/text()').extract()
        item['brand'] = response.xpath(
            '//div[@class="prod-spec-title"]/h2/a/text()').extract()
        item['description'] = response.xpath(
            '//p[@id="product_description"]/text()').extract()
        item['price'] = response.xpath(
            '//div[@class="prod-spec-title"]/p/text()').extract()
        item['url'] = response.url
        images = [
            response.xpath(
                '//div[@class="detail-photo left"]/div[@class="big-photo left"]/a/img/@src'
            ).extract()
        ]
        item['image_urls'] = images + response.xpath(
            '//div[@class="detail-photo left"]/div[@class="small-photo left"]/ul/li/a/img/@src'
        ).extract()
        return item
def start_requests(self):
    for url in self.start_urls:
        body = json.dumps({"url": url, "wait": 0.5, "js_enabled": False})
        headers = Headers({'Content-Type': 'application/json'})
        yield scrapy.Request(RENDER_HTML_URL, self.parse, method="POST",
                             body=body, headers=headers)
def __init__(self, user_agent):
    self.user_agent = user_agent
    self.headers = Headers()
def process_request(self, request, spider):
    splash_options = request.meta.get('splash')
    if not splash_options:
        return

    if request.meta.get("_splash_processed"):
        # don't process the same request more than once
        return

    if request.method != 'GET':
        logger.warn(
            "Currently only GET requests are supported by SplashMiddleware;"
            " %(request)s will be handled without Splash",
            {'request': request},
            extra={'spider': spider})
        return request

    meta = request.meta
    meta['_splash_processed'] = splash_options

    slot_policy = splash_options.get('slot_policy', self.slot_policy)
    self._set_download_slot(request, meta, slot_policy)

    args = splash_options.setdefault('args', {})
    args.setdefault('url', request.url)
    body = json.dumps(args, ensure_ascii=False)

    if 'timeout' in args:
        # User requested a Splash timeout explicitly.
        #
        # We can't catch a case when user requested `download_timeout`
        # explicitly because a default value for `download_timeout`
        # is set by DownloadTimeoutMiddleware.
        #
        # As user requested Splash timeout explicitly, we shouldn't change
        # it. Another reason not to change the requested Splash timeout is
        # because it may cause a validation error on the remote end.
        #
        # But we can change Scrapy `download_timeout`: increase
        # it when it's too small. Decreasing `download_timeout` is not
        # safe.

        # no timeout means infinite timeout
        timeout_current = meta.get('download_timeout', 1e6)
        timeout_expected = float(args['timeout']) + self.splash_extra_timeout
        if timeout_expected > timeout_current:
            meta['download_timeout'] = timeout_expected

    endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
    splash_base_url = splash_options.get('splash_url', self.splash_base_url)
    splash_url = urljoin(splash_base_url, endpoint)

    req_rep = request.replace(
        url=splash_url,
        method='POST',
        body=body,
        # FIXME: original HTTP headers (including cookies)
        # are not respected.
        headers=Headers({'Content-Type': 'application/json'}),
    )

    self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
    return req_rep
def parse(self, response):
    # first visit the search home page
    if response.url == 'https://s.taobao.com':
        # search link for the query "鼠标 无线" (wireless mouse)
        url = "https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF"
        body = json.dumps({'url': url, 'wait': 0.5})
        headers = Headers({'Content-Type': 'application/json; charset=utf-8'})
        yield Request(settings['SPLASH_RENDER_URL'], self.parse, method='POST',
                      body=body, headers=headers)
    else:
        sel = Selector(response)
        # doesn't work
        all = sel.xpath('//div[@class="item "]/div[2]')
        #file = codecs.open('page_'+str(self.iii)+'.htm', 'wb', encoding='utf-8')
        #file.write(response.body.decode('unicode_escape'))
        #self.iii += 1
        #print all
        for one in all:
            item = TaobabkItem()
            goods_price = one.xpath('div[1]/div[1]/strong/text()').extract()
            #print goods_price
            goods_sale_num = one.xpath('div[1]/div[@class="deal-cnt"]/text()').extract()
            #print goods_sale_num
            # extract the digits
            if len(goods_sale_num) > 0:
                goods_sale_num = "".join(
                    [s for s in goods_sale_num[0] if s.isdigit()])
            goods_name = one.xpath('div[2]/a/text()').extract()
            shop_name = one.xpath('div[3]/div[@class="shop"]/a/span[2]/text()').extract()
            shop_address = one.xpath('div[3]/div[@class="location"]/text()').extract()
            item['goods_price'] = goods_price
            item['goods_sale_num'] = goods_sale_num
            item['goods_name'] = [gn.encode('utf-8') for gn in goods_name]
            item['shop_name'] = [sn.encode('utf-8') for sn in shop_name]
            item['shop_address'] = [sa.encode('utf-8') for sa in shop_address]
            yield item

        next_page_urls = [
            'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=2&ntoffset=2&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=44',
            'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-1&ntoffset=-1&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=88',
            'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-4&ntoffset=-4&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=132',
            'https://s.taobao.com/search?initiative_id=staobaoz_20120515&q=%E9%BC%A0%E6%A0%87+%E6%97%A0%E7%BA%BF&bcoffset=-7&ntoffset=-7&p4plefttype=3%2C1&p4pleftnum=1%2C3&s=176'
        ]
        for next_page_url in next_page_urls:
            body = json.dumps({'url': next_page_url, 'wait': 0.5})
            headers = Headers({'Content-Type': 'application/json; charset=utf-8'})
            yield Request(settings['SPLASH_RENDER_URL'], self.parse, method='POST',
                          body=body, headers=headers)
def parse_view(self, response):
    if response.status == 200:
        try:
            print('Responded with 200 view')
            unitAtts = {}
            unitAtts['url'] = response.url

            # Details
            detailsTable = response.xpath(
                '/html/body/div[3]/div[2]/div[1]/div/div[3]/div[1]/dl')
            detailNames = detailsTable.css('dt::text').extract()
            detailValues = detailsTable.css('dd::text').extract()
            for index in range(len(detailNames)):
                unitAtts[detailNames[index].strip()] = detailValues[index].strip()

            # Rent & Fees
            feesTable = response.xpath(
                '/html/body/div[3]/div[2]/div[1]/div/div[3]/div[3]/div[2]/dl')
            feesNames = feesTable.css('dt::text').extract()
            feesValues = feesTable.css('dd::text').extract()
            for index in range(len(feesNames)):
                unitAtts[feesNames[index].encode('ascii', 'ignore').strip().encode('utf-8')] = \
                    feesValues[index].encode('ascii', 'ignore').strip().lstrip(
                        u'\xa5').replace(',', '').encode('utf-8')

            # Directions
            gmap = response.css('div.rej-map-container')
            lat = gmap.css("::attr('data-lat')").extract_first()
            lng = gmap.css("::attr('data-lng')").extract_first()
            address = (gmap.css("::attr('data-address')").extract_first() or '').encode('utf-8')
            directions1 = self.directions1.encode('utf-8')
            directions2 = self.directions2.encode('utf-8')
            directions = u''
            print("data lat={}, long={}".format(lat, lng))
            if lat == '' or lng == '' or float(lat) == 0.0 or float(lng) == 0.0:
                directions = directions1 + address + directions2
            else:
                directions = directions1 + lat + ',' + lng + directions2
            unitAtts['directions'] = directions

            callbackFn = self.parse_map(unitAtts)
            RENDER_HTML_URL = 'http://localhost:8050/render.html'
            body = json.dumps({'url': directions, 'wait': 2}, sort_keys=True)
            headers = Headers({
                'Content-Type': 'application/json',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en',
                'Referer': response.url,
                'User-Agent': 'Scrapy/1.5.0 (+https://scrapy.org)'
            })
            print("Calling maps. view={}, map={}".format(unitAtts['url'], directions))
            yield scrapy.Request(RENDER_HTML_URL, callback=callbackFn, method="POST",
                                 body=body, headers=headers, dont_filter=True)
        except:
            print("Unexpected error view={}".format(sys.exc_info()))
            raise
    else:
        print('Not ok request view:{}'.format(response.status))
def parse(self, response):
    all_cookies = response.headers.getlist('Set-Cookie')
    global total_results
    global total_pages
    # total_results=''
    # total_pages=''
    if len(all_cookies) > 4:
        guardian_endpoint = response.headers.getlist('Set-Cookie')[0].split(";")[0].split("=")[1]
        asp_net_sessionid = response.headers.getlist('Set-Cookie')[1].split(";")[0].split("=")[1]
        loggedid = response.headers.getlist('Set-Cookie')[4].split(";")[0].split("=")[1]
        cfsApplyFilters = response.headers.getlist('Set-Cookie')[1].split(";")[0].split("=")[1]
        initials = response.headers.getlist('Set-Cookie')[2].split(";")[0].split("=")[1]
        firstName = response.headers.getlist('Set-Cookie')[3].split(";")[0].split("=")[1]
        global cookies
        if len(all_cookies) > 5:
            serverid = response.headers.getlist('Set-Cookie')[5].split(";")[0].split("=")[1]
            cookies = {
                "ASP.NET_SessionId": asp_net_sessionid,
                "cfsApplyFilters": cfsApplyFilters,
                "Initials": initials,
                "FirstName": firstName,
                "LoggedIn": loggedid,
                "SERVERID": serverid,
                "GuardianEndpoint": guardian_endpoint
            }
        else:
            cookies = {
                "ASP.NET_SessionId": asp_net_sessionid,
                "cfsApplyFilters": cfsApplyFilters,
                "Initials": initials,
                "FirstName": firstName,
                "LoggedIn": loggedid,
                "GuardianEndpoint": guardian_endpoint
            }

    if response.css('ul.pagination-cfs li div span ::text'):
        total_pages = response.css('ul.pagination-cfs li div span ::text').extract_first()
        total_pages = total_pages.replace("of ", "")
        total_pages = total_pages.replace(",", "")
        total_pages = int(total_pages) + 1
    else:
        total_pages = 0

    if response.css('div h2#searchResultCount ::text'):
        total_results = response.css('div h2#searchResultCount ::text').extract_first()
        total_results = total_results.replace("Results", "")
        total_results = total_results.replace(",", "")
        total_results = total_results.strip()
    else:
        total_results = ''

    if self.color and self.get_color == 1:
        for res in response.css('li.vehicle-list'):
            # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
            # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
            href = res.css('::attr(data-detailsurl)').extract_first()
            if href:
                yield scrapy.Request(response.urljoin(href), callback=self.product_details)

    if self.color:
        self.get_color = 1
        url = "https://www.carsforsale.com/search/filtercolor"
        if int(self.page_number) > 1:
            body = json.dumps({
                "IsChecked": True,
                "Value": self.color,
                "PageNumber": str(self.page_number)
            })
        else:
            body = json.dumps({"IsChecked": True, "Value": self.color})
        headers = Headers({'Content-Type': 'application/json'})
        yield scrapy.Request(url, callback=self.parse, method='POST', body=body,
                             headers=headers, cookies=cookies)

    if int(self.page_number) > 1 and self.pages_number == 1:
        for res in response.css('li.vehicle-list'):
            # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
            # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
            href = res.css('::attr(data-detailsurl)').extract_first()
            if href:
                yield scrapy.Request(response.urljoin(href), callback=self.product_details)

    if int(self.page_number) > 1 and self.get_color == 0:
        self.pages_number = 1
        url = "https://www.carsforsale.com/search/gotopage"
        body = json.dumps({"PageNumber": str(self.page_number)})
        headers = Headers({'Content-Type': 'application/json'})
        yield scrapy.Request(url, callback=self.parse, method='POST', body=body,
                             headers=headers, cookies=cookies)

    if self.get_color == 0 and self.pages_number == 0:
        for res in response.css('li.vehicle-list'):
            # description = res.css('div.vehicle-info-section div span::attr(itemprop)').extract_first()
            # href = res.css('article div div div.col-xs-2.col-V2-lg-2 a::attr(href)').extract_first()
            href = res.css('::attr(data-detailsurl)').extract_first()
            if href:
                yield scrapy.Request(response.urljoin(href), callback=self.product_details)
def headers(self):
    global authorization_token
    return Headers({
        'Authorization': 'Bearer {}'.format(authorization_token),
        'x-guest-token': guest_token
    }, encoding=self.encoding)
def set_headers(self, headers):
    self.headers = Headers(headers or {}, encoding=self._encoding)
def process_request(self, request, spider):
    if 'prerender' not in request.meta:
        return

    if request.method not in {'GET', 'POST'}:
        logger.warning(
            "Currently only GET and POST requests are supported by "
            "PrerenderMiddleware; %(request)s will be handled without Prerender",
            {'request': request},
            extra={'spider': spider})
        return request

    if request.meta.get("_prerender_processed"):
        # don't process the same request more than once
        return

    prerender_options = request.meta['prerender']
    request.meta['_prerender_processed'] = True

    slot_policy = prerender_options.get('slot_policy', self.slot_policy)
    self._set_download_slot(request, request.meta, slot_policy)

    args = prerender_options.setdefault('args', {})

    if '_replaced_args' in prerender_options:
        # restore arguments before sending request to the downloader
        load_args = {}
        save_args = []
        local_arg_fingerprints = {}
        for name in prerender_options['_replaced_args']:
            fp = args[name]
            # Use remote Prerender argument cache: if Prerender key
            # for a value is known then don't send the value to Prerender;
            # if it is unknown then try to save the value on server using
            # ``save_args``.
            if fp in self._remote_keys:
                load_args[name] = self._remote_keys[fp]
                del args[name]
            else:
                save_args.append(name)
                args[name] = self._argument_values[fp]

            local_arg_fingerprints[name] = fp

        if load_args:
            args['load_args'] = load_args
        if save_args:
            args['save_args'] = save_args
        prerender_options['_local_arg_fingerprints'] = local_arg_fingerprints

        del prerender_options['_replaced_args']  # ??

    args.setdefault('url', request.url)
    if request.method == 'POST':
        args.setdefault('http_method', request.method)
        # XXX: non-UTF8 request bodies are not supported now
        args.setdefault('body', request.body.decode('utf8'))

    if not prerender_options.get('dont_send_headers'):
        headers = scrapy_headers_to_unicode_dict(request.headers)
        if headers:
            args.setdefault('headers', headers)

    body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)
    # print(body)

    if 'timeout' in args:
        # User requested a Prerender timeout explicitly.
        #
        # We can't catch a case when user requested `download_timeout`
        # explicitly because a default value for `download_timeout`
        # is set by DownloadTimeoutMiddleware.
        #
        # As user requested Prerender timeout explicitly, we shouldn't change
        # it. Another reason not to change the requested Prerender timeout is
        # because it may cause a validation error on the remote end.
        #
        # But we can change Scrapy `download_timeout`: increase
        # it when it's too small. Decreasing `download_timeout` is not
        # safe.
        timeout_requested = float(args['timeout'])
        timeout_expected = timeout_requested + self.prerender_extra_timeout

        # no timeout means infinite timeout
        timeout_current = request.meta.get('download_timeout', 1e6)

        if timeout_expected > timeout_current:
            request.meta['download_timeout'] = timeout_expected

    endpoint = prerender_options.setdefault('endpoint', self.default_endpoint)
    prerender_base_url = prerender_options.get('prerender_url', self.prerender_base_url)
    prerender_url = urljoin(prerender_base_url, endpoint)

    headers = Headers({'Content-Type': 'application/json'})
    headers.update(prerender_options.get('prerender_headers', {}))

    new_request = request.replace(
        url=prerender_url,
        method='POST',
        body=body,
        headers=headers,
        priority=request.priority + self.rescheduling_priority_adjust
    )
    self.crawler.stats.inc_value('prerender/%s/request_count' % endpoint)
    return new_request
def process_request(self, request, spider):
    splash_options = request.meta.get('splash')
    if not splash_options:
        return

    if request.meta.get("_splash_processed"):
        # don't process the same request more than once
        return

    if request.method != 'GET':
        logger.warn(
            "Currently only GET requests are supported by SplashMiddleware;"
            " %(request)s will be handled without Splash",
            {'request': request},
            extra={'spider': spider}
        )
        return request

    meta = request.meta
    meta['_splash_processed'] = splash_options

    slot_policy = splash_options.get('slot_policy', self.slot_policy)
    self._set_download_slot(request, meta, slot_policy)

    args = splash_options.setdefault('args', {})
    args.setdefault('url', request.url)

    proxy = meta.get('proxy')
    crawlera_proxy = proxy and _crawlera_proxy_re.match(proxy)
    if proxy:
        del meta['proxy']
        if crawlera_proxy:
            self._check_crawlera_settings(splash_options)
            # prevent crawlera middleware from processing the splash request
            meta['dont_proxy'] = True
            crawlera_settings = args.setdefault('crawlera', {})
            crawlera_headers = crawlera_settings.setdefault('headers', Headers())
            for name in request.headers.keys():
                if name.startswith('Proxy-') or name.startswith('X-Crawlera-'):
                    # Use header for every request instead of just the first one.
                    crawlera_headers[name] = request.headers.pop(name)
            crawlera_settings['host'] = crawlera_proxy.group(1)
            crawlera_settings['port'] = int(crawlera_proxy.group(2))
            args['lua_source'] = self._get_crawlera_script()
        else:
            # Pass proxy as a parameter to splash. Note that passing a
            # proxy url here is only available on splash >= 1.8
            if "://" not in proxy:
                # Support for host:port without protocol
                proxy = "http://" + proxy
            args['proxy'] = proxy

    body = json.dumps(args, ensure_ascii=False)

    if 'timeout' in args:
        # User requested a Splash timeout explicitly.
        #
        # We can't catch a case when user requested `download_timeout`
        # explicitly because a default value for `download_timeout`
        # is set by DownloadTimeoutMiddleware.
        #
        # As user requested Splash timeout explicitly, we shouldn't change
        # it. Another reason not to change the requested Splash timeout is
        # because it may cause a validation error on the remote end.
        #
        # But we can change Scrapy `download_timeout`: increase
        # it when it's too small. Decreasing `download_timeout` is not
        # safe.

        # no timeout means infinite timeout
        timeout_current = meta.get('download_timeout', 1e6)
        timeout_expected = float(args['timeout']) + self.splash_extra_timeout
        if timeout_expected > timeout_current:
            meta['download_timeout'] = timeout_expected

    if crawlera_proxy:
        endpoint = "execute"
    else:
        endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
    splash_base_url = splash_options.get('splash_url', self.splash_base_url)
    splash_url = urljoin(splash_base_url, endpoint)

    req_rep = request.replace(
        url=splash_url,
        method='POST',
        body=body,
        # FIXME: original HTTP headers (including cookies)
        # are not respected.
        headers=Headers({'Content-Type': 'application/json'}),
    )

    self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
    return req_rep
def process_request(self, request, spider):
    headers = MockHeaders()
    headers = headers.get_headers(host='www.xmrc.com.cn')
    request.headers = Headers(headers)
def __init__(
    self,
    stream_id: int,
    request: Request,
    protocol: "H2ClientProtocol",
    download_maxsize: int = 0,
    download_warnsize: int = 0,
) -> None:
    """
    Arguments:
        stream_id -- Unique identifier for the stream within a single HTTP/2 connection
        request -- The HTTP request associated to the stream
        protocol -- Parent H2ClientProtocol instance
    """
    self.stream_id: int = stream_id
    self._request: Request = request
    self._protocol: "H2ClientProtocol" = protocol

    self._download_maxsize = self._request.meta.get('download_maxsize', download_maxsize)
    self._download_warnsize = self._request.meta.get('download_warnsize', download_warnsize)

    # Metadata of an HTTP/2 connection stream
    # initialized when stream is instantiated
    self.metadata: Dict = {
        'request_content_length': 0 if self._request.body is None else len(self._request.body),

        # Flag to keep track whether the stream has initiated the request
        'request_sent': False,

        # Flag to track whether we have logged about exceeding download warnsize
        'reached_warnsize': False,

        # Each time we send a data frame, we will decrease value by the amount sent.
        'remaining_content_length': 0 if self._request.body is None else len(self._request.body),

        # Flag to keep track whether the client (self) has closed this stream
        'stream_closed_local': False,

        # Flag to keep track whether the server has closed the stream
        'stream_closed_server': False,
    }

    # Private variable used to build the response
    # this response is then converted to appropriate Response class
    # passed to the response deferred callback
    self._response: Dict = {
        # Data received frame by frame from the server is appended
        # and passed to the response Deferred when completely received.
        'body': BytesIO(),

        # The amount of data received that counts against the
        # flow control window
        'flow_controlled_size': 0,

        # Headers received after sending the request
        'headers': Headers({}),
    }

    def _cancel(_) -> None:
        # Close this stream as gracefully as possible
        # If the associated request is initiated we reset this stream
        # else we directly call close() method
        if self.metadata['request_sent']:
            self.reset_stream(StreamCloseReason.CANCELLED)
        else:
            self.close(StreamCloseReason.CANCELLED)

    self._deferred_response = Deferred(_cancel)
from scrapy.http.headers import Headers
import json

h = Headers({"X-Foo": "bar"})
print(h)
print(h.values())
#print(json.dumps(h, indent=3))
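The commented-out json.dumps call above fails because Headers stores its keys and values as bytes. A minimal sketch of serializing the headers to JSON, assuming Headers exposes to_unicode_dict() (present in recent Scrapy releases); the manual fallback decodes the items by hand:

from scrapy.http.headers import Headers
import json

h = Headers({"X-Foo": "bar"})

# Keys and values are stored as bytes internally, so decode them before
# JSON-encoding. to_unicode_dict() is assumed to exist here (it does in
# recent Scrapy versions); the except branch is a version-agnostic fallback.
try:
    plain = dict(h.to_unicode_dict())
except AttributeError:
    # Headers.items() yields (bytes_key, [bytes_value, ...]) pairs.
    plain = {k.decode("utf-8"): b", ".join(v).decode("utf-8") for k, v in h.items()}

print(json.dumps(plain, indent=3))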