def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    crawler = self.runner.create_crawler(SingleRequestSpider)
    yield crawler.crawl(seed=req0, mockserver=self.mockserver)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', crawler.spider.meta)
    self.assertNotIn('failures', crawler.spider.meta)
    # start requests don't set the Referer header
    echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
    self.assertNotIn('Referer', echo0['headers'])
    # following request sets Referer to the start request url
    echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # next request avoids the Referer header
    echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
    self.assertNotIn('Referer', echo2['headers'])
    # last request explicitly sets a Referer header
    echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
def to_unicode_dict(self):
    """Return headers as a CaselessDict with unicode keys
    and unicode values. Multiple values are joined with ','.
    """
    return CaselessDict(
        (to_unicode(key, encoding=self.encoding),
         to_unicode(b','.join(value), encoding=self.encoding))
        for key, value in self.items())
def render_GET(self, request):
    output = {
        'headers': dict(
            (to_unicode(k), [to_unicode(v) for v in vs])
            for k, vs in request.requestHeaders.getAllRawHeaders()),
        'body': to_unicode(request.content.read()),
    }
    return to_bytes(json.dumps(output))
def scrapy_headers_to_unicode_dict(headers):
    """
    Convert scrapy.http.Headers instance to a dictionary
    suitable for JSON encoding.
    """
    return {
        to_unicode(key): to_unicode(b','.join(value))
        for key, value in headers.items()
    }
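# A minimal usage sketch for scrapy_headers_to_unicode_dict, assuming
# scrapy.http.Headers, which stores each header as a list of bytes values.
from scrapy.http import Headers

hdrs = Headers({'Content-Type': 'text/html', 'X-Id': ['1', '2']})
as_dict = scrapy_headers_to_unicode_dict(hdrs)
# multi-valued headers are joined with ',' into a single unicode string
assert as_dict == {'Content-Type': 'text/html', 'X-Id': '1,2'}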
def _connect(self, factory):
    host, port = to_unicode(factory.host), factory.port
    if factory.scheme == b'https':
        return reactor.connectSSL(host, port, factory,
                                  self.ClientContextFactory())
    else:
        return reactor.connectTCP(host, port, factory)
def assertTwoItemsExported(self, item):
    self.ie.start_exporting()
    self.ie.export_item(item)
    self.ie.export_item(item)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(exported, [dict(item), dict(item)])
def _clientfactory(url, *args, **kwargs):
    # `response_transform` is captured from the enclosing scope.
    url = to_unicode(url)
    timeout = kwargs.pop('timeout', 0)
    f = client.ScrapyHTTPClientFactory(
        Request(url, *args, **kwargs), timeout=timeout)
    f.deferred.addCallback(response_transform or (lambda r: r.body))
    return f
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
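# A minimal sketch of parse_url, assuming the urllib.parse names used above
# (urlparse, ParseResult) and a w3lib-style to_unicode are in scope.
parsed = parse_url('http://example.com/path?q=1')
assert parsed.netloc == 'example.com'
# passing an already-parsed url is a no-op
assert parse_url(parsed) is parsed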
def request_to_dict(request, spider=None):
    """Convert Request object to a dict.

    If a spider is given, it will try to find out the name of the spider method
    used in the callback and store that as the callback.
    """
    cb = request.callback
    if callable(cb):
        cb = _find_method(spider, cb)
    eb = request.errback
    if callable(eb):
        eb = _find_method(spider, eb)
    d = {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': cb,
        'errback': eb,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
    }
    return d
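# A minimal sketch of request_to_dict, assuming scrapy.Request and a Spider
# subclass whose bound method serves as the callback; DemoSpider is
# hypothetical, for illustration only.
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'

    def parse(self, response):
        pass

spider = DemoSpider()
req = scrapy.Request('http://example.com', callback=spider.parse)
d = request_to_dict(req, spider=spider)
# the bound callback is stored by method name, so the dict is serializable
assert d['callback'] == 'parse'
assert d['url'] == 'http://example.com'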
def to_native_str(text, encoding=None, errors='strict'):
    """ Return str representation of `text`
    (bytes in Python 2.x and unicode in Python 3.x). """
    if six.PY2:
        return to_bytes(text, encoding, errors)
    else:
        return to_unicode(text, encoding, errors)
def from_content_disposition(self, content_disposition):
    try:
        filename = to_unicode(
            content_disposition, encoding='latin-1', errors='replace'
        ).split(';')[1].split('=')[1].strip('"\'')
        return self.from_filename(filename)
    except IndexError:
        # no filename found: fall back to the generic Response class
        return Response
def _debug_set_cookie(self, response, spider):
    if self.debug:
        cl = [to_unicode(c, errors='replace')
              for c in response.headers.getlist('Set-Cookie')]
        if cl:
            cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
            msg = f"Received cookies from: {response}\n{cookies}"
            logger.debug(msg, extra={'spider': spider})
def _debug_cookie(self, request, spider):
    if self.debug:
        cl = [to_unicode(c, errors='replace')
              for c in request.headers.getlist('Cookie')]
        if cl:
            cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
            msg = f"Sending cookies to: {request}\n{cookies}"
            logger.debug(msg, extra={'spider': spider})
def test_nonstring_types_item(self):
    item = self._get_nonstring_types_item()
    self.ie.start_exporting()
    self.ie.export_item(item)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    item['time'] = str(item['time'])
    self.assertEqual(exported, [item])
def _body_inferred_encoding(self):
    if self._cached_benc is None:
        content_type = to_unicode(self.headers.get(b'Content-Type', b''))
        benc, ubody = html_to_unicode(content_type, self.body,
                                      auto_detect_fun=self._auto_detect_fun,
                                      default_encoding=self._DEFAULT_ENCODING)
        self._cached_benc = benc
        self._cached_ubody = ubody
    return self._cached_benc
def replace_chars(text, which_ones=('\n', '\t', '\x85', '\x97'),
                  replace_by=u'', encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\x85``, ``\\x97``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.
    """
    text = to_unicode(text, encoding)
    for ec in which_ones:
        text = text.replace(ec, to_unicode(replace_by, encoding))
    return text
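# A minimal sketch of replace_chars, assuming the function above with its
# default escape-character tuple; bytes input is decoded via to_unicode first.
assert replace_chars('one\ntwo\tthree') == 'onetwothree'
assert replace_chars(b'a\tb', replace_by=' ') == 'a b'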
def _connect(self, factory):
    host, port = to_unicode(factory.host), factory.port
    if factory.scheme == b'https':
        client_context_factory = create_instance(
            self.ClientContextFactory,
            settings=self._settings,
            crawler=None)
        return reactor.connectSSL(
            host, port, factory, client_context_factory)
    else:
        return reactor.connectTCP(host, port, factory)
def test_nested_item(self):
    i1 = TestItem(name=u'Joseph', age='22')
    i2 = dict(name=u'Maria', age=i1)
    i3 = TestItem(name=u'Jesus', age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(exported, self._expected_nested)
def test_nested_item(self):
    i1 = self.item_class(name="Joseph", age="22")
    i2 = dict(name="Maria", age=i1)
    i3 = self.item_class(name="Jesus", age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(exported, self._expected_nested)
def assertTwoItemsExported(self, item):
    self.ie.start_exporting()
    self.ie.export_item(item)
    self.ie.export_item(item)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(
        exported,
        [ItemAdapter(item).asdict(), ItemAdapter(item).asdict()])
def _serialize_dict(self, value, pre=None, field_filter=None):
    for key, val in value.items():
        k = None
        if field_filter:
            if pre is not None:
                k = pre_join(pre, key)
            if k in field_filter:
                continue
        yield to_unicode(key), self._serialize_value(
            val, pre=k, field_filter=field_filter)
def test_unicode_url(self):
    # instantiate with unicode url without encoding (should set default encoding)
    resp = self.response_class(u"http://www.example.com/")
    self._assert_response_encoding(resp, self.response_class._DEFAULT_ENCODING)

    # make sure urls are converted to str
    resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
    assert isinstance(resp.url, str)

    resp = self.response_class(url=u"http://www.example.com/price/\xa3",
                               encoding='utf-8')
    self.assertEqual(resp.url, to_unicode(b'http://www.example.com/price/\xc2\xa3'))
    resp = self.response_class(url=u"http://www.example.com/price/\xa3",
                               encoding='latin-1')
    self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
    resp = self.response_class(u"http://www.example.com/price/\xa3",
                               headers={"Content-type": ["text/html; charset=utf-8"]})
    self.assertEqual(resp.url, to_unicode(b'http://www.example.com/price/\xc2\xa3'))
    resp = self.response_class(u"http://www.example.com/price/\xa3",
                               headers={"Content-type": ["text/html; charset=iso-8859-1"]})
    self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
def _debug_cookie(self, request, spider):
    if self.debug:
        cl = [
            to_unicode(c, errors='replace')
            for c in request.headers.getlist('Cookie')
        ]
        if cl:
            cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
            msg = "Sending cookies to: {}\n{}".format(request, cookies)
            logger.debug(msg, extra={'spider': spider})
def test_nested_dict_item(self):
    i1 = dict(name="Joseph\xa3", age="22")
    i2 = self.item_class(name="Maria", age=i1)
    i3 = dict(name="Jesus", age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    expected = {"name": "Jesus", "age": {"name": "Maria", "age": i1}}
    self.assertEqual(exported, [expected])
def test_nested_dict_item(self):
    i1 = dict(name=u"Joseph\xa3", age="22")
    i2 = TestItem(name=u"Maria", age=i1)
    i3 = dict(name=u"Jesus", age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    expected = {"name": u"Jesus", "age": {"name": "Maria", "age": i1}}
    self.assertEqual(exported, [expected])
def test_nested_item(self):
    i1 = self.item_class(name='Joseph\xa3', age='22')
    i2 = self.item_class(name='Maria', age=i1)
    i3 = self.item_class(name='Jesus', age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    expected = {'name': 'Jesus',
                'age': {'name': 'Maria', 'age': ItemAdapter(i1).asdict()}}
    self.assertEqual(exported, [expected])
def proc(self, *new_args, **popen_kwargs):
    args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
    p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         **popen_kwargs)

    def kill_proc():
        p.kill()
        p.communicate()
        assert False, 'Command took too much time to complete'

    timer = Timer(15, kill_proc)
    try:
        timer.start()
        stdout, stderr = p.communicate()
    finally:
        timer.cancel()

    return p, to_unicode(stdout), to_unicode(stderr)
def response_to_dict(response):
    d = {
        'url': to_unicode(response.url),
        'status': int(response.status),
        'headers': dict(response.headers),
        'body': response.body,
        'flags': list(response.flags),
        'request': request_to_dict(response.request),
    }
    return d
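# A minimal sketch of response_to_dict, assuming scrapy.http objects and a
# request_to_dict helper like the one defined earlier in this collection.
from scrapy.http import Request, TextResponse

req = Request('http://example.com')
resp = TextResponse('http://example.com', status=200,
                    headers={'Content-Type': 'text/html'},
                    body=b'<html/>', request=req)
d = response_to_dict(resp)
assert d['status'] == 200 and d['body'] == b'<html/>'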
def _build_response(self, body, request):
    request.meta['download_latency'] = self.headers_time - self.start_time
    status = int(self.status)
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self._url)
    return respcls(url=self._url, status=status, headers=headers, body=body,
                   protocol=to_unicode(self.version))
def test_nested_dict_item(self):
    i1 = dict(name=u'Joseph\xa3', age='22')
    i2 = TestItem(name=u'Maria', age=i1)
    i3 = dict(name=u'Jesus', age=i2)
    self.ie.start_exporting()
    self.ie.export_item(i3)
    self.ie.finish_exporting()
    exported = json.loads(to_unicode(self.output.getvalue()))
    expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': i1}}
    self.assertEqual(exported, [expected])
def assertExportedJsonLines(self, items, rows, settings=None):
    settings = settings or {}
    settings.update({
        'FEEDS': {
            self._random_temp_filename(): {'format': 'jl'},
        },
    })
    data = yield self.exported_data(items, settings)
    parsed = [json.loads(to_unicode(line)) for line in data['jl'].splitlines()]
    rows = [{k: v for k, v in row.items() if v} for row in rows]
    self.assertEqual(rows, parsed)
def _serialize_value(self, value):
    if isinstance(value, BaseItem):
        return self.export_item(value)
    if isinstance(value, dict):
        return dict(self._serialize_dict(value))
    if is_listlike(value):
        return [self._serialize_value(v) for v in value]
    if self.binary:
        return to_bytes(value, encoding=self.encoding)
    else:
        return to_unicode(value, encoding=self.encoding)
def push(self, request):
    url = request.url
    cb = request.callback
    if callable(cb):
        cb = _find_method(self.spider, cb)
    eb = request.errback
    if callable(eb):
        eb = _find_method(self.spider, eb)
    d = {'url': to_unicode(url), 'callback': cb, 'errback': eb}
    d = self.serializer.dumps(d)
    self.server.rpush(self.key, d)
def parse_data(data):
    if isinstance(data, (dict, scrapy.Item)):
        return {parse_data(k): parse_data(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [parse_data(x) for x in data]
    elif isinstance(data, bytes):
        return to_unicode(data)
    elif isinstance(data, datetime):
        return data.isoformat()
    elif isinstance(data, (int, float)):
        return data
    return str(data)
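# A minimal sketch of parse_data, assuming the datetime import used above;
# bytes become unicode, datetimes become ISO strings, containers recurse.
from datetime import datetime

raw = {b'when': datetime(2021, 1, 1), 'tags': [b'a', b'b'], 'count': 3}
assert parse_data(raw) == {'when': '2021-01-01T00:00:00',
                           'tags': ['a', 'b'], 'count': 3}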
def _connect(self, factory):
    from twisted.internet import reactor
    host, port = to_unicode(factory.host), factory.port
    if factory.scheme == b'https':
        client_context_factory = create_instance(
            objcls=self.ClientContextFactory,
            settings=self._settings,
            crawler=self._crawler,
        )
        return reactor.connectSSL(host, port, factory, client_context_factory)
    else:
        return reactor.connectTCP(host, port, factory)
def _serialize_value(self, value):
    try:
        if isinstance(value, dict):
            return dict(self._serialize_dict(value))
        value = super(TextDictKeyPythonItemExporter, self)._serialize_value(value)
    except UnicodeDecodeError as e:
        if self.ensure_base64 and isinstance(value, bytes):
            value = to_unicode(base64.encodebytes(value))
        else:
            raise e
    return value
def _get_agent(self, request, timeout):
    from twisted.internet import reactor
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        if omitConnectTunnel:
            warnings.warn(
                "Using HTTPS proxies in the noconnect mode is deprecated. "
                "If you use Zyte Smart Proxy Manager, it doesn't require "
                "this mode anymore, so you should update scrapy-crawlera "
                "to scrapy-zyte-smartproxy and remove '?noconnect' "
                "from the Zyte Smart Proxy Manager URL.",
                ScrapyDeprecationWarning,
            )
        if scheme == b'https' and not omitConnectTunnel:
            proxyAuth = request.headers.get(b'Proxy-Authorization', None)
            proxyConf = (proxyHost, proxyPort, proxyAuth)
            return self._TunnelingAgent(
                reactor=reactor,
                proxyConf=proxyConf,
                contextFactory=self._contextFactory,
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )
        else:
            proxyScheme = proxyScheme or b'http'
            proxyHost = to_bytes(proxyHost, encoding='ascii')
            proxyPort = to_bytes(str(proxyPort), encoding='ascii')
            proxyURI = urlunparse(
                (proxyScheme, proxyNetloc, proxyParams, '', '', ''))
            return self._ProxyAgent(
                reactor=reactor,
                proxyURI=to_bytes(proxyURI, encoding='ascii'),
                connectTimeout=timeout,
                bindAddress=bindaddress,
                pool=self._pool,
            )

    return self._Agent(
        reactor=reactor,
        contextFactory=self._contextFactory,
        connectTimeout=timeout,
        bindAddress=bindaddress,
        pool=self._pool,
    )
def export_item(self, item):
    if self._headers_not_written:
        self._headers_not_written = False
        self._write_headers_and_set_fields_to_export(item)

    fields = self._get_serialized_fields(
        item, default_value='', include_empty=True)
    values = list(self._build_row(x for _, x in fields))
    self.stream.write(to_unicode(
        serialize_html_table_row(values), self.encoding))
def test_jsonrpc_client_call_request(self):
    sentcall = {}

    def _urlopen(url, data):
        sentcall['url'] = url
        sentcall['data'] = data
        return _umock(1)

    with patch.object(urllib.request, 'urlopen', _urlopen):
        jsonrpc_client_call('url', 'test', 'one', 2)
    req = json.loads(to_unicode(sentcall['data']))
    assert 'id' in req
    self.assertEqual(sentcall['url'], 'url')
    self.assertEqual(req['jsonrpc'], '2.0')
    self.assertEqual(req['method'], 'test')
    self.assertEqual(req['params'], ['one', 2])
def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8,
        # disregard it. Switch to 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. "
            "File either contains garbage or is in an encoding other than "
            "UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider})
        robotstxt_body = ''
    return robotstxt_body
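# A minimal sketch of decode_robotstxt, assuming the module-level `logger`
# used above is configured; spider=None is passed purely for illustration.
body = b"User-agent: *\nDisallow: /private/\n"
assert decode_robotstxt(body, spider=None) == "User-agent: *\nDisallow: /private/\n"
# undecodable bytes fall back to an empty ('allow all') body
assert decode_robotstxt(b'\xff\xfe garbage', spider=None) == ''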
def assertExportedMultiple(self, items, rows, settings=None):
    settings = settings or {}
    settings.update({
        'FEEDS': {
            self._random_temp_filename(): {'format': 'xml'},
            self._random_temp_filename(): {'format': 'json'},
        },
    })
    data = yield self.exported_data(items, settings)
    rows = [{k: v for k, v in row.items() if v} for row in rows]
    # XML
    root = lxml.etree.fromstring(data['xml'])
    xml_rows = [{e.tag: e.text for e in it} for it in root.findall('item')]
    self.assertEqual(rows, xml_rows)
    # JSON
    json_rows = json.loads(to_unicode(data['json']))
    self.assertEqual(rows, json_rows)
def assertExportedCsv(self, items, header, rows, settings=None, ordered=True):
    settings = settings or {}
    settings.update({
        'FEEDS': {
            self._random_temp_filename(): {'format': 'csv'},
        },
    })
    data = yield self.exported_data(items, settings)
    reader = csv.DictReader(to_unicode(data['csv']).splitlines())
    got_rows = list(reader)
    if ordered:
        self.assertEqual(reader.fieldnames, header)
    else:
        self.assertEqual(set(reader.fieldnames), set(header))
    self.assertEqual(rows, got_rows)
def _extract_links(self, response_text, response_url, response_encoding,
                   base_url=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()

    ret = []
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        if isinstance(link.url, unicode):  # Python 2 `unicode` type
            link.url = link.url.encode(response_encoding)
        link.url = urljoin(base_url, link.url)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
        ret.append(link)

    return ret
def _get_agent(self, request, timeout):
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                                        contextFactory=self._contextFactory,
                                        connectTimeout=timeout,
                                        bindAddress=bindaddress,
                                        pool=self._pool)
        else:
            return self._ProxyAgent(reactor,
                                    proxyURI=to_bytes(proxy, encoding='ascii'),
                                    connectTimeout=timeout,
                                    bindAddress=bindaddress,
                                    pool=self._pool)

    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindaddress,
                       pool=self._pool)
def parse_x_splash_saved_arguments_header(value):
    """ Parse X-Splash-Saved-Arguments header value.

    >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3"
    >>> dct = parse_x_splash_saved_arguments_header(value)
    >>> sorted(list(dct.keys()))
    ['name1', 'name2']
    >>> dct['name1']
    '9a6747fc6259aa374ab4e1bb03074b6ec672cf99'
    >>> dct['name2']
    'ba001160ef96fe2a3f938fea9e6762e204a562b3'

    Binary header values are also supported:

    >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8'))
    >>> dct2 == dct
    True
    """
    value = to_unicode(value)
    return dict(kv.split('=', 1) for kv in value.split(";"))
def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
def extract_regex(regex, text, encoding="utf-8"): """Extract a list of unicode strings from the given text/encoding using the following policies: * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ if isinstance(regex, six.string_types): regex = re.compile(regex, re.UNICODE) try: strings = [regex.search(text).group("extract")] # named group except: strings = regex.findall(text) # full regex or numbered groups strings = flatten(strings) if isinstance(text, six.text_type): return [replace_entities(s, keep=["lt", "amp"]) for s in strings] else: return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
def _getrow(csv_r):
    # `encoding` is captured from the enclosing scope.
    return [to_unicode(field, encoding) for field in next(csv_r)]
def assertCsvEqual(self, first, second, msg=None):
    first = to_unicode(first)
    second = to_unicode(second)
    csvsplit = lambda csv: [sorted(re.split(r'(,|\s+)', line))
                            for line in csv.splitlines(True)]
    return self.assertEqual(csvsplit(first), csvsplit(second), msg)
def _check_output(self):
    self.assertCsvEqual(to_unicode(self.output.getvalue()),
                        u'age,name\r\n22,John\xa3\r\n')
def test_errors_argument(self):
    self.assertEqual(
        to_unicode(b'a\xedb', 'utf-8', errors='replace'),
        u'a\ufffdb'
    )
def _assert_expected_item(self, exported_dict):
    for k, v in exported_dict.items():
        exported_dict[k] = to_unicode(v)
    self.assertEqual(self.i, exported_dict)
def _check_output(self):
    exported = json.loads(to_unicode(self.output.getvalue().strip()))
    self.assertEqual(exported, [dict(self.i)])
def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
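# A minimal sketch of rfc1123_to_epoch, assuming the email.utils helpers
# (parsedate_tz, mktime_tz) imported by the module above.
epoch = rfc1123_to_epoch(b'Thu, 01 Jan 1970 00:00:10 GMT')
assert epoch == 10
# unparseable input returns None instead of raising
assert rfc1123_to_epoch('not a date') is None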
def _check_Encoding(self, response, original_body):
    content_encoding = to_unicode(response.headers[b'Content-Encoding'])
    self.assertEqual(content_encoding, EncodingResource.out_encoding)
    self.assertEqual(
        response.body.decode(content_encoding), to_unicode(original_body))
def testFactoryInfo(self):
    url = self.getURL('file')
    _, _, host, port, _ = client._parse(url)
    factory = client.ScrapyHTTPClientFactory(Request(url))
    reactor.connectTCP(to_unicode(host), port, factory)
    return factory.deferred.addCallback(self._cbFactoryInfo, factory)
def render(self, request):
    body = to_unicode(request.content.read())
    request.setHeader(b'content-encoding', self.out_encoding)
    return body.encode(self.out_encoding)