def test_spider_with_no_rules_attribute(self):
    """Using -r with a spider with no rule should not produce items."""
    status, out, stderr = yield self.execute(
        ['--spider', self.spider_name, '-r', self.url('/html')]
    )
    self.assertRegexpMatches(to_native_str(out), """# Scraped Items -+\n\[\]""")
    self.assertIn("""No CrawlSpider rules found""", to_native_str(stderr))

def test_crawlspider_no_matching_rule(self):
    """The requested URL has no matching rule, so no items should be scraped"""
    status, out, stderr = yield self.execute(
        ['--spider', 'badcrawl' + self.spider_name, '-r', self.url('/enc-gb18030')]
    )
    self.assertRegexpMatches(to_native_str(out), """# Scraped Items -+\n\[\]""")
    self.assertIn("""Cannot find a rule that matches""", to_native_str(stderr))

def header_items(self):
    return [
        (to_native_str(k, errors='replace'),
         [to_native_str(x, errors='replace') for x in v])
        for k, v in self.request.headers.items()
    ]

def test_template(self, tplname="crawl"): args = ["--template=%s" % tplname] if tplname else [] spname = "test_spider" p = self.proc("genspider", spname, "test.com", *args) out = to_native_str(retry_on_eintr(p.stdout.read)) self.assertIn("Created spider %r using template %r in module" % (spname, tplname), out) self.assertTrue(exists(join(self.proj_mod_path, "spiders", "test_spider.py"))) p = self.proc("genspider", spname, "test.com", *args) out = to_native_str(retry_on_eintr(p.stdout.read)) self.assertIn("Spider %r already exists in module" % spname, out)
def test_template(self, tplname='crawl'):
    args = ['--template=%s' % tplname] if tplname else []
    spname = 'test_spider'
    p = self.proc('genspider', spname, 'test.com', *args)
    out = to_native_str(retry_on_eintr(p.stdout.read))
    self.assertIn("Created spider %r using template %r in module" % (spname, tplname), out)
    self.assertTrue(exists(join(self.proj_mod_path, 'spiders', 'test_spider.py')))
    p = self.proc('genspider', spname, 'test.com', *args)
    out = to_native_str(retry_on_eintr(p.stdout.read))
    self.assertIn("Spider %r already exists in module" % spname, out)

def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )

def test_streaming_args(self):
    path = os.path.abspath(os.path.dirname(__file__))
    test1 = os.path.join(path, 'spiders', 'sample1.py')
    p = self.proc('streaming', 'python', '-a', test1)
    log = to_native_str(p.stderr.read())
    self.assertIn('sample1.py working', log)

def request_from_dict(d, spider=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and spider:
        cb = _get_method(spider, cb)
    eb = d['errback']
    if eb and spider:
        eb = _get_method(spider, eb)
    request_cls = load_object(d['_class']) if '_class' in d else Request
    return request_cls(
        url=to_native_str(d['url']),
        callback=cb,
        errback=eb,
        method=d['method'],
        headers=d['headers'],
        body=d['body'],
        cookies=d['cookies'],
        meta=d['meta'],
        encoding=d['_encoding'],
        priority=d['priority'],
        dont_filter=d['dont_filter'],
        flags=d.get('flags'))

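# --- Usage sketch (not part of the original sources) ---
# Feeds a hand-built dict through request_from_dict() above; the keys mirror
# exactly what the function reads. `my_spider` is a hypothetical spider
# instance whose parse() method the 'callback' name resolves to.
d = {
    'url': 'http://www.example.com/',
    'callback': 'parse',       # resolved to my_spider.parse because a spider is passed
    'errback': None,
    'method': 'GET',
    'headers': {},
    'body': b'',
    'cookies': {},
    'meta': {},
    '_encoding': 'utf-8',
    'priority': 0,
    'dont_filter': False,
}
request = request_from_dict(d, spider=my_spider)
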
def assertExportedJsonLines(self, items, rows, settings=None):
    settings = settings or {}
    settings.update({'FEED_FORMAT': 'jl'})
    data = yield self.exported_data(items, settings)
    parsed = [json.loads(to_native_str(line)) for line in data.splitlines()]
    rows = [{k: v for k, v in row.items() if v} for row in rows]
    self.assertEqual(rows, parsed)

def _extract_links(self, selector, response_url, response_encoding, base_url):
    '''
    Pretty much the same function, just added 'ignore' to to_native_str()
    '''
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        # added 'ignore' to encoding errors
        url = to_native_str(url, encoding=response_encoding, errors='ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=rel_has_nofollow(el.get('rel')))
        links.append(link)
    return self._deduplicate_if_needed(links)

def from_content_disposition(self, content_disposition):
    try:
        filename = to_native_str(content_disposition).split(';')[1].split('=')[1]
        filename = filename.strip('"\'')
        return self.from_filename(filename)
    except IndexError:
        return Response

def from_content_type(self, content_type, content_encoding=None):
    """Return the most appropriate Response class from an HTTP Content-Type
    header
    """
    if content_encoding:
        return Response
    mimetype = to_native_str(content_type).split(';')[0].strip().lower()
    return self.from_mimetype(mimetype)

def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_native_str(url, encoding))

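# Usage sketch (not part of the original sources): parse_url() accepts either
# a URL string or an already parsed ParseResult, and is a no-op on the latter.
parts = parse_url('http://www.example.com/index.html?arg=1')
assert parse_url(parts) is parts
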
def test_unicode_url(self):
    # instantiate with unicode url without encoding (should set default encoding)
    resp = self.response_class(u"http://www.example.com/")
    self._assert_response_encoding(resp, self.response_class._DEFAULT_ENCODING)

    # make sure urls are converted to str
    resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
    assert isinstance(resp.url, str)

    resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='utf-8')
    self.assertEqual(resp.url, to_native_str(b'http://www.example.com/price/\xc2\xa3'))
    resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='latin-1')
    self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
    resp = self.response_class(u"http://www.example.com/price/\xa3",
                               headers={"Content-type": ["text/html; charset=utf-8"]})
    self.assertEqual(resp.url, to_native_str(b'http://www.example.com/price/\xc2\xa3'))
    resp = self.response_class(u"http://www.example.com/price/\xa3",
                               headers={"Content-type": ["text/html; charset=iso-8859-1"]})
    self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')

def _debug_cookie(self, request, spider):
    if self.debug:
        cl = [to_native_str(c, errors='replace')
              for c in request.headers.getlist('Cookie')]
        if cl:
            cookies = "\n".join("Cookie: {0}\n".format(c) for c in cl)
            msg = "Sending cookies to: {0}\n{1}".format(request, cookies)
            logger.debug(msg, extra={'spider': spider})

def test_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'url': url}}
    })
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}

def _debug_set_cookie(self, response, spider):
    if self.debug:
        cl = [to_native_str(c, errors='replace')
              for c in response.headers.getlist('Set-Cookie')]
        if cl:
            cookies = "\n".join("Set-Cookie: {0}\n".format(c) for c in cl)
            msg = "Received cookies from: {0}\n{1}".format(response, cookies)
            logger.debug(msg, extra={'spider': spider})

def from_content_disposition(self, content_disposition):
    try:
        filename = to_native_str(content_disposition,
                                 encoding='latin-1',
                                 errors='replace').split(';')[1].split('=')[1]
        filename = filename.strip('"\'')
        return self.from_filename(filename)
    except IndexError:
        return Response

def _set_url(self, url):
    if isinstance(url, six.text_type):
        if six.PY2 and self.encoding is None:
            raise TypeError("Cannot convert unicode url - %s "
                            "has no encoding" % type(self).__name__)
        self._url = to_native_str(url, self.encoding)
    else:
        super(TextResponse, self)._set_url(url)

def _set_url(self, url):
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
    url = to_native_str(url.strip(), self.encoding)
    self._url = escape_ajax(safe_url_string(url))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)

def test_runspider_unable_to_load(self):
    tmpdir = self.mktemp()
    os.mkdir(tmpdir)
    fname = abspath(join(tmpdir, 'myspider.txt'))
    with open(fname, 'w') as f:
        f.write("")
    p = self.proc('runspider', fname)
    log = to_native_str(p.stderr.read())
    self.assertIn("Unable to load", log)

def _body_inferred_encoding(self):
    if self._cached_benc is None:
        content_type = to_native_str(self.headers.get(b'Content-Type', b''))
        benc, ubody = html_to_unicode(content_type, self.body,
                                      auto_detect_fun=self._auto_detect_fun,
                                      default_encoding=self._DEFAULT_ENCODING)
        self._cached_benc = benc
        self._cached_ubody = ubody
    return self._cached_benc

def test_float_wait_arg():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.html',
            'args': {'wait': 0.5}
        }
    })
    req = mw.process_request(req1, None)
    assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}

def response_status_message(status): """Return status code plus status text descriptive message >>> response_status_message(200) '200 OK' >>> response_status_message(404) '404 Not Found' """ return '%s %s' % (status, to_native_str(http.RESPONSES.get(int(status))))
def test_override_splash_url():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.png',
            'splash_url': 'http://splash.example.com'
        }
    })
    req = mw.process_request(req1, None)
    assert req.url == 'http://splash.example.com/render.png'
    assert json.loads(to_native_str(req.body)) == {'url': req1.url}

def test_runspider_no_spider_found(self):
    tmpdir = self.mktemp()
    os.mkdir(tmpdir)
    fname = abspath(join(tmpdir, 'myspider.py'))
    with open(fname, 'w') as f:
        f.write("""
from scrapy.spiders import Spider
""")
    p = self.proc('runspider', fname)
    log = to_native_str(p.stderr.read())
    self.assertIn("No spider found in file", log)

def test_startproject_template_override(self):
    copytree(join(scrapy.__path__[0], 'templates'), self.tmpl)
    os.mknod(join(self.tmpl_proj, 'root_template'))
    assert exists(join(self.tmpl_proj, 'root_template'))

    args = ['--set', 'TEMPLATES_DIR=%s' % self.tmpl]
    p = self.proc('startproject', self.project_name, *args)
    out = to_native_str(retry_on_eintr(p.stdout.read))
    self.assertIn("New Scrapy project %r, using template directory %r, created in:" %
                  (self.project_name, join(self.tmpl, 'project')), out)
    assert exists(join(self.proj_path, 'root_template'))

def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )

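# Usage sketch (not part of the original sources): the 6-tuple returned by
# _safe_ParseResult() above lines up with urlunparse(), so a URL containing
# non-ASCII pieces can be re-encoded into a safe ASCII form. Assumes the
# helpers used by _safe_ParseResult (quote, to_bytes, _safe_chars,
# to_native_str) are in scope, as in its original module.
from six.moves.urllib.parse import urlparse, urlunparse

parts = urlparse(u'http://www.ex\u00e1mple.com/a b?price=\u00a31000')
safe_url = urlunparse(_safe_ParseResult(parts, encoding='utf8'))
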
def test_from_response_unicode_clickdata(self):
    response = _buildresponse(
        u"""<form action="get.php" method="GET">
        <input type="submit" name="price in \u00a3" value="\u00a3 1000">
        <input type="submit" name="price in \u20ac" value="\u20ac 2000">
        <input type="hidden" name="poundsign" value="\u00a3">
        <input type="hidden" name="eurosign" value="\u20ac">
        </form>""")
    req = self.request_class.from_response(response,
                                           clickdata={'name': u'price in \u00a3'})
    fs = _qs(req)
    self.assertTrue(fs[to_native_str(u'price in \u00a3')])

def _qs(req, encoding='utf-8', to_unicode=False):
    if req.method == 'POST':
        qs = req.body
    else:
        qs = req.url.partition('?')[2]
    if six.PY2:
        uqs = unquote(to_native_str(qs, encoding))
    elif six.PY3:
        uqs = unquote_to_bytes(qs)
    if to_unicode:
        uqs = uqs.decode(encoding)
    return parse_qs(uqs, True)

def request_from_dict(d, spider=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and spider:
        cb = _get_method(spider, cb)
    eb = d['errback']
    if eb and spider:
        eb = _get_method(spider, eb)
    request_cls = load_object(d['_class']) if '_class' in d else Request
    return request_cls(url=to_native_str(d['url']),
                       callback=cb,
                       errback=eb,
                       method=d['method'],
                       headers=d['headers'],
                       body=d['body'],
                       cookies=d['cookies'],
                       meta=d['meta'],
                       encoding=d['_encoding'],
                       priority=d['priority'],
                       dont_filter=d['dont_filter'])

def _parse_robots(self, response, netloc):
    self.crawler.stats.inc_value('robotstxt/response_count')
    self.crawler.stats.inc_value(
        'robotstxt/response_status_count/{}'.format(response.status))
    rp = robotparser.RobotFileParser(response.url)
    body = ''
    if hasattr(response, 'text'):
        body = response.text
    else:
        # last effort try
        try:
            body = response.body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage, disregard it,
            # but keep the lookup cached (in self._parsers).
            # Running rp.parse() will set rp state from
            # 'disallow all' to 'allow any'.
            self.crawler.stats.inc_value('robotstxt/unicode_error_count')
    # stdlib's robotparser expects native 'str';
    # with unicode input, non-ASCII encoded bytes decoding fails in Python 2
    rp.parse(to_native_str(body).splitlines())

    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = rp
    rp_dfd.callback(rp)

def policy(self, resp_or_url, request):
    """
    Determine Referrer-Policy to use from a parent Response (or URL),
    and a Request to be sent.

    - if a valid policy is set in Request meta, it is used.
    - if the policy is set in meta but is wrong (e.g. a typo error),
      the policy from settings is used
    - if the policy is not set in Request meta,
      but there is a Referrer-policy header in the parent response,
      it is used if valid
    - otherwise, the policy from settings is used.
    """
    policy_name = request.meta.get('referrer_policy')
    if policy_name is None:
        if isinstance(resp_or_url, Response):
            policy_header = resp_or_url.headers.get('Referrer-Policy')
            if policy_header is not None:
                policy_name = to_native_str(policy_header.decode('latin1'))
    if policy_name is None:
        return self.default_policy()
    cls = _load_policy_class(policy_name, warning_only=True)
    return cls() if cls else self.default_policy()

def file_path(self, request, response=None, info=None):
    start_time = self._get_start_time(info.spider)
    start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
    content_type = ''
    if response:
        # This is to cover the case when the url has . after the last /
        # and the text after the . is not a file extension but the response is a json
        content_type = to_native_str(response.headers['Content-Type'])
    url = request.url
    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    media_ext = os.path.splitext(url)[1]
    if hasattr(info.spider, 'ext'):
        media_ext = info.spider.ext
    elif not media_ext or ('json' in content_type and media_ext != '.json'):
        media_ext = '.json'
    # Put files in a directory named after the scraper they came from, and the scraper starttime
    if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
        return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str, media_guid, media_ext)
    else:
        return '%s/%s/%s%s' % (info.spider.name, start_time_str, media_guid, media_ext)

def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            if self.strip:
                attr_val = strip_html5_whitespace(attr_val)
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=rel_has_nofollow(el.get('rel')))
        links.append(link)
    return self._deduplicate_if_needed(links)

def _build_row(self, values):
    for s in values:
        try:
            yield to_native_str(s, self.encoding)
        except TypeError:
            yield s

def referer_str(request):
    """ Return Referer HTTP header suitable for logging. """
    referrer = request.headers.get('Referer')
    if referrer is None:
        return referrer
    return to_native_str(referrer, errors='replace')

def parse_response(self, response):
    parser = None
    if response.status >= 300 and response.status < 400:
        self.recursive_flag = False
        location = to_native_str(
            response.headers['location'].decode('latin1'))
        request = response.request
        redirected_url = urljoin(request.url, location)
        req = self.make_request(reqtype='regular', url=redirected_url,
                                dont_filter=True, shared=False)
        if response.meta['reqtype']:
            req.meta['reqtype'] = response.meta['reqtype']
        req.meta['req_once_logged'] = response.meta[
            'req_once_logged'] if 'req_once_logged' in response.meta else response.request
        req.priority = 50
        self.logger.warning(
            "%s: Being redirected from %s to %s" %
            (self.login['username'], request.url, redirected_url))
        yield req
    elif self.islogged(response) is False:
        self.recursive_flag = False
        # DDoS and login handling block.
        req_once_logged = response.meta[
            'req_once_logged'] if 'req_once_logged' in response.meta else response.request
        if self.is_login_page(response) is True:
            self.logger.warning("On login page. Proceeding to log in.")
            self.logintrial += 1
            if self.logintrial > self.settings['MAX_LOGIN_RETRY']:
                self.wait_for_input("Too many login failed", req_once_logged)
                self.logintrial = 0
                return
            yield self.make_request(reqtype='dologin', response=response,
                                    dont_filter=True,
                                    req_once_logged=req_once_logged,
                                    shared=False)
        elif self.islogged(response) is False:
            self.logger.warning("Going to login page.")
            self.captcha_trial = 0
            self.logintrial = 0
            yield self.make_request(reqtype='loginpage',
                                    req_once_logged=req_once_logged,
                                    dont_filter=True, shared=False)
        else:
            self.logger.warning(
                'DDoS/Login-block: This is not supposed to happen. HTML %s' %
                response.body)
    else:
        self.recursive_flag = True
        self.captcha_trial = 0
        self.logintrial = 0
        if response.meta['reqtype'] == 'dologin':
            self.logger.info(
                "Successfully logged in as %s! Setting parsing flag and "
                "returning to stored request %s" %
                (self.login['username'], response.meta['req_once_logged']))
            yield response.meta['req_once_logged']
        if self.is_listing_page(response):
            if self.is_multilisting(response):
                parser = self.requests_from_multilisting
            else:
                parser = self.parse_listing
        elif self.is_vendor(response):
            parser = self.parse_vendor
        if parser is not None:
            for x in parser(response):
                yield x

def test_parse_items(self):
    status, out, stderr = yield self.execute(
        ['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
    )
    self.assertIn("""[{}, {'foo': 'bar'}]""", to_native_str(out))

def test_spider_arguments(self):
    _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                       '-a', 'test_arg=1',
                                       '-c', 'parse',
                                       self.url('/html')])
    self.assertIn("DEBUG: It Works!", to_native_str(stderr))

def test_runspider_file_not_found(self):
    p = self.proc('runspider', 'some_non_existent_file')
    log = to_native_str(p.stderr.read())
    self.assertIn("File not found: some_non_existent_file", log)

def json_call(hosts, endpoint, method, param=None, files=None):
    res = []
    for h in hosts:
        res.append(
            scrapyd_http_api(to_native_str(h), endpoint, method, param, files))
    return res

def json_call(hosts, endpoint, method, param=None, files=None):
    res = {}
    for h in hosts:
        h = to_native_str(h)
        res[h] = scrapyd_http_api(h, endpoint, method, param, files)
    return res

def response_status_message(status): """Return status code plus status text descriptive message """ message = http.RESPONSES.get(int(status), "Unknown Status") return '%s %s' % (status, to_native_str(message))
def test_request_without_meta(self):
    _, _, stderr = yield self.execute([
        '--spider', self.spider_name,
        '-c', 'parse_request_without_meta',
        self.url('/html')
    ])
    self.assertIn("DEBUG: It Works!", to_native_str(stderr))

def process_request_2(self, rp, request, spider):
    if rp is not None and not rp.can_fetch(
            to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest()

def assertQueryEqual(self, first, second, msg=None):
    first = to_native_str(first).split("&")
    second = to_native_str(second).split("&")
    return self.assertEqual(sorted(first), sorted(second), msg)

def _textmode(bstr):
    """Normalize input the same as writing to a file
    and reading from it in text mode"""
    return to_native_str(bstr).replace(os.linesep, '\n')

def test_runspider_no_spider_found(self):
    p = self.runspider("from scrapy.spiders import Spider\n")
    log = to_native_str(p.stderr.read())
    self.assertIn("No spider found in file", log)

def get_header(self, name, default=None):
    return to_native_str(self.request.headers.get(name, default),
                         errors='replace')

def test_runspider_unable_to_load(self):
    p = self.runspider('', 'myspider.txt')
    log = to_native_str(p.stderr.read())
    self.assertIn('Unable to load', log)

def allowed(self, url, user_agent):
    user_agent = to_native_str(user_agent)
    url = to_native_str(url)
    return self.rp.can_fetch(user_agent, url)

def test_pipelines(self):
    _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                       '--pipelines',
                                       '-c', 'parse',
                                       self.url('/html')])
    self.assertIn("INFO: It Works!", to_native_str(stderr))

def header_items(self):
    return [(to_native_str(k, errors='replace'),
             [to_native_str(x, errors='replace') for x in v])
            for k, v in self.request.headers.items()]

def test_run(self):
    p = self.proc('bench', '-s', 'LOGSTATS_INTERVAL=0.001',
                  '-s', 'CLOSESPIDER_TIMEOUT=0.01')
    log = to_native_str(p.stderr.read())
    self.assertIn('INFO: Crawled', log)
    self.assertNotIn('Unhandled Error', log)

def parse_response(self, response):
    parser = None
    if response.status >= 300 and response.status < 400:
        self.recursive_flag = False
        location = to_native_str(
            response.headers['location'].decode('latin1'))
        request = response.request
        redirected_url = urljoin(request.url, location)
        req = self.make_request(url=redirected_url, dont_filter=True,
                                shared=False)
        if response.meta['reqtype']:
            req.meta['reqtype'] = response.meta['reqtype']
        req.meta['req_once_logged'] = response.meta[
            'req_once_logged'] if 'req_once_logged' in response.meta else response.request
        req.priority = 50
        self.logger.warning(
            "%s: Being redirected from %s to %s. [Priority: %s | Shared: %s]" %
            (self.login['username'], request.url, req.url,
             req.priority, req.meta['shared']))
        yield req
    elif self.islogged(response) is False:
        self.recursive_flag = False
        req_once_logged = response.meta[
            'req_once_logged'] if 'req_once_logged' in response.meta else response.request
        if response.url.endswith(".png") is True or response.meta['reqtype'] == 'image':
            pass
        elif self.is_login_page(response) is True:
            login_error = self.get_text(
                response.xpath(".//div[@class='alert alert-danger']")
            )  # Catch and print the error message.
            if len(login_error) > 0:
                self.logger.warning("Got login error: %s" % login_error)
            self.logger.warning(
                "%s: On login page. Proceeding to log in. Have used %s attempts." %
                (self.login['username'], self.logintrial))
            self.logintrial += 1
            if self.logintrial > self.settings['MAX_LOGIN_RETRY']:
                self.wait_for_input("Too many login failed", req_once_logged)
                self.logintrial = 0
                return
            yield self.make_request(reqtype='dologin', response=response,
                                    dont_filter=True,
                                    req_once_logged=req_once_logged)
        elif self.is_login_page(response) is False:
            self.logger.warning("%s: Going to login page." %
                                self.login['username'])
            self.logintrial = 0
            yield self.make_request(reqtype='loginpage',
                                    req_once_logged=req_once_logged,
                                    dont_filter=True)
    else:
        self.recursive_flag = True
        self.logintrial = 0
        if response.meta['reqtype'] == 'dologin':
            self.logger.info(
                "Successfully logged in as %s! Setting parsing flag." %
                (self.login['username']))
        if self.is_product_page(response) is True:
            parser = self.parse_product
        elif self.is_product_tac_page(response) is True:
            parser = self.parse_product_tac
        elif self.is_product_rating_page(response) is True:
            parser = self.parse_product_rating
        elif self.is_user_page(response) is True:
            parser = self.parse_user
        if parser is not None:
            for x in parser(response):
                yield x

def _update_ip(self):
    self.proxies = [
        to_native_str(i)
        for i in self.server.srandmember(self.key, self.limit)
    ]

def _headers_encoding(self):
    content_type = self.headers.get(b'Content-Type', b'')
    return http_content_type_encoding(to_native_str(content_type))

def ffi_buf_to_string(buf):
    return to_native_str(pyOpenSSLutil.ffi.string(buf))

def get_all(self, name, default=None):
    return [
        to_native_str(v, errors='replace')
        for v in self.response.headers.getlist(name)
    ]