Example #1
 def test_spider_with_no_rules_attribute(self):
     """Using -r with a spider with no rule should not produce items."""
     status, out, stderr = yield self.execute(
         ['--spider', self.spider_name, '-r', self.url('/html')]
     )
     self.assertRegexpMatches(to_native_str(out), """# Scraped Items  -+\n\[\]""")
     self.assertIn("""No CrawlSpider rules found""", to_native_str(stderr))
Example #2
 def test_crawlspider_no_matching_rule(self):
     """The requested URL has no matching rule, so no items should be scraped"""
     status, out, stderr = yield self.execute(
         ['--spider', 'badcrawl'+self.spider_name, '-r', self.url('/enc-gb18030')]
     )
     self.assertRegexpMatches(to_native_str(out), """# Scraped Items  -+\n\[\]""")
     self.assertIn("""Cannot find a rule that matches""", to_native_str(stderr))
Example #3
 def header_items(self):
     return [
         (to_native_str(k, errors='replace'),
          [to_native_str(x, errors='replace') for x in v])
         for k, v in self.request.headers.items()
     ]
Example #4
 def test_template(self, tplname="crawl"):
     args = ["--template=%s" % tplname] if tplname else []
     spname = "test_spider"
     p = self.proc("genspider", spname, "test.com", *args)
     out = to_native_str(retry_on_eintr(p.stdout.read))
     self.assertIn("Created spider %r using template %r in module" % (spname, tplname), out)
     self.assertTrue(exists(join(self.proj_mod_path, "spiders", "test_spider.py")))
     p = self.proc("genspider", spname, "test.com", *args)
     out = to_native_str(retry_on_eintr(p.stdout.read))
     self.assertIn("Spider %r already exists in module" % spname, out)
Example #5
 def test_template(self, tplname='crawl'):
     args = ['--template=%s' % tplname] if tplname else []
     spname = 'test_spider'
     p = self.proc('genspider', spname, 'test.com', *args)
     out = to_native_str(retry_on_eintr(p.stdout.read))
     self.assertIn("Created spider %r using template %r in module" % (spname, tplname), out)
     self.assertTrue(exists(join(self.proj_mod_path, 'spiders', 'test_spider.py')))
     p = self.proc('genspider', spname, 'test.com', *args)
     out = to_native_str(retry_on_eintr(p.stdout.read))
     self.assertIn("Spider %r already exists in module" % spname, out)
Example #6
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #7
    def test_streaming_args(self):
        path = os.path.abspath(os.path.dirname(__file__))
        test1 = os.path.join(path, 'spiders', 'sample1.py')
        p = self.proc('streaming', 'python', '-a', test1)
        log = to_native_str(p.stderr.read())

        self.assertIn('sample1.py working', log)
Example #8
def request_from_dict(d, spider=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and spider:
        cb = _get_method(spider, cb)
    eb = d['errback']
    if eb and spider:
        eb = _get_method(spider, eb)
    request_cls = load_object(d['_class']) if '_class' in d else Request
    return request_cls(
        url=to_native_str(d['url']),
        callback=cb,
        errback=eb,
        method=d['method'],
        headers=d['headers'],
        body=d['body'],
        cookies=d['cookies'],
        meta=d['meta'],
        encoding=d['_encoding'],
        priority=d['priority'],
        dont_filter=d['dont_filter'],
        flags=d.get('flags'))
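
The docstring above mentions resolving callbacks by looking them up on the spider by name. Below is a minimal round-trip sketch of that behaviour, not taken from any of the projects listed here; it assumes Scrapy 1.x, where request_to_dict and request_from_dict live in scrapy.utils.reqser.

import scrapy
from scrapy.utils.reqser import request_to_dict, request_from_dict

class DemoSpider(scrapy.Spider):
    name = 'demo'

    def parse(self, response):
        pass

spider = DemoSpider()
req = scrapy.Request('http://example.com', callback=spider.parse)

d = request_to_dict(req, spider=spider)         # callback is serialized by name: 'parse'
restored = request_from_dict(d, spider=spider)  # the name is resolved back to spider.parse

assert restored.url == req.url
assert restored.callback == spider.parse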
Example #9
 def assertExportedJsonLines(self, items, rows, settings=None):
     settings = settings or {}
     settings.update({'FEED_FORMAT': 'jl'})
     data = yield self.exported_data(items, settings)
     parsed = [json.loads(to_native_str(line)) for line in data.splitlines()]
     rows = [{k: v for k, v in row.items() if v} for row in rows]
     self.assertEqual(rows, parsed)
Example #10
 def _extract_links(self, selector, response_url, response_encoding, base_url):
     '''
     Pretty much the same function, just added 'ignore' to to_native_str()
     '''
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         # added 'ignore' to encoding errors
         url = to_native_str(url, encoding=response_encoding,
                             errors='ignore')
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url, _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
Example #11
 def from_content_disposition(self, content_disposition):
     try:
         filename = to_native_str(content_disposition).split(';')[1].split('=')[1]
         filename = filename.strip('"\'')
         return self.from_filename(filename)
     except IndexError:
         return Response
Example #12
 def from_content_type(self, content_type, content_encoding=None):
     """Return the most appropriate Response class from an HTTP Content-Type
     header """
     if content_encoding:
         return Response
     mimetype = to_native_str(content_type).split(';')[0].strip().lower()
     return self.from_mimetype(mimetype)
Example #13
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_native_str(url, encoding))
Example #14
    def test_unicode_url(self):
        # instantiate with unicode url without encoding (should set default encoding)
        resp = self.response_class(u"http://www.example.com/")
        self._assert_response_encoding(resp, self.response_class._DEFAULT_ENCODING)

        # make sure urls are converted to str
        resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
        assert isinstance(resp.url, str)

        resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='utf-8')
        self.assertEqual(resp.url, to_native_str(b'http://www.example.com/price/\xc2\xa3'))
        resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='latin-1')
        self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
        resp = self.response_class(u"http://www.example.com/price/\xa3", headers={"Content-type": ["text/html; charset=utf-8"]})
        self.assertEqual(resp.url, to_native_str(b'http://www.example.com/price/\xc2\xa3'))
        resp = self.response_class(u"http://www.example.com/price/\xa3", headers={"Content-type": ["text/html; charset=iso-8859-1"]})
        self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
Example #15
 def _debug_cookie(self, request, spider):
     if self.debug:
         cl = [to_native_str(c, errors='replace')
               for c in request.headers.getlist('Cookie')]
         if cl:
             cookies = "\n".join("Cookie: {0}\n".format(c) for c in cl)
             msg = "Sending cookies to: {0}\n{1}".format(request, cookies)
             logger.debug(msg, extra={'spider': spider})
Example #16
def test_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'url': url}}
    })
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
Example #17
 def _debug_set_cookie(self, response, spider):
     if self.debug:
         cl = [to_native_str(c, errors='replace')
               for c in response.headers.getlist('Set-Cookie')]
         if cl:
             cookies = "\n".join("Set-Cookie: {0}\n".format(c) for c in cl)
             msg = "Received cookies from: {0}\n{1}".format(response, cookies)
             logger.debug(msg, extra={'spider': spider})
Example #18
 def from_content_disposition(self, content_disposition):
     try:
         filename = to_native_str(content_disposition,
             encoding='latin-1', errors='replace').split(';')[1].split('=')[1]
         filename = filename.strip('"\'')
         return self.from_filename(filename)
     except IndexError:
         return Response
Example #19
 def _set_url(self, url):
     if isinstance(url, six.text_type):
         if six.PY2 and self.encoding is None:
             raise TypeError("Cannot convert unicode url - %s "
                             "has no encoding" % type(self).__name__)
         self._url = to_native_str(url, self.encoding)
     else:
         super(TextResponse, self)._set_url(url)
Example #20
    def _set_url(self, url):
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        url = to_native_str(url.strip(), self.encoding)
        self._url = escape_ajax(safe_url_string(url))

        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: %s' % self._url)
Example #21
 def test_runspider_unable_to_load(self):
     tmpdir = self.mktemp()
     os.mkdir(tmpdir)
     fname = abspath(join(tmpdir, 'myspider.txt'))
     with open(fname, 'w') as f:
         f.write("")
     p = self.proc('runspider', fname)
     log = to_native_str(p.stderr.read())
     self.assertIn("Unable to load", log)
Example #22
 def _body_inferred_encoding(self):
     if self._cached_benc is None:
         content_type = to_native_str(self.headers.get(b'Content-Type', b''))
         benc, ubody = html_to_unicode(content_type, self.body,
                 auto_detect_fun=self._auto_detect_fun,
                 default_encoding=self._DEFAULT_ENCODING)
         self._cached_benc = benc
         self._cached_ubody = ubody
     return self._cached_benc
Example #23
def test_float_wait_arg():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.html',
            'args': {'wait': 0.5}
        }
    })
    req = mw.process_request(req1, None)
    assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}
Example #24
def response_status_message(status):
    """Return status code plus status text descriptive message

    >>> response_status_message(200)
    '200 OK'

    >>> response_status_message(404)
    '404 Not Found'
    """
    return '%s %s' % (status, to_native_str(http.RESPONSES.get(int(status))))
Example #25
def test_override_splash_url():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.png',
            'splash_url': 'http://splash.example.com'
        }
    })
    req = mw.process_request(req1, None)
    assert req.url == 'http://splash.example.com/render.png'
    assert json.loads(to_native_str(req.body)) == {'url': req1.url}
Example #26
    def test_runspider_no_spider_found(self):
        tmpdir = self.mktemp()
        os.mkdir(tmpdir)
        fname = abspath(join(tmpdir, 'myspider.py'))
        with open(fname, 'w') as f:
            f.write("""
from scrapy.spiders import Spider
""")
        p = self.proc('runspider', fname)
        log = to_native_str(p.stderr.read())
        self.assertIn("No spider found in file", log)
Example #27
    def test_startproject_template_override(self):
        copytree(join(scrapy.__path__[0], 'templates'), self.tmpl)
        os.mknod(join(self.tmpl_proj, 'root_template'))
        assert exists(join(self.tmpl_proj, 'root_template'))

        args = ['--set', 'TEMPLATES_DIR=%s' % self.tmpl]
        p = self.proc('startproject', self.project_name, *args)
        out = to_native_str(retry_on_eintr(p.stdout.read))
        self.assertIn("New Scrapy project %r, using template directory %r, created in:" % \
                      (self.project_name, join(self.tmpl, 'project')), out)
        assert exists(join(self.proj_path, 'root_template'))
Example #28
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #29
 def test_from_response_unicode_clickdata(self):
     response = _buildresponse(
         u"""<form action="get.php" method="GET">
         <input type="submit" name="price in \u00a3" value="\u00a3 1000">
         <input type="submit" name="price in \u20ac" value="\u20ac 2000">
         <input type="hidden" name="poundsign" value="\u00a3">
         <input type="hidden" name="eurosign" value="\u20ac">
         </form>""")
     req = self.request_class.from_response(response, \
             clickdata={'name': u'price in \u00a3'})
     fs = _qs(req)
     self.assertTrue(fs[to_native_str(u'price in \u00a3')])
Example #30
def _qs(req, encoding='utf-8', to_unicode=False):
    if req.method == 'POST':
        qs = req.body
    else:
        qs = req.url.partition('?')[2]
    if six.PY2:
        uqs = unquote(to_native_str(qs, encoding))
    elif six.PY3:
        uqs = unquote_to_bytes(qs)
    if to_unicode:
        uqs = uqs.decode(encoding)
    return parse_qs(uqs, True)
Example #31
def request_from_dict(d, spider=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and spider:
        cb = _get_method(spider, cb)
    eb = d['errback']
    if eb and spider:
        eb = _get_method(spider, eb)
    request_cls = load_object(d['_class']) if '_class' in d else Request
    return request_cls(url=to_native_str(d['url']),
                       callback=cb,
                       errback=eb,
                       method=d['method'],
                       headers=d['headers'],
                       body=d['body'],
                       cookies=d['cookies'],
                       meta=d['meta'],
                       encoding=d['_encoding'],
                       priority=d['priority'],
                       dont_filter=d['dont_filter'])
Example #32
    def _parse_robots(self, response, netloc):
        self.crawler.stats.inc_value('robotstxt/response_count')
        self.crawler.stats.inc_value(
            'robotstxt/response_status_count/{}'.format(response.status))
        rp = robotparser.RobotFileParser(response.url)
        body = ''
        if hasattr(response, 'text'):
            body = response.text
        else:  # last effort try
            try:
                body = response.body.decode('utf-8')
            except UnicodeDecodeError:
                # If we found garbage, disregard it,
                # but keep the lookup cached (in self._parsers)
                # Running rp.parse() will set rp state from
                # 'disallow all' to 'allow any'.
                self.crawler.stats.inc_value('robotstxt/unicode_error_count')
        # stdlib's robotparser expects native 'str' ;
        # with unicode input, non-ASCII encoded bytes decoding fails in Python2
        rp.parse(to_native_str(body).splitlines())

        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)
Example #33
    def policy(self, resp_or_url, request):
        """
        Determine Referrer-Policy to use from a parent Response (or URL),
        and a Request to be sent.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error),
          the policy from settings is used
        - if the policy is not set in Request meta,
          but there is a Referrer-policy header in the parent response,
          it is used if valid
        - otherwise, the policy from settings is used.
        """
        policy_name = request.meta.get('referrer_policy')
        if policy_name is None:
            if isinstance(resp_or_url, Response):
                policy_header = resp_or_url.headers.get('Referrer-Policy')
                if policy_header is not None:
                    policy_name = to_native_str(policy_header.decode('latin1'))
        if policy_name is None:
            return self.default_policy()

        cls = _load_policy_class(policy_name, warning_only=True)
        return cls() if cls else self.default_policy()
Example #34
    def file_path(self, request, response=None, info=None):
        start_time = self._get_start_time(info.spider)
        start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
        content_type = ''
        if response:
            # This is to cover the case when the url has . after the last /
            # and the text after the . is not a file extension but the response is a json
            content_type = to_native_str(response.headers['Content-Type'])
        url = request.url
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]

        if hasattr(info.spider, 'ext'):
            media_ext = info.spider.ext
        elif not media_ext or ('json' in content_type
                               and media_ext != '.json'):
            media_ext = '.json'
        # Put files in a directory named after the scraper they came from, and the scraper starttime
        if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
            return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str,
                                          media_guid, media_ext)
        else:
            return '%s/%s/%s%s' % (info.spider.name, start_time_str,
                                   media_guid, media_ext)
Example #35
 def _extract_links(self, selector, response_url, response_encoding,
                    base_url):
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             if self.strip:
                 attr_val = strip_html5_whitespace(attr_val)
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         url = to_native_str(url, encoding=response_encoding)
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url,
                     _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
Example #36
 def _build_row(self, values):
     for s in values:
         try:
             yield to_native_str(s, self.encoding)
         except TypeError:
             yield s
Example #37
def referer_str(request):
    """ Return Referer HTTP header suitable for logging. """
    referrer = request.headers.get('Referer')
    if referrer is None:
        return referrer
    return to_native_str(referrer, errors='replace')
Example #38
    def parse_response(self, response):
        parser = None
        if response.status >= 300 and response.status < 400:
            self.recursive_flag = False
            location = to_native_str(
                response.headers['location'].decode('latin1'))
            request = response.request
            redirected_url = urljoin(request.url, location)
            req = self.make_request(reqtype='regular',
                                    url=redirected_url,
                                    dont_filter=True,
                                    shared=False)
            if response.meta['reqtype']:
                req.meta['reqtype'] = response.meta['reqtype']
            req.meta['req_once_logged'] = response.meta[
                'req_once_logged'] if 'req_once_logged' in response.meta else response.request
            req.priority = 50
            self.logger.warning(
                "%s: Being redirected from %s to %s" %
                (self.login['username'], request.url, redirected_url))
            yield req

        elif self.islogged(response) is False:
            self.recursive_flag = False
            # DDoS and login handling block.
            req_once_logged = response.meta[
                'req_once_logged'] if 'req_once_logged' in response.meta else response.request
            if self.is_login_page(response) is True:
                self.logger.warning("On login page. Proceeding to log in.")
                self.logintrial += 1
                if self.logintrial > self.settings['MAX_LOGIN_RETRY']:
                    self.wait_for_input("Too many login failed",
                                        req_once_logged)
                    self.logintrial = 0
                    return
                yield self.make_request(reqtype='dologin',
                                        response=response,
                                        dont_filter=True,
                                        req_once_logged=req_once_logged,
                                        shared=False)
            elif self.islogged(response) is False:
                self.logger.warning("Going to login page.")
                self.captcha_trial = 0
                self.logintrial = 0
                yield self.make_request(reqtype='loginpage',
                                        req_once_logged=req_once_logged,
                                        dont_filter=True,
                                        shared=False)
            else:
                self.logger.warning(
                    'DDoS/Login-block: This is not supposed to happen. HTML %s'
                    % response.body)

        else:
            self.recursive_flag = True
            self.captcha_trial = 0
            self.logintrial = 0
            if response.meta['reqtype'] == 'dologin':
                self.logger.info(
                    "Succesfully logged in as %s! Setting parsing flag and returning to stored request %s"
                    %
                    (self.login['username'], response.meta['req_once_logged']))
                yield response.meta['req_once_logged']

            if self.is_listing_page(response):
                if self.is_multilisting(response):
                    parser = self.requests_from_multilisting
                else:
                    parser = self.parse_listing
            elif self.is_vendor(response):
                parser = self.parse_vendor
            if parser is not None:
                for x in parser(response):
                    yield x
Example #39
 def test_parse_items(self):
     status, out, stderr = yield self.execute(
         ['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
     )
     self.assertIn("""[{}, {'foo': 'bar'}]""", to_native_str(out))
Example #40
 def test_spider_arguments(self):
     _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                        '-a', 'test_arg=1',
                                        '-c', 'parse',
                                        self.url('/html')])
     self.assertIn("DEBUG: It Works!", to_native_str(stderr))
Example #41
 def test_runspider_file_not_found(self):
     p = self.proc('runspider', 'some_non_existent_file')
     log = to_native_str(p.stderr.read())
     self.assertIn("File not found: some_non_existent_file", log)
Example #42
def json_call(hosts, endpoint, method, param=None, files=None):
    res = []
    for h in hosts:
        res.append(
            scrapyd_http_api(to_native_str(h), endpoint, method, param, files))
    return res
Example #43
def json_call(hosts, endpoint, method, param=None, files=None):
    res = {}
    for h in hosts:
        h = to_native_str(h)
        res[h] = scrapyd_http_api(h, endpoint, method, param, files)
    return res
Example #44
def response_status_message(status):
    """Return status code plus status text descriptive message
    """
    message = http.RESPONSES.get(int(status), "Unknown Status")
    return '%s %s' % (status, to_native_str(message))
Example #45
 def test_request_without_meta(self):
     _, _, stderr = yield self.execute([
         '--spider', self.spider_name, '-c', 'parse_request_without_meta',
         self.url('/html')
     ])
     self.assertIn("DEBUG: It Works!", to_native_str(stderr))
Example #46
 def process_request_2(self, rp, request, spider):
     if rp is not None and not rp.can_fetch(
              to_native_str(self._useragent), request.url):
         logger.debug("Forbidden by robots.txt: %(request)s",
                      {'request': request}, extra={'spider': spider})
         raise IgnoreRequest()
Example #47
 def assertQueryEqual(self, first, second, msg=None):
     first = to_native_str(first).split("&")
     second = to_native_str(second).split("&")
     return self.assertEqual(sorted(first), sorted(second), msg)
Example #48
def _textmode(bstr):
    """Normalize input the same as writing to a file
    and reading from it in text mode"""
    return to_native_str(bstr).replace(os.linesep, '\n')
Example #49
 def test_runspider_no_spider_found(self):
     p = self.runspider("from scrapy.spiders import Spider\n")
     log = to_native_str(p.stderr.read())
     self.assertIn("No spider found in file", log)
Example #50
 def get_header(self, name, default=None):
     return to_native_str(self.request.headers.get(name, default),
                          errors='replace')
Example #51
 def test_runspider_unable_to_load(self):
     p = self.runspider('', 'myspider.txt')
     log = to_native_str(p.stderr.read())
     self.assertIn('Unable to load', log)
Example #52
 def allowed(self, url, user_agent):
     user_agent = to_native_str(user_agent)
     url = to_native_str(url)
     return self.rp.can_fetch(user_agent, url)
Example #53
 def test_pipelines(self):
     _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                        '--pipelines',
                                        '-c', 'parse',
                                        self.url('/html')])
     self.assertIn("INFO: It Works!", to_native_str(stderr))
Example #54
 def header_items(self):
     return [(to_native_str(k, errors='replace'),
              [to_native_str(x, errors='replace') for x in v])
             for k, v in self.request.headers.items()]
Example #55
 def test_run(self):
     p = self.proc('bench', '-s', 'LOGSTATS_INTERVAL=0.001',
             '-s', 'CLOSESPIDER_TIMEOUT=0.01')
     log = to_native_str(p.stderr.read())
     self.assertIn('INFO: Crawled', log)
     self.assertNotIn('Unhandled Error', log)
Example #56
    def parse_response(self, response):
        parser = None
        if response.status >= 300 and response.status < 400:
            self.recursive_flag = False
            location = to_native_str(
                response.headers['location'].decode('latin1'))
            request = response.request
            redirected_url = urljoin(request.url, location)
            req = self.make_request(url=redirected_url,
                                    dont_filter=True,
                                    shared=False)
            if response.meta['reqtype']:
                req.meta['reqtype'] = response.meta['reqtype']
            req.meta['req_once_logged'] = response.meta[
                'req_once_logged'] if 'req_once_logged' in response.meta else response.request
            req.priority = 50
            self.logger.warning(
                "%s: Being redirected from %s to %s. [Priority: %s | Shared: %s]"
                % (self.login['username'], request.url, req.url, req.priority,
                   req.meta['shared']))
            yield req
        elif self.islogged(response) is False:
            self.recursive_flag = False
            req_once_logged = response.meta[
                'req_once_logged'] if 'req_once_logged' in response.meta else response.request
            if response.url.endswith(
                    ".png") is True or response.meta['reqtype'] == 'image':
                pass
            elif self.is_login_page(response) is True:
                login_error = self.get_text(
                    response.xpath(".//div[@class='alert alert-danger']")
                )  # Catch and print the error message.

                if len(login_error) > 0:
                    self.logger.warning("Got login error: %s" % login_error)

                self.logger.warning(
                    "%s: On login page. Proceeding to log in. Have used %s attempts."
                    % (self.login['username'], self.logintrial))
                self.logintrial += 1

                if self.logintrial > self.settings['MAX_LOGIN_RETRY']:
                    self.wait_for_input("Too many login failed",
                                        req_once_logged)
                    self.logintrial = 0
                    return

                yield self.make_request(reqtype='dologin',
                                        response=response,
                                        dont_filter=True,
                                        req_once_logged=req_once_logged)

            elif self.is_login_page(response) is False:
                self.logger.warning("%s: Going to login page." %
                                    self.login['username'])
                self.logintrial = 0

                yield self.make_request(reqtype='loginpage',
                                        req_once_logged=req_once_logged,
                                        dont_filter=True)
        else:
            self.recursive_flag = True
            self.logintrial = 0
            if response.meta['reqtype'] == 'dologin':
                self.logger.info(
                    "Succesfully logged in as %s! Setting parsing flag." %
                    (self.login['username']))

            if self.is_product_page(response) is True:
                parser = self.parse_product
            elif self.is_product_tac_page(response) is True:
                parser = self.parse_product_tac
            elif self.is_product_rating_page(response) is True:
                parser = self.parse_product_rating
            elif self.is_user_page(response) is True:
                parser = self.parse_user

            if parser is not None:
                for x in parser(response):
                    yield x
Example #57
 def _update_ip(self):
     self.proxies = [
         to_native_str(i)
         for i in self.server.srandmember(self.key, self.limit)
     ]
Example #58
 def _headers_encoding(self):
     content_type = self.headers.get(b'Content-Type', b'')
     return http_content_type_encoding(to_native_str(content_type))
Example #59
def ffi_buf_to_string(buf):
    return to_native_str(pyOpenSSLutil.ffi.string(buf))
Example #60
 def get_all(self, name, default=None):
     return [
         to_native_str(v, errors='replace')
         for v in self.response.headers.getlist(name)
     ]