Пример #1
0
 def test_requote_url(self):
     # Percent-escapes in the host are decoded; a stray trailing '%' and
     # reserved escapes like %2F must be preserved.
     encoded = 'http://%68%65%2f%6c%6c%6f.com/%'
     self.assertEqual(requote_url(encoded), 'http://he%2Fllo.com/%')
     # Some real-life urls must pass through requoting unchanged.
     for sample in ('mailto:[email protected]?Subject=Registration%Help',
                    'https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl'):
         self.assertEqual(requote_url(sample), sample)
Пример #2
0
 def test_requote_url(self):
     # Hex escapes in the host get decoded while reserved ones are kept.
     self.assertEqual(requote_url('http://%68%65%2f%6c%6c%6f.com/%'),
                      'http://he%2Fllo.com/%')
     # Real-life urls below should come back untouched.
     mailto_url = 'mailto:[email protected]?Subject=Registration%Help'
     self.assertEqual(requote_url(mailto_url), mailto_url)
     maps_url = 'https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl'
     self.assertEqual(requote_url(maps_url), maps_url)
Пример #3
0
 def _extract_links_from_html(self, html, response_encoding):
     """Collect ``Link`` objects from every tag/attribute pair in *html*
     that passes ``tag_func``/``attr_func``; failures are logged, not raised."""
     extracted = []
     for element, attribute, value, _pos in html.iterlinks():
         if not (self.tag_func(element.tag) and self.attr_func(attribute)):
             continue
         try:
             link_url = value
             if isinstance(link_url, unicode):
                 try:
                     link_url = to_str(link_url, response_encoding)
                 except UnicodeEncodeError:
                     # fallback when the page encoding cannot represent the url
                     link_url = to_str(link_url, 'utf-8')
             link_url = requote_url(link_url)
             link_url = correct_relative_path(link_url)
             anchor_text = to_unicode(element.text or u'', 'utf-8')
             nofollow = (element.attrib.get('rel') == 'nofollow')
         except Exception as exc:
             log.msg(
                 format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                 level=log.WARNING, url=html.base_url, etype=type(exc),
                 error=exc)
         else:
             extracted.append(
                 Link(url=link_url, text=anchor_text, nofollow=nofollow))
     return extracted
Пример #4
0
 def base_url(self):
     """Lazily compute and cache the base url, honouring a <base href>
     found in the first 4 KiB of the body."""
     if self._base_url is None:
         resolved = self.url
         match = _base_url_re.search(self.body[:4096])
         if match is not None:
             resolved = urljoin(resolved, match.group(1))
         self._base_url = requote_url(resolved)
     return self._base_url
Пример #5
0
 def base_url(self):
     """Return the cached base url, deriving it from a <base> tag in the
     leading 4 KiB of the decoded text when present."""
     if self._base_url is None:
         resolved = self.url
         found = _base_url_re.search(self.text[:4096])
         if found:
             # re-encode the matched href before joining, as the rest of
             # the pipeline works on byte strings
             resolved = urljoin(resolved, found.group(1).encode(self.encoding))
         self._base_url = requote_url(resolved)
     return self._base_url
Пример #6
0
 def process_response(self, response):
     """Attach the canonical URL, if declared, to ``response.meta``."""
     canonical = None
     # A Link response header may declare the canonical url.
     if 'Link' in response.headers:
         header_match = self.canonical_header_re.search(
             response.headers['link'])
         if header_match:
             canonical = header_match.group(1)
     # A tag in the page head (first 4 KiB) takes precedence over the header.
     if isinstance(response, HtmlResponse):
         tag_match = self.canonical_tag_re.search(response.body[:4096])
         if tag_match:
             canonical = tag_match.group(1)
     if canonical:
         response.meta['canonical_url'] = requote_url(
             urlparse.urljoin(response.base_url, canonical))
     return response
Пример #7
0
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given HTML response.

    Return tuple (interval, url), or (None, None) when no refresh is found.
    """
    # Strip entities plus comment/noscript/script sections from the head.
    chunk = remove_entities(response.text[0:4096])
    for pattern in (html_comment_re, html_noscript_re, html_script_re):
        chunk = pattern.sub(u'', chunk)

    match = _meta_refresh_re.search(chunk)
    if match is None:
        return (None, None)
    interval = float(match.group('int'))
    target = requote_url(
        to_str(match.group('url').strip(' "\''), response.encoding))
    return (interval, urlparse.urljoin(response.url, target))
Пример #8
0
def get_meta_refresh(response):
    """Look for an http-equiv refresh directive in an HTML response.

    Returns a ``(interval, url)`` pair, or ``(None, None)`` when absent.
    """
    head = response.text[0:4096]
    head = remove_entities(head)
    head = html_comment_re.sub(u'', head)
    head = html_noscript_re.sub(u'', head)
    head = html_script_re.sub(u'', head)

    found = _meta_refresh_re.search(head)
    if found is None:
        return (None, None)
    raw_url = found.group('url').strip(' "\'')
    refresh_url = urlparse.urljoin(
        response.url, requote_url(to_str(raw_url, response.encoding)))
    return (float(found.group('int')), refresh_url)
Пример #9
0
 def _extract_links_from_html(self, html, response_encoding):
     """Extract ``Link`` objects from all matching tags in *html*.

     Elements whose url cannot be decoded/requoted are logged and skipped
     rather than aborting the whole extraction.
     """
     links = []
     # FIX: the original named the loop element ``e`` and then bound the
     # caught exception to ``e`` as well, shadowing the element inside the
     # handler (and, under Python 3 semantics, unbinding the name after it).
     for element, attr, attr_val, _pos in html.iterlinks():
         if self.tag_func(element.tag):
             if self.attr_func(attr):
                 try:
                     url = requote_url(
                         to_str(to_unicode(attr_val, 'utf-8'),
                                response_encoding))
                     text = element.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (element.attrib.get('rel') == 'nofollow')
                 except Exception as exc:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url,
                         etype=type(exc), error=exc)
                 else:
                     links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Пример #10
0
    def _prepare_url(self, url, params):
        """Validate *url*, merge *params* into its query string, apply the
        AJAX ``_escaped_fragment_`` convention, and return the requoted url.

        Raises TypeError for a non-string url and ValueError when scheme or
        netloc/path are missing. Also stores the parsed result on
        ``self.parsed_url``.
        """
        if not isinstance(url, basestring):
            raise TypeError('Bad type for `url` object: %s' % type(url))
        url = to_str(url, self._encoding)

        scheme, netloc, path, _params, query, fragment = urlparse(url)
        if not scheme:
            raise ValueError('Invalid URL %s: No schema supplied.' % url)
        if not netloc and not path:
            raise ValueError('Invalid URL %s: No netloc nor path supplied.' %
                             url)

        # Bare domains aren't valid URLs.
        path = path or '/'

        extra_query = self._encode_params(params)
        if extra_query:
            query = '%s&%s' % (query, extra_query) if query else extra_query

        # ajax escaping: '#!frag' becomes a _escaped_fragment_ query param
        if fragment.startswith('!'):
            escaped = requote_ajax(fragment[1:])
            if query:
                query = '%s&_escaped_fragment_=%s' % (query, escaped)
            else:
                query = '_escaped_fragment_=%s' % escaped
            fragment = ''

        quoted = requote_url(
            urlunparse([scheme, netloc, path, _params, query, fragment]))
        self.parsed_url = urlparse(quoted)
        return quoted
Пример #11
0
    def _prepare_url(self, url, params):
        """Build the final request url: coerce to a byte string, validate,
        fold *params* into the query, apply AJAX fragment escaping, then
        requote everything; caches the parse in ``self.parsed_url``."""
        if isinstance(url, basestring):
            url = to_str(url, self._encoding)
        else:
            raise TypeError('Bad type for `url` object: %s' % type(url))

        scheme, netloc, path, _params, query, fragment = urlparse(url)
        if not scheme:
            raise ValueError('Invalid URL %s: No schema supplied.' % url)
        if not netloc and not path:
            raise ValueError('Invalid URL %s: No netloc nor path supplied.' %
                             url)

        if not path:
            # Bare domains aren't valid URLs.
            path = '/'

        encoded = self._encode_params(params)
        if encoded:
            query = '%s&%s' % (query, encoded) if query else encoded

        # ajax escaping (the '#!' _escaped_fragment_ convention)
        if fragment.startswith('!'):
            fragment = requote_ajax(fragment[1:])
            prefix = ('%s&' % query) if query else ''
            query = '%s_escaped_fragment_=%s' % (prefix, fragment)
            fragment = ''

        quoted = requote_url(
            urlunparse([scheme, netloc, path, _params, query, fragment]))
        self.parsed_url = urlparse(quoted)
        return quoted
Пример #12
0
 def _extract_links_from_html(self, html, response_encoding):
     """Extract ``Link`` objects (with relative-path correction) from all
     matching tags in *html*; problem links are logged and skipped.
     """
     links = []
     # FIX: the original used ``e`` both for the loop element and for the
     # caught exception (``except Exception as e``), shadowing the element
     # inside the handler — renamed to keep the two distinct.
     for element, attr, attr_val, _pos in html.iterlinks():
         if self.tag_func(element.tag):
             if self.attr_func(attr):
                 try:
                     url = requote_url(
                         to_str(to_unicode(attr_val, 'utf-8'),
                                response_encoding))
                     url = correct_relative_path(url)
                     text = element.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (element.attrib.get('rel') == 'nofollow')
                 except Exception as exc:
                     log.msg(
                         format=
                         'Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING,
                         url=html.base_url,
                         etype=type(exc),
                         error=exc)
                 else:
                     links.append(
                         Link(url=url, text=text, nofollow=nofollow))
     return links