def test_requote_url(self):
    """requote_url must normalise bad percent-escapes yet leave
    already-valid (or deliberately odd) URL text untouched."""
    # hex escapes in the host get decoded; a trailing bare '%' survives
    self.assertEqual(
        requote_url('http://%68%65%2f%6c%6c%6f.com/%'),
        'http://he%2Fllo.com/%')
    # test some real-life urls: these must round-trip unchanged
    self.assertEqual(
        requote_url('mailto:[email protected]?Subject=Registration%Help'),
        'mailto:[email protected]?Subject=Registration%Help')
    self.assertEqual(
        requote_url('https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl'),
        'https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl')
def test_requote_url(self):
    """Table-driven check of requote_url on escaped and real-life URLs."""
    cases = [
        # hex escapes decoded, dangling '%' preserved
        ('http://%68%65%2f%6c%6c%6f.com/%',
         'http://he%2Fllo.com/%'),
        # test some real-life urls — expected to pass through unchanged
        ('mailto:[email protected]?Subject=Registration%Help',
         'mailto:[email protected]?Subject=Registration%Help'),
        ('https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl',
         'https://maps.google.nl/maps?q=Eekholt+4,+Diemen&%u205Ehl=nl'),
    ]
    for raw, expected in cases:
        self.assertEqual(requote_url(raw), expected)
def _extract_links_from_html(self, html, response_encoding):
    """Collect Link objects from an lxml document.

    Iterates every (element, attribute, value, position) tuple that
    lxml's iterlinks() yields, keeps only those accepted by both
    self.tag_func and self.attr_func, and converts each into a Link.
    Errors on a single link are logged and that link is skipped.
    """
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = attr_val
                    if isinstance(url, unicode):
                        try:
                            # prefer the response's own declared encoding
                            url = to_str(url, response_encoding)
                        except UnicodeEncodeError:
                            # fallback
                            # (URL contains chars outside response_encoding;
                            # utf-8 can represent anything)
                            url = to_str(url, 'utf-8')
                    url = requote_url(url)
                    url = correct_relative_path(url)
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    # best-effort: one bad link must not abort extraction
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url, etype=type(e),
                        error=e)
                else:
                    # only reached when no exception occurred above
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def base_url(self):
    """Return the document base URL, computed lazily and cached.

    Starts from self.url; if a <base href> appears in the first 4KB of
    the body, the base is resolved against it. The result is requoted.
    """
    if self._base_url is None:
        base = self.url
        # a <base> tag is only honoured if it appears near the top
        head = self.body[:4096]
        match = _base_url_re.search(head)
        if match:
            base = urljoin(base, match.group(1))
        self._base_url = requote_url(base)
    return self._base_url
def base_url(self):
    """Return the document base URL (lazy, cached).

    Like the body-based variant but scans self.text and re-encodes the
    matched href with the response's declared encoding before joining.
    """
    if self._base_url is None:
        base = self.url
        head = self.text[:4096]
        match = _base_url_re.search(head)
        if match:
            # text is unicode; encode the href back to bytes so the
            # join operates on str in this (py2-style) codebase
            base = urljoin(base, match.group(1).encode(self.encoding))
        self._base_url = requote_url(base)
    return self._base_url
def process_response(self, response):
    """Attach the page's canonical URL, if any, to response.meta.

    Looks first at the HTTP ``Link`` header, then (for HTML responses)
    at a <link rel="canonical"> tag in the first 4KB of the body — the
    tag wins when both are present. The resolved, requoted absolute URL
    is stored under response.meta['canonical_url'].
    """
    canonical_url = None
    if 'Link' in response.headers:
        # BUG fix: the membership test used 'Link' but the lookup used
        # 'link' — a KeyError on any case-sensitive mapping. Use the
        # same key for both (no-op if headers are case-insensitive).
        m = self.canonical_header_re.search(response.headers['Link'])
        if m:
            canonical_url = m.group(1)
    if isinstance(response, HtmlResponse):
        # only scan the head of the document for the canonical tag
        m = self.canonical_tag_re.search(response.body[:4096])
        if m:
            canonical_url = m.group(1)
    if canonical_url:
        response.meta['canonical_url'] = requote_url(
            urlparse.urljoin(response.base_url, canonical_url))
    return response
def get_meta_refresh(response):
    '''Parse the http-equiv refresh parameter from the given HTML response.
    Return tuple (interval, url).'''
    # strip entities, then comments/noscript/script, so the regex only
    # sees markup a browser would actually act on
    head = remove_entities(response.text[0:4096])
    for pattern in (html_comment_re, html_noscript_re, html_script_re):
        head = pattern.sub(u'', head)
    match = _meta_refresh_re.search(head)
    if not match:
        return (None, None)
    interval = float(match.group('int'))
    target = to_str(match.group('url').strip(' "\''), response.encoding)
    return (interval, urlparse.urljoin(response.url, requote_url(target)))
def get_meta_refresh(response):
    '''Parse the http-equiv refresh parameter from the given HTML response.
    Return tuple (interval, url).'''
    text = remove_entities(response.text[0:4096])
    # comments, noscript and script content cannot trigger a refresh
    text = html_comment_re.sub(u'', text)
    text = html_noscript_re.sub(u'', text)
    text = html_script_re.sub(u'', text)
    m = _meta_refresh_re.search(text)
    if m is None:
        return (None, None)
    raw_url = m.group('url').strip(' "\'')
    absolute = urlparse.urljoin(
        response.url, requote_url(to_str(raw_url, response.encoding)))
    return (float(m.group('int')), absolute)
def _extract_links_from_html(self, html, response_encoding):
    """Collect Link objects from an lxml document.

    Keeps only links whose tag passes self.tag_func and whose attribute
    passes self.attr_func; failures on individual links are logged and
    that link is skipped.
    """
    links = []
    for element, attr, attr_value, _pos in html.iterlinks():
        if not self.tag_func(element.tag):
            continue
        if not self.attr_func(attr):
            continue
        try:
            url = requote_url(
                to_str(to_unicode(attr_value, 'utf-8'), response_encoding))
            text = to_unicode(element.text or u'', 'utf-8')
            nofollow = (element.attrib.get('rel') == 'nofollow')
        except Exception as exc:
            # BUG fix: the original bound both the loop element AND the
            # caught exception to `e`, clobbering the element reference
            # (and, under py3 semantics, deleting it after the handler).
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(exc),
                error=exc)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def _prepare_url(self, url, params):
    """Normalise ``url``, merge ``params`` into its query string, apply
    AJAX fragment escaping, and return the requoted result.

    Side effect: stores the parsed final URL in self.parsed_url.
    Raises TypeError for non-string urls and ValueError for urls
    missing a scheme or both netloc and path.
    """
    if not isinstance(url, basestring):
        raise TypeError('Bad type for `url` object: %s' % type(url))
    url = to_str(url, self._encoding)

    scheme, netloc, path, _params, query, fragment = urlparse(url)
    if not scheme:
        raise ValueError('Invalid URL %s: No schema supplied.' % url)
    if not netloc and not path:
        raise ValueError('Invalid URL %s: No netloc nor path supplied.' % url)
    # Bare domains aren't valid URLs.
    path = path or '/'

    enc_params = self._encode_params(params)
    if enc_params:
        query = '%s&%s' % (query, enc_params) if query else enc_params

    # ajax escaping: '#!frag' moves into the query as _escaped_fragment_
    if fragment.startswith('!'):
        fragment = requote_ajax(fragment[1:])
        if query:
            query = '%s&_escaped_fragment_=%s' % (query, fragment)
        else:
            query = '_escaped_fragment_=%s' % fragment
        fragment = ''

    quoted = requote_url(
        urlunparse([scheme, netloc, path, _params, query, fragment]))
    self.parsed_url = urlparse(quoted)
    return quoted
def _prepare_url(self, url, params):
    """Build the final request URL.

    Encodes ``params`` onto the query string, performs AJAX
    '#!'-fragment escaping, requotes the assembled URL, and caches the
    parse in self.parsed_url before returning the string.
    """
    if isinstance(url, basestring):
        url = to_str(url, self._encoding)
    else:
        raise TypeError('Bad type for `url` object: %s' % type(url))

    scheme, netloc, path, _params, query, fragment = urlparse(url)
    if not scheme:
        raise ValueError('Invalid URL %s: No schema supplied.' % url)
    if not netloc and not path:
        raise ValueError('Invalid URL %s: No netloc nor path supplied.' % url)
    if not path:
        # Bare domains aren't valid URLs.
        path = '/'

    extra = self._encode_params(params)
    if extra:
        if query:
            query = '%s&%s' % (query, extra)
        else:
            query = extra

    # ajax escaping
    if fragment.startswith('!'):
        escaped = requote_ajax(fragment[1:])
        if query:
            query = '%s&_escaped_fragment_=%s' % (query, escaped)
        else:
            query = '_escaped_fragment_=%s' % escaped
        fragment = ''

    assembled = urlunparse([scheme, netloc, path, _params, query, fragment])
    quoted = requote_url(assembled)
    self.parsed_url = urlparse(quoted)
    return quoted
def _extract_links_from_html(self, html, response_encoding):
    """Collect Link objects from an lxml document.

    Like the sibling extractors, but additionally normalises relative
    paths via correct_relative_path(). Errors on a single link are
    logged and that link is skipped.
    """
    links = []
    for element, attr, attr_value, _pos in html.iterlinks():
        if not self.tag_func(element.tag):
            continue
        if not self.attr_func(attr):
            continue
        try:
            url = requote_url(
                to_str(to_unicode(attr_value, 'utf-8'), response_encoding))
            url = correct_relative_path(url)
            text = to_unicode(element.text or u'', 'utf-8')
            nofollow = (element.attrib.get('rel') == 'nofollow')
        except Exception as exc:
            # BUG fix: the original used `e` both as the loop element and
            # as the caught exception, clobbering the element reference.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(exc),
                error=exc)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links