def _set_url(self, url): if isinstance(url, str): self._url = safe_url_string(url) elif isinstance(url, unicode): if self.encoding is None: raise TypeError("Cannot convert unicode url - %s has no encoding" % type(self).__name__) unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding) self._url = safe_url_string(unicode_url, self.encoding) else: raise TypeError("Request url must be str or unicode, got %s:" % type(url).__name__)
def _set_url(self, url):
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError(
                'Cannot convert unicode url - %s has no encoding' % type(self).__name__)
        unicode_url = url if isinstance(url, unicode) else url.decode(
            self.encoding)
        self._url = safe_url_string(unicode_url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
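# Hedged sketch (not part of the original code): the str/unicode split that the
# _set_url variants above rely on, shown directly on safe_url_string. The
# import path below is an assumption -- safe_url_string historically lived in
# scrapy.utils.url and later moved to w3lib.url. The expected values match the
# test_safe_url_string assertions further down.
from w3lib.url import safe_url_string

print safe_url_string('http://www.scrapy.org/')       # a plain str passes through unchanged
print safe_url_string(u'\xa9')                        # unicode is UTF-8 encoded -> '%C2%A9'
print safe_url_string(u'\xa9', 'iso-8859-1')          # explicit encoding -> '%A9'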
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove duplicate query arguments
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is
    always a str.

    This builds on scrapy.utils.url.canonicalize_url to remove duplicate
    arguments.
    """
    scheme, netloc, path, params, query, fragment = parse_url(url)
    keyvals = urlparse.parse_qsl(query, keep_blank_values)
    keyvals = list(set(keyvals))
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(_unquotepath(path)) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is
    always a str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    # strip is added by hewei
    path = safe_url_string(urllib.unquote(path).strip())
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is
    always a str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    # strip is added by hewei
    path = safe_url_string(urllib.unquote(path).strip())
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse(
        (scheme, netloc.lower(), path, params, query, fragment))
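# Hedged usage sketch (not part of the original code): the behaviour the
# canonicalize_url variants above document -- query arguments sorted by key
# then value, blank values kept by default, and the fragment dropped. The call
# refers to the functions defined above and assumes their module-level imports
# (urlparse, urllib, safe_url_string, ...) are available.
print canonicalize_url('http://www.example.com/do?b=2&a=1&c=#frag')
# expected: http://www.example.com/do?a=1&b=2&c=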
def fetch_links(self, document, base_url):
    global fetchlinks
    try:
        for el, attr, attr_val in self._iter_links(document):
            if self._match_element(el.tag, attr):
                if attr_val.find('http://') > 0:
                    url = attr_val[attr_val.find('http://'):]
                elif attr_val.find('https://') > 0:
                    url = attr_val[attr_val.find('https://'):]
                elif attr_val[:2] == '//':
                    url = base_url[:base_url.find('://') + 1] + attr_val
                else:
                    url = urljoin(base_url, attr_val)
                _url = str(url)
                #if isinstance(url, unicode):
                #    url = url.encode(response_encoding, errors='ignore')
                url = escape_ajax(safe_url_string(url))
                n = url.find('#')
                if n != -1:
                    url = url[:n]
                urlmd5 = self.get_md5(url)
                _tag = str(el.tag)
                _attr = str(attr if attr is not None else '')
                _txt = str(el.text if el.text is not None else '')
                fetchlinks[urlmd5] = (_tag, _attr, _txt, _url)
    except AttributeError as e:
        print 'Exception: ', e, base_url
def _make_absolute_urls(self, base_url, encoding): """Makes all request's urls absolute""" for req in self.requests: url = req.url # make absolute url url = urljoin_rfc(base_url, url, encoding) url = safe_url_string(url, encoding) # replace in-place request's url req.url = url
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

    >>> image_url('')
    >>> image_url(' ')
    >>> image_url(' \\n\\n ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding)
        ret.append(link)
    return ret
def _extract_links(self, response_text, response_url, response_encoding): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url for link in self.links: link.url = urljoin_rfc(base_url, link.url, response_encoding) link.url = safe_url_string(link.url, response_encoding) link.text = str_to_unicode(link.text, response_encoding) ret.append(link) return ret
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = self.base_url if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)
    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(
        self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(
        response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)
    return ret
def get_meta_refresh(response): """Parse the http-equiv parameter of the HTML meta element from the given response and return a tuple (interval, url) where interval is an integer containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. If no meta redirect is found, (None, None) is returned. """ if response not in _metaref_cache: body_chunk = remove_comments(remove_entities(response.body_as_unicode()[0:4096])) match = META_REFRESH_RE.search(body_chunk) if match: interval = float(match.group('int')) url = safe_url_string(match.group('url').strip(' "\'')) url = urljoin_rfc(response.url, url) _metaref_cache[response] = (interval, url) else: _metaref_cache[response] = (None, None) #_metaref_cache[response] = match.groups() if match else (None, None) return _metaref_cache[response]
def get_meta_refresh(response): """Parse the http-equiv parameter of the HTML meta element from the given response and return a tuple (interval, url) where interval is an integer containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. If no meta redirect is found, (None, None) is returned. """ if response not in _metaref_cache: body_chunk = remove_comments( remove_entities(response.body_as_unicode()[0:4096])) match = META_REFRESH_RE.search(body_chunk) if match: interval = float(match.group('int')) url = safe_url_string(match.group('url').strip(' "\'')) url = urljoin_rfc(response.url, url) _metaref_cache[response] = (interval, url) else: _metaref_cache[response] = (None, None) #_metaref_cache[response] = match.groups() if match else (None, None) return _metaref_cache[response]
def _extract_links(self, response_text, response_url, response_encoding, base_url=None): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] if base_url is None: base_url = urljoin_rfc( response_url, self.base_url) if self.base_url else response_url for link in self.links: link.url = urljoin_rfc(base_url, link.url, response_encoding) link.url = safe_url_string(link.url, response_encoding) link.text = str_to_unicode(link.text, response_encoding, errors='replace') ret.append(link) return ret
def get_meta_refresh(response): """Parse the http-equiv parameter of the HTML meta element from the given response and return a tuple (interval, url) where interval is an integer containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. If no meta redirect is found, (None, None) is returned. """ if response not in _metaref_cache: body_chunk = remove_comments(remove_entities(response.body_as_unicode()[0:4096])) for match1 in META_TAG_RE.finditer(body_chunk): params = {} for match2 in META_TAG_ATTRS_RE.finditer(match1.group(1)): params[match2.group("key")] = match2.group("value") if params.get("http-equiv") == "refresh": match = META_CONTENT_RE.search(params.get("content", "")) if match: interval = float(match.group("int")) url = urljoin_rfc(response.url, safe_url_string((match.group("url") or "").strip(' "\''))) _metaref_cache[response] = (interval, url) return (interval, url) _metaref_cache[response] = (None, None) return _metaref_cache[response]
def _extract_links(self, response_text, response_url, response_encoding, base_url=None, jsurl=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()
    ret = []
    if base_url is None:
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        link.url = link.url.strip()
        if jsurl:
            lu = link.url
            if lu.startswith('javascript:'):
                g = _re_js_link.search(lu)
                if g:
                    gs = g.groups()
                    link.url = gs[1]
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        try:
            link.text = str_to_unicode(link.text, response_encoding)
        except:
            link.text = None
            log.msg("link text codec error: [%s]" % link.url, level=log.INFO)
        ret.append(link)
    return ret
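# Hedged usage sketch (not part of the original code): how _extract_links
# variants like those above are normally driven. The SgmlLinkExtractor class
# and its import path (scrapy.contrib.linkextractors.sgml in Scrapy of this
# era) are assumptions; its public extract_links() wraps _extract_links with
# the response's text, url and encoding.
from scrapy.http import HtmlResponse
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

resp = HtmlResponse(url='http://example.com/',
                    body='<a href="/next?page=2">next page</a>')
for link in SgmlLinkExtractor().extract_links(resp):
    print link.url, link.text
# expected roughly: http://example.com/next?page=2 next page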
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

    >>> image_url('')
    >>> image_url(' ')
    >>> image_url(' \\n\\n ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    txt = url(txt)
    imgurl = None
    if txt:
        # check if the text is style content
        m = _CSS_IMAGERE.search(txt)
        txt = m.groups()[0] if m else txt
        parsed = urlparse.urlparse(txt)
        path = None
        m = _IMAGE_PATH_RE.search(parsed.path)
        if m:
            path = m.group()
        elif parsed.query:
            m = _GENERIC_PATH_RE.search(parsed.path)
            if m:
                path = m.group()
        if path is not None:
            parsed = list(parsed)
            parsed[2] = path
            imgurl = urlparse.urlunparse(parsed)
        if not imgurl:
            imgurl = txt
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
def test_safe_url_string(self):
    # Motoko Kusanagi (Cyborg from Ghost in the Shell)
    motoko = u'\u8349\u8599 \u7d20\u5b50'
    self.assertEqual(safe_url_string(motoko),  # note the %20 for space
                     '%E8%8D%89%E8%96%99%20%E7%B4%A0%E5%AD%90')
    self.assertEqual(safe_url_string(motoko),
                     safe_url_string(safe_url_string(motoko)))
    self.assertEqual(safe_url_string(u'\xa9'),  # copyright symbol
                     '%C2%A9')
    self.assertEqual(safe_url_string(u'\xa9', 'iso-8859-1'),
                     '%A9')
    self.assertEqual(safe_url_string("http://www.scrapy.org/"),
                     'http://www.scrapy.org/')
    alessi = u'/ecommerce/oggetto/Te \xf2/tea-strainer/1273'
    self.assertEqual(safe_url_string(alessi),
                     '/ecommerce/oggetto/Te%20%C3%B2/tea-strainer/1273')
    self.assertEqual(safe_url_string("http://www.example.com/test?p(29)url(http://www.another.net/page)"),
                     "http://www.example.com/test?p(29)url(http://www.another.net/page)")
    self.assertEqual(safe_url_string("http://www.example.com/Brochures_&_Paint_Cards&PageSize=200"),
                     "http://www.example.com/Brochures_&_Paint_Cards&PageSize=200")
    safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='latin-1')
    self.assert_(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3")
    safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='utf-8')
    self.assert_(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3")
def _make_absolute_urls(self, base_url, encoding): """Makes all request's urls absolute""" self.requests = [x.replace(url=safe_url_string(urljoin_rfc(base_url, \ x.url, encoding), encoding)) for x in self.requests]