def test_get_base_url_empty_basehref(self):
    """An empty ``href`` on the <base> tag makes the page URL the base URL."""
    # Adjacent literals are concatenated at compile time; this replaces a
    # backslash continuation that left a stray "\ " inside the markup.
    html = (
        u'<html><head><base href="" />'
        u'<body></body></html>'
    )
    url = "http://example.com/products/p19.html"
    page = HtmlPage(url, body=html)
    self.assertEqual(get_base_url(page), url)
def test_get_base_url(self):
    """get_base_url returns the ``href`` declared by the page's <base> tag."""
    # Adjacent literals are concatenated at compile time; this replaces a
    # backslash continuation that left a stray "\ " inside the markup.
    html = (
        u'<html><head><base href="http://example.com/products/" />'
        u'<body></body></html>'
    )
    page = HtmlPage("http://example.com/products/p19.html", body=html)
    self.assertEqual(get_base_url(page), "http://example.com/products/")
def adapt(self, text, htmlpage=None):
    """Turn a link extracted from ``htmlpage`` into a joined download URL.

    With no page there is nothing to resolve against, so ``text`` is handed
    back untouched. A ``None`` text (with a page) yields ``None``.

    Otherwise both the link and the page's base URL (from ``get_base_url``)
    are encoded with the page encoding (``utf-8`` when the page has no
    ``encoding`` attribute), passed through ``unquote_markup`` and
    ``strip_url``, joined with ``urljoin`` and finally passed through
    ``safe_download_url``.
    """
    if htmlpage is None:
        return text
    if text is None:
        return None

    page_encoding = getattr(htmlpage, 'encoding', 'utf-8')

    # Link side: encode, unquote markup, drop everything matched by the
    # module-level ``disallowed`` pattern, then strip.
    raw_link = unquote_markup(text.encode(page_encoding),
                              encoding=page_encoding)
    link = strip_url(disallowed.sub('', raw_link))

    # Base side: same encode / unquote / strip treatment.
    raw_base = get_base_url(htmlpage).encode(page_encoding)
    base = strip_url(unquote_markup(raw_base, encoding=page_encoding))

    return safe_download_url(urljoin(base, link))
def test_get_base_url_nobase(self):
    """Without a <base> tag, get_base_url returns the page's own URL."""
    url = "http://example.com/products/p19.html"
    page = HtmlPage(url, body=u"<html><head><body></body></html>")
    self.assertEqual(get_base_url(page), url)
def adapt(self, text, htmlpage=None):
    """Resolve ``text`` against the base URL of ``htmlpage``.

    With no page there is nothing to resolve against, so ``text`` is handed
    back untouched. A ``None`` text (with a page) yields ``None`` instead of
    failing on ``None.encode``.

    Otherwise ``text`` is encoded with the page encoding, joined to
    ``get_base_url(htmlpage)`` via ``urljoin_rfc``, passed through
    ``unquote_markup`` and then ``safe_download_url``.
    """
    if htmlpage is None:
        return text
    if text is None:
        return None
    encoded = text.encode(htmlpage.encoding)
    joined = urljoin_rfc(get_base_url(htmlpage), encoded)
    # NOTE(review): unquote_markup is called without an explicit encoding
    # here; confirm its default matches pages that are not UTF-8.
    return safe_download_url(unquote_markup(joined))
def adapt(self, text, htmlpage=None):
    """Resolve ``text`` against the base URL of ``htmlpage``.

    With no page there is nothing to resolve against, so ``text`` is handed
    back untouched. A ``None`` text (with a page) yields ``None`` instead of
    failing on ``None.encode``.

    Otherwise ``text`` and the page's base URL (from ``get_base_url``) are
    both encoded with the page encoding, joined with ``urljoin``, passed
    through ``unquote_markup`` using the same encoding, and then through
    ``safe_download_url``.
    """
    if htmlpage is None:
        return text
    if text is None:
        return None
    encoding = htmlpage.encoding
    # Encode both sides so urljoin never mixes text and byte strings.
    base = get_base_url(htmlpage).encode(encoding)
    joined = urljoin(base, text.encode(encoding))
    return safe_download_url(unquote_markup(joined, encoding=encoding))
def test_get_base_url_nobase(self):
    """A page with no <base> tag reports its own URL as the base URL."""
    page_url = "http://example.com/products/p19.html"
    markup = u'<html><head><body></body></html>'
    self.assertEqual(
        get_base_url(HtmlPage(page_url, body=markup)),
        page_url,
    )
def adapt(self, text, htmlpage=None):
    """Resolve ``text`` against the base URL of ``htmlpage``.

    With no page there is nothing to resolve against, so ``text`` is handed
    back untouched. A ``None`` text (with a page) yields ``None`` instead of
    failing on ``None.encode``.

    Otherwise ``text`` and the page's base URL (from ``get_base_url``) are
    both encoded with the page encoding, joined with ``urljoin``, passed
    through ``unquote_markup`` using the same encoding, and then through
    ``safe_download_url``.
    """
    if htmlpage is None:
        return text
    if text is None:
        return None
    encoding = htmlpage.encoding
    # Encode both sides so urljoin never mixes text and byte strings.
    base = get_base_url(htmlpage).encode(encoding)
    joined = urljoin(base, text.encode(encoding))
    return safe_download_url(unquote_markup(joined, encoding=encoding))