def test_link_text_wrong_encoding(self):
    """Link text containing bytes that are invalid for the declared
    encoding is decoded with U+FFFD (the Unicode replacement character)."""
    html = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
    response = HtmlResponse("http://www.example.com", body=html, encoding='utf-8')
    extractor = BaseSgmlLinkExtractor()
    expected = [
        Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
def test_extraction_encoding(self):
    """URLs and link text are decoded using the response encoding, taken
    from the Content-Type header when present, otherwise auto-detected."""
    noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=noenc_body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=noenc_body)
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=latin1_body)

    extractor = BaseSgmlLinkExtractor()
    # Same expectation whether the charset comes from the header or is
    # detected from the document itself.
    utf8_expected = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ]
    self.assertEqual(extractor.extract_links(response_utf8), utf8_expected)
    self.assertEqual(extractor.extract_links(response_noenc), utf8_expected)
    self.assertEqual(extractor.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1')),
    ])
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href',),
             canonicalize=True, unique=True, process_value=None):
    """Link extractor with allow/deny URL-regex and domain filtering.

    :param allow: regex pattern(s) (strings or compiled) a URL must match.
    :param deny: regex pattern(s) a URL must NOT match.
    :param allow_domains: domains links must belong to.
    :param deny_domains: domains links must not belong to.
    :param restrict_xpaths: XPaths delimiting the document regions that
        links are extracted from.
    :param tags: tag names whose attributes are scanned for links.
    :param attrs: attribute names treated as link targets.
        Fix: the default was ``('href')`` — a plain string, not a tuple,
        because of the missing trailing comma — so ``attr_func`` performed
        a *substring* test (e.g. ``'h' in 'href'`` is True).  It is now a
        real one-element tuple, which is backward compatible for the
        exact attribute ``'href'``.
    :param canonicalize: whether to canonicalize each extracted URL.
    :param unique: whether to drop duplicate links.
    :param process_value: optional callable applied to raw attr values.
    """
    self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
                      for x in arg_to_iter(allow)]
    self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
                     for x in arg_to_iter(deny)]
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.canonicalize = canonicalize
    # True container-membership tests (not substring checks).
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
                                   unique=unique, process_value=process_value)
def test_matches(self):
    """With no allow/deny patterns configured, every URL matches."""
    extractor = BaseSgmlLinkExtractor()
    for url in ('http://lotsofstuff.com/stuff1/index',
                'http://evenmorestuff.com/uglystuff/index'):
        self.assertEqual(extractor.matches(url), True)
def test_base_url(self):
    """Relative links resolve against <base href>, which takes precedence
    over the response URL (absolute, host-relative and scheme-relative)."""
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

    # <base> holding a full absolute URL on another domain.
    html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" /> <body><p><a href="item/12.html">Item 12</a></p> </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

    # <base> holding an absolute path: resolved against the response host.
    html = """<html><head><title>Page title<title><base href="/" /> <body><p><a href="item/12.html">Item 12</a></p></body></html>"""
    response = HtmlResponse("https://example.org/somepage/index.html", body=html)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='https://example.org/item/12.html', text='Item 12')])

    # <base> without a scheme: inherits the response scheme.
    html = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" /> <body><p><a href="item/12.html">Item 12</a></p></body></html>"""
    response = HtmlResponse("https://example.org/somepage/index.html", body=html)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])
def test_base_url(self):
    """<base href> overrides the response URL when resolving relative
    links: absolute base, path-only base, and scheme-relative base."""
    lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

    cases = [
        # (base href flavour embedded in the html, response url, expected link)
        ("""<html><head><title>Page title<title><base href="http://otherdomain.com/base/" /> <body><p><a href="item/12.html">Item 12</a></p> </body></html>""",
         "http://example.org/somepage/index.html",
         'http://otherdomain.com/base/item/12.html'),
        ("""<html><head><title>Page title<title><base href="/" /> <body><p><a href="item/12.html">Item 12</a></p></body></html>""",
         "https://example.org/somepage/index.html",
         'https://example.org/item/12.html'),
        ("""<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" /> <body><p><a href="item/12.html">Item 12</a></p></body></html>""",
         "https://example.org/somepage/index.html",
         'https://noschemedomain.com/path/to/item/12.html'),
    ]
    for html, url, expected_url in cases:
        response = HtmlResponse(url, body=html)
        self.assertEqual(lx.extract_links(response),
                         [Link(url=expected_url, text='Item 12')])
def test_extraction_encoding(self):
    """Extraction honours the response encoding for both URL
    percent-escaping and link-text decoding."""
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    # Same document served with an explicit charset header and without one.
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

    lx = BaseSgmlLinkExtractor()
    expected_utf8 = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8'))
    ]
    self.assertEqual(lx.extract_links(response_utf8), expected_utf8)
    self.assertEqual(lx.extract_links(response_noenc), expected_utf8)
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1'))
    ])
def test_extraction_encoding(self):
    """Links come back decoded with the response's charset (from the
    Content-Type header, or detected when the header is absent)."""
    utf8_doc = get_testdata("link_extractor", "linkextractor_noenc.html")
    latin1_doc = get_testdata("link_extractor", "linkextractor_latin1.html")

    with_header = HtmlResponse(
        url="http://example.com/utf8", body=utf8_doc,
        headers={"Content-Type": ["text/html; charset=utf-8"]}
    )
    without_header = HtmlResponse(url="http://example.com/noenc", body=utf8_doc)
    latin1_resp = HtmlResponse(url="http://example.com/latin1", body=latin1_doc)

    lx = BaseSgmlLinkExtractor()
    utf8_links = [
        Link(url="http://example.com/sample_%C3%B1.html", text=""),
        Link(url="http://example.com/sample_%E2%82%AC.html",
             text="sample \xe2\x82\xac text".decode("utf-8")),
    ]
    self.assertEqual(lx.extract_links(with_header), utf8_links)
    self.assertEqual(lx.extract_links(without_header), utf8_links)
    self.assertEqual(lx.extract_links(latin1_resp), [
        Link(url="http://example.com/sample_%F1.html", text=""),
        Link(url="http://example.com/sample_%E1.html",
             text="sample \xe1 text".decode("latin1")),
    ])
def test_base_url(self):
    """A <base href> pointing at another domain wins over the response
    URL when resolving a relative link."""
    html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" /> <body><p><a href="item/12.html">Item 12</a></p> </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    expected = [Link(url="http://otherdomain.com/base/item/12.html", text="Item 12")]
    self.assertEqual(extractor.extract_links(response), expected)
def test_basic(self):
    """The default extractor collects <a href> links (relative, absolute
    and parent-relative) and ignores non-link tags such as <img>."""
    html = """<html><head><title>Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <p><a href="/about.html">About us</a></p> <img src="/logo.png" alt="Company logo (not a link)" /> <p><a href="../othercat.html">Other category</a></p> <p><a href="/" /></p> </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    expected = [
        Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
        Link(url='http://example.org/about.html', text='About us'),
        Link(url='http://example.org/othercat.html', text='Other category'),
        Link(url='http://example.org/', text=''),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
def _process_links(self, links):
    """Apply the URL-validity check (when enabled), the allow/deny
    regexes and the domain white/blacklists, canonicalize the survivors
    if configured, and delegate to the base-class processing."""
    if self.check_url:
        links = [lnk for lnk in links if _is_valid_url(lnk.url)]
    if self.allow_res:
        links = [lnk for lnk in links if _matches(lnk.url, self.allow_res)]
    if self.deny_res:
        links = [lnk for lnk in links if not _matches(lnk.url, self.deny_res)]
    if self.allow_domains:
        links = [lnk for lnk in links
                 if url_is_from_any_domain(lnk.url, self.allow_domains)]
    if self.deny_domains:
        links = [lnk for lnk in links
                 if not url_is_from_any_domain(lnk.url, self.deny_domains)]
    if self.canonicalize:
        for lnk in links:
            lnk.url = canonicalize_url(lnk.url)
    return BaseSgmlLinkExtractor._process_links(self, links)
def _process_links(self, links):
    """Filter links by validity, allow/deny regexes and domains, then keep
    only links whose URL carries a non-ignored CustomerId (the 7th
    '/'-separated URL segment).

    Fix: URLs with fewer than seven '/'-separated parts used to raise
    IndexError on ``split('/')[6]``, aborting link processing for the
    whole page; such links are now skipped instead.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        parts = link.url.split('/')
        if len(parts) <= 6:
            # URL too short to contain a CustomerId segment; skip rather
            # than crash with IndexError.
            continue
        CustomerId = parts[6]
        if not self._ignore_identifier(CustomerId):
            log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Filter links through the validity check, the allow/deny regexes
    and the extractor's own domain-matching helper, canonicalizing the
    remaining URLs before base-class processing."""
    links = [lk for lk in links if _is_valid_url(lk.url)]
    if self.allow_res:
        links = [lk for lk in links if _matches(lk.url, self.allow_res)]
    if self.deny_res:
        links = [lk for lk in links if not _matches(lk.url, self.deny_res)]
    # Domain filtering uses the subclass-provided predicate rather than
    # the module-level url_is_from_any_domain helper.
    if self.allow_domains:
        links = [lk for lk in links
                 if self._url_is_from_any_domain(lk.url, self.allow_domains)]
    if self.deny_domains:
        links = [lk for lk in links
                 if not self._url_is_from_any_domain(lk.url, self.deny_domains)]
    if self.canonicalize:
        for lk in links:
            lk.url = canonicalize_url(lk.url)
    return BaseSgmlLinkExtractor._process_links(self, links)
def _process_links(self, links):
    """Filter links by validity, allow/deny regexes and domains, extract
    the ASIN (the 6th '/'-separated URL segment) from each survivor and
    rewrite the link to the Amazon product-reviews URL for that ASIN.

    Fix: URLs with fewer than six '/'-separated parts used to raise
    IndexError on ``split('/')[5]``, aborting link processing for the
    whole page; such links are now skipped instead.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        parts = link.url.split('/')
        if len(parts) <= 5:
            # URL too short to contain an ASIN segment; skip rather than
            # crash with IndexError.
            continue
        ASIN = parts[5]
        if not self._ignore_identifier(ASIN):
            log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
            link.url = ("http://www.amazon.com/product-reviews/" + ASIN +
                        "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0")
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def test_basic(self):
    """The default extractor collects <a href> links, keeps link text
    (including entity-like '>>' content), and ignores <img> tags."""
    html = """<html><head><title>Page title<title> <body><p><a href="item/12.html">Item 12</a></p> <p><a href="/about.html">About us</a></p> <img src="/logo.png" alt="Company logo (not a link)" /> <p><a href="../othercat.html">Other category</a></p> <p><a href="/">>></a></p> <p><a href="/" /></p> </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    expected = [
        Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
        Link(url='http://example.org/about.html', text='About us'),
        Link(url='http://example.org/othercat.html', text='Other category'),
        Link(url='http://example.org/', text='>>'),
        Link(url='http://example.org/', text=''),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
def _process_links(self, links):
    """Drop links failing the validity / regex / domain filters, then
    canonicalize what remains before handing off to the base class."""
    def keep(url):
        # Filters are evaluated in the same order as the original
        # successive list comprehensions.
        if self.check_url and not _is_valid_url(url):
            return False
        if self.allow_res and not _matches(url, self.allow_res):
            return False
        if self.deny_res and _matches(url, self.deny_res):
            return False
        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
            return False
        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
            return False
        return True

    links = [link for link in links if keep(link.url)]
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    return BaseSgmlLinkExtractor._process_links(self, links)
def _process_links(self, links):
    """Filter links, extract the ASIN (6th '/'-separated URL segment) of
    each survivor and, when the ASIN is not ignored, rewrite the link to
    the corresponding Amazon product-reviews URL.

    Fix: ``link.url.split('/')[5]`` previously raised IndexError for
    URLs with fewer than six segments; those links are now skipped so a
    single short URL cannot abort processing of the whole page.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        segments = link.url.split('/')
        if len(segments) <= 5:
            continue  # no ASIN segment present; skip instead of raising
        ASIN = segments[5]
        if not self._ignore_identifier(ASIN):
            log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
            link.url = ("http://www.amazon.com/product-reviews/" + ASIN +
                        "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0")
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Filter links by validity, allow/deny regexes and domains, keeping
    only links whose URL carries a non-ignored CustomerId (the 7th
    '/'-separated URL segment).

    Fix: ``link.url.split('/')[6]`` previously raised IndexError for
    URLs with fewer than seven segments; those links are now skipped so
    a single short URL cannot abort processing of the whole page.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        segments = link.url.split('/')
        if len(segments) <= 6:
            continue  # no CustomerId segment present; skip instead of raising
        CustomerId = segments[6]
        if not self._ignore_identifier(CustomerId):
            log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links