def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Extract ``Link`` objects from *selector*.

    Each candidate href is absolutized against *base_url*, filtered through
    ``process_attr``, re-absolutized against *response_url* (to fix relative
    links produced by ``process_value``) and encoded with
    *response_encoding*.  Returns a de-duplicated list when ``self.unique``
    is set, otherwise all links in document order.
    """
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            # skip bogus links that urljoin cannot handle (consistent with
            # the newer _extract_links implementation)
            continue
        url = self.process_attr(attr_val)
        if url is None:
            continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=(el.get('rel') == 'nofollow'))
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Build a ``Link`` for every link found under *selector*.

    Hrefs are absolutized against *base_url* and again against
    *response_url* after ``process_attr``; bogus hrefs that ``urljoin``
    rejects are silently skipped.  Deduplication is delegated to
    ``self._deduplicate_if_needed``.
    """
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            # skipping bogus links
            continue
        url = self.process_attr(attr_val)
        if url is None:
            continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        links.append(Link(url, _collect_string_content(el) or u'',
                          nofollow=rel_has_nofollow(el.get('rel'))))
    return self._deduplicate_if_needed(links)
def test_extract_all_links(self):
    """The default extractor returns every link in the fixture page."""
    lx = self.extractor_cls()
    # whitespace handling in URLs depends on the extractor implementation
    page4_url = ('http://example.com/page%204.html'
                 if self.escapes_whitespace
                 else 'http://example.com/page 4.html')
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://example.com/sample3.html#foo',
             text='sample 3 repetition with fragment'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
        Link(url=page4_url, text=u'href with whitespaces'),
    ]
    self.assertEqual(list(lx.extract_links(self.response)), expected)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Collect ``Link`` objects, dropping invalid hrefs before absolutizing."""
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        if not (self.scan_tag(el.tag) and self.scan_attr(attr)):
            continue
        # patched behaviour: filter clearly invalid links before making
        # them absolute
        if not _is_valid_link(attr_val):
            continue
        # pseudo root.make_links_absolute(base_url)
        attr_val = urljoin(base_url, attr_val)
        url = self.process_attr(attr_val)
        if url is None:
            continue
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        links.append(Link(url, _collect_string_content(el) or '',
                          nofollow=(el.get('rel') == 'nofollow')))
    if self.unique:
        return unique_list(links, key=lambda link: link.url)
    return links
def extract_links(self, response):
    """Return Links for hrefs matched by the configured ``list_css``
    selector, plus any next-page links; errors are logged, not raised."""
    selector = Selector(response)
    list_css = self.get_css("list_css")
    if not list_css:
        return []
    raw_urls = []
    try:
        raw_urls.extend(selector.css(list_css).xpath('@href').extract())
        raw_urls.extend(self.extract_next_links(response))
    except Exception as err:
        # best-effort extraction: record the failure and return what we have
        self.logger.error("%s" % err)
    results = []
    for raw in raw_urls:
        full = URL.s_get_full_url(URL(raw), URL(response.url))
        if full:
            results.append(Link(url=full))
    return results
def extract_links(self, response):
    """Extract links from *response*, optionally restricted to the
    subtrees matched by ``self.restrict_xpaths``; the response URL itself
    is always included.  Returns a de-duplicated list."""
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        candidates = [
            node
            for xpath in self.restrict_xpaths
            for node in response.xpath(xpath)
        ]
    else:
        candidates = [response.selector]
    # the page's own URL is always part of the result
    collected = [Link(response.url)]
    for sel in candidates:
        extracted = self._extract_links(
            sel, response.url, response.encoding, base_url)
        collected.extend(self._process_links(extracted))
    return unique_list(collected)
def test_extract_all_links(self):
    """The default extractor returns every link in the fixture page."""
    lx = self.extractor_cls()
    page4_url = "http://example.com/page%204.html"
    expected = [
        Link(url="http://example.com/sample1.html", text=""),
        Link(url="http://example.com/sample2.html", text="sample 2"),
        Link(url="http://example.com/sample3.html", text="sample 3 text"),
        Link(url="http://example.com/sample3.html#foo",
             text="sample 3 repetition with fragment"),
        Link(url="http://www.google.com/something", text=""),
        Link(url="http://example.com/innertag.html", text="inner tag"),
        Link(url=page4_url, text="href with whitespaces"),
    ]
    self.assertEqual(list(lx.extract_links(self.response)), expected)
def test_eq_and_hash(self):
    """Links compare (and hash) equal iff both url and text match."""
    l1 = Link("http://www.example.com")
    l2 = Link("http://www.example.com/other")
    l3 = Link("http://www.example.com")
    l4 = Link("http://www.example.com", text="test")
    l5 = Link("http://www.example.com", text="test2")
    l6 = Link("http://www.example.com", text="test")
    # first triple differs by url, second by text
    for same_a, same_b, different in ((l1, l3, l2), (l4, l6, l5)):
        self.assertEqual(same_a, same_a)
        self.assertEqual(hash(same_a), hash(same_a))
        self.assertNotEqual(same_a, different)
        self.assertNotEqual(hash(same_a), hash(different))
        self.assertEqual(same_a, same_b)
        self.assertEqual(hash(same_a), hash(same_b))
def test_extraction_encoding(self):
    """Links are percent-encoded using the detected response encoding,
    whether declared in a header or sniffed from the body."""
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

    lx = BaseSgmlLinkExtractor()
    utf8_expected = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ]
    # declared utf-8 and header-less (sniffed) utf-8 must behave the same
    self.assertEqual(lx.extract_links(response_utf8), utf8_expected)
    self.assertEqual(lx.extract_links(response_noenc), utf8_expected)
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1')),
    ])
def test_extract_filter_allowed_domains(self):
    """Only links whose domain is in ``allow_domains`` are extracted."""
    lx = self.extractor_cls(allow_domains=('google.com', ))
    expected = [Link(url='http://www.google.com/something', text=u'')]
    self.assertEqual(list(lx.extract_links(self.response)), expected)
def mklink(url, anchortext=None, nofollow=False):
    """Build an absolute, entity-decoded Link.

    Relies on the enclosing scope for ``base_href`` and ``htmlpage``;
    the resulting URL is encoded with the page's encoding.
    """
    stripped = url.strip()
    decoded = replace_entities(stripped, encoding=htmlpage.encoding)
    absolute = urljoin(base_href, decoded)
    return Link(absolute.encode(htmlpage.encoding),
                text=anchortext, nofollow=nofollow)
def test_extraction(self):
    '''Test the extractor's behaviour among different situations'''
    sample1 = Link(url='http://example.com/sample1.jpg', text=u'sample 1')
    sample2 = Link(url='http://example.com/sample2.jpg', text=u'sample 2')
    sample4 = Link(url='http://example.com/sample4.jpg', text=u'sample 4')
    sample4_rep = Link(url='http://example.com/sample4.jpg',
                       text=u'sample 4 repetition')

    # default (unique) extraction over all <img> elements
    lx = HTMLImageLinkExtractor(locations=('//img', ))
    self.assertEqual(lx.extract_links(self.response),
                     [sample1, sample2, sample4])

    # unique=False keeps the repeated sample4 link
    lx = HTMLImageLinkExtractor(locations=('//img', ), unique=False)
    self.assertEqual(lx.extract_links(self.response),
                     [sample1, sample2, sample4, sample4_rep])

    # restricting to the wrapper div yields the same images
    lx = HTMLImageLinkExtractor(locations=('//div[@id="wrapper"]', ))
    self.assertEqual(lx.extract_links(self.response),
                     [sample1, sample2, sample4])

    # anchors produce different link text for the same images
    lx = HTMLImageLinkExtractor(locations=('//a', ))
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample2.jpg', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3'),
    ])
def test_follow_whitespace_link(self):
    """A trailing space in a link URL must be percent-encoded on follow."""
    link = Link('http://example.com/foo ')
    self._assert_followed_url(link, 'http://example.com/foo%20')
def test_extraction(self):
    '''Test the extractor's behaviour among different situations'''
    sample1 = Link(url='http://example.com/sample1.html', text=u'')
    sample2 = Link(url='http://example.com/sample2.html', text=u'sample 2')
    sample3 = Link(url='http://example.com/sample3.html',
                   text=u'sample 3 text')
    sample3_rep = Link(url='http://example.com/sample3.html',
                       text=u'sample 3 repetition')
    google = Link(url='http://www.google.com/something', text=u'')

    # (constructor kwargs, expected links) pairs, in the original order
    cases = [
        (dict(), [sample1, sample2, sample3, google]),
        (dict(allow=('sample', )), [sample1, sample2, sample3]),
        (dict(allow=('sample', ), unique=False),
         [sample1, sample2, sample3, sample3_rep]),
        (dict(allow=('sample', )), [sample1, sample2, sample3]),
        (dict(allow=('sample', ), deny=('3', )), [sample1, sample2]),
        (dict(allow_domains=('google.com', )), [google]),
    ]
    for kwargs, expected in cases:
        lx = SgmlLinkExtractor(**kwargs)
        self.assertEqual(list(lx.extract_links(self.response)), expected)
def test_attrs(self):
    """The ``attrs`` option controls which attributes yield links."""
    href_only = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), href_only)

    # adding "src" (with the img tag and no denied extensions) also
    # picks up the image URL, in document order
    with_src = list(href_only)
    with_src.insert(2, Link(url='http://example.com/sample2.jpg', text=u''))
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"), deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), with_src)

    # no attributes means no links at all
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])

    # a misspelled attribute ("ref") must not be treated as "href"
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_unicode_url(self):
    """A unicode URL is stored as utf-8 bytes and triggers one warning."""
    with warnings.catch_warnings(record=True) as caught:
        link = Link(u"http://www.example.com/\xa3")
        self.assertIsInstance(link.url, bytes)
        self.assertEqual(link.url, b'http://www.example.com/\xc2\xa3')
        assert len(caught) == 1, "warning not issued"
def test_extract_filter_allow_and_deny(self):
    """``deny`` patterns filter links that ``allow`` patterns matched."""
    lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=''),
        Link(url='http://example.com/sample2.html', text='sample 2'),
    ]
    self.assertEqual(list(lx.extract_links(self.response)), expected)
def extract_links(self, response):
    """Extract links by matching ``restrict_re`` against the response text
    and formatting each match into the (lazily resolved) base URL."""
    if not self.base_url:
        # resolved once and cached on the instance
        self.base_url = get_base_url(response)
    matches = re.findall(self.restrict_re, response.text)
    links = [
        Link(response.urljoin(self.base_url.format(str(match))))
        for match in matches
    ]
    return unique_list(links)
def _extract_links(self, response):
    """Yield a Link for the configured column of each CSV row, skipping
    rows that are too short to contain that column."""
    reader = csv.reader(StringIO(response.body), **self.fmtparams)
    for record in reader:
        if len(record) > self.column:
            yield Link(record[self.column])
def _extract_links(self, response):
    """Yield a Link (encoded with the response encoding) for every URL
    matched by the configured XPath."""
    selector = XmlXPathSelector(response)
    for extracted_url in selector.select(self.xpath).extract():
        yield Link(extracted_url.encode(response.encoding))
def test_restrict_css(self):
    """``restrict_css`` limits extraction to the matching region."""
    lx = self.extractor_cls(restrict_css=('#subwrapper a', ))
    expected = [Link(url='http://example.com/sample2.html', text=u'sample 2')]
    self.assertEqual(lx.extract_links(self.response), expected)
def mklink(url, anchortext=None, nofollow=False):
    """Build an absolute Link from a raw href.

    Relies on the enclosing scope for ``base_href`` and ``encoding``;
    entities are decoded before joining, and the final URL is encoded.
    """
    stripped = url.strip()
    decoded = remove_entities(stripped, encoding=encoding)
    absolute = urljoin(base_href, decoded).encode(encoding)
    return Link(absolute, text=anchortext, nofollow=nofollow)
def test_attrs(self):
    """The ``attrs`` option controls which attributes yield links."""
    page4_url = 'http://example.com/page%204.html'
    href_only = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://example.com/sample3.html#foo',
             text='sample 3 repetition with fragment'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
        Link(url=page4_url, text=u'href with whitespaces'),
    ]
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), href_only)

    # adding "src" (with the img tag and no denied extensions) also picks
    # up the image URL, in document order
    with_src = list(href_only)
    with_src.insert(2, Link(url='http://example.com/sample2.jpg', text=u''))
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"), deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), with_src)

    # no attributes means no links at all
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
def test_restrict_css(self):
    """``restrict_css`` limits extraction to the matching region."""
    lx = self.extractor_cls(restrict_css=("#subwrapper a", ))
    expected = [Link(url="http://example.com/sample2.html", text="sample 2")]
    self.assertEqual(lx.extract_links(self.response), expected)
def test_xhtml(self):
    """XHTML pages yield the same links whether parsed as HTML or XML,
    and rel="nofollow" is honoured while other rel values are not."""
    xhtml = """
    <?xml version="1.0"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>XHTML document title</title>
    </head>
    <body>
        <div class='links'>
        <p><a href="/about.html">About us</a></p>
        </div>
        <div>
        <p><a href="/follow.html">Follow this link</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
        </div>
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
    </body>
    </html>
    """
    expected = [
        Link(url='http://example.com/about.html', text=u'About us',
             fragment='', nofollow=False),
        Link(url='http://example.com/follow.html', text=u'Follow this link',
             fragment='', nofollow=False),
        Link(url='http://example.com/nofollow.html',
             text=u'Dont follow this one', fragment='', nofollow=True),
        Link(url='http://example.com/nofollow2.html',
             text=u'Choose to follow or not', fragment='', nofollow=False),
    ]
    # both response types must extract the same set of links
    for response_cls in (HtmlResponse, XmlResponse):
        response = response_cls("http://example.com/index.xhtml", body=xhtml)
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), expected)
def test_attrs(self):
    """The ``attrs`` option controls which attributes yield links."""
    page4_url = "http://example.com/page%204.html"
    href_only = [
        Link(url="http://example.com/sample1.html", text=""),
        Link(url="http://example.com/sample2.html", text="sample 2"),
        Link(url="http://example.com/sample3.html", text="sample 3 text"),
        Link(url="http://example.com/sample3.html#foo",
             text="sample 3 repetition with fragment"),
        Link(url="http://www.google.com/something", text=""),
        Link(url="http://example.com/innertag.html", text="inner tag"),
        Link(url=page4_url, text="href with whitespaces"),
    ]
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), href_only)

    # adding "src" (with the img tag and no denied extensions) also picks
    # up the image URL, in document order
    with_src = list(href_only)
    with_src.insert(2, Link(url="http://example.com/sample2.jpg", text=""))
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"), deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), with_src)

    # no attributes means no links at all
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
def test_restrict_xpaths(self):
    """``restrict_xpaths`` limits extraction to the matched subtree."""
    lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ]
    self.assertEqual(list(lx.extract_links(self.response)), expected)
def test_restrict_xpaths_with_html_entities(self):
    """Non-ASCII characters in hrefs are decoded and percent-encoded."""
    html = '<html><body><p><a href="/♥/you?c=€">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html, encoding='iso8859-15')
    lx = SgmlLinkExtractor(restrict_xpaths='//p')
    links = lx.extract_links(response)
    self.assertEqual(links, [
        Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
             text=u'text'),
    ])
def _add_link(url_sel, alt_sel=None):
    """Append a Link built from the first URL extracted by *url_sel*
    (with optional alt text); relies on ``ret``/``encoding``/``flatten``
    from the enclosing scope."""
    urls = flatten([url_sel.extract()])
    alts = flatten([alt_sel.extract()]) if alt_sel else (u'', )
    if urls:
        ret.append(Link(unicode_to_str(urls[0], encoding), alts[0]))
def get_url(self):
    """Return a Link for this element's first @href value."""
    href = self.select('@href').extract()[0]
    return Link(href)