def test_attrs(self):
    """Check which links are returned for different ``attrs`` settings.

    The first three checks run the extractor class under test
    (``self.extractor_cls``) against ``self.response``, a fixture that is
    set up outside this method.  The last check parses a small inline
    document with ``SgmlLinkExtractor`` directly.
    """
    # Scan only ``href`` attributes.
    href_only = self.extractor_cls(attrs="href")
    found = href_only.extract_links(self.response)
    self.assertEqual(found, [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # Scan ``href`` and ``src`` on <a>, <area> and <img>, with no extension
    # filtering, so the .jpg link is expected in the result as well.
    href_and_src = self.extractor_cls(
        attrs=("href", "src"),
        tags=("a", "area", "img"),
        deny_extensions=(),
    )
    found = href_and_src.extract_links(self.response)
    self.assertEqual(found, [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # With ``attrs=None`` no links are expected at all.
    no_attrs = self.extractor_cls(attrs=None)
    self.assertEqual(no_attrs.extract_links(self.response), [])

    # The <a> element below carries ``ref`` rather than ``href``, so only
    # the <area> link is expected.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    page = HtmlResponse("http://example.com/index.html", body=html)
    sgml_extractor = SgmlLinkExtractor(attrs="href")
    self.assertEqual(sgml_extractor.extract_links(page), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_deny_extensions(self):
    """Check that links with a denied file extension are left out.

    The document links to ``page.html`` and ``photo.jpg``; with
    ``deny_extensions="jpg"`` only the HTML page is expected.
    """
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    page = HtmlResponse("http://example.org/", body=html)

    extractor = SgmlLinkExtractor(deny_extensions="jpg")
    found = extractor.extract_links(page)

    self.assertEqual(found, [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
def test_attrs(self):
    """Verify link extraction under several ``attrs`` configurations.

    ``self.extractor_cls`` and ``self.response`` are supplied by the
    surrounding test case (not visible in this block).  The final check
    uses ``SgmlLinkExtractor`` on an inline document.
    """
    response = self.response

    # Case 1: only ``href`` attributes are considered.
    extractor = self.extractor_cls(attrs="href")
    links = extractor.extract_links(response)
    self.assertEqual(links, [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # Case 2: ``href`` and ``src`` on <a>/<area>/<img>, with an empty
    # deny list for extensions -- the image link must show up too.
    extractor = self.extractor_cls(
        attrs=("href", "src"),
        tags=("a", "area", "img"),
        deny_extensions=(),
    )
    links = extractor.extract_links(response)
    self.assertEqual(links, [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # Case 3: ``attrs=None`` is expected to produce an empty result.
    extractor = self.extractor_cls(attrs=None)
    links = extractor.extract_links(response)
    self.assertEqual(links, [])

    # Case 4: inline document where the anchor has ``ref`` instead of
    # ``href``; only the <area> element should contribute a link.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    inline_response = HtmlResponse("http://example.com/index.html", body=html)
    extractor = SgmlLinkExtractor(attrs="href")
    links = extractor.extract_links(inline_response)
    self.assertEqual(links, [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_attrs_sgml(self):
    """Check ``attrs="href"`` handling in ``SgmlLinkExtractor``.

    The <area> element has an ``href`` while the <a> element only has a
    ``ref`` attribute, so a single link (from <area>) is expected.
    """
    html = """<html><area href="sample1.html"></area> <a ref="sample2.html">sample text 2</a></html>"""
    page = HtmlResponse("http://example.com/index.html", body=html)

    extractor = SgmlLinkExtractor(attrs="href")
    found = extractor.extract_links(page)

    self.assertEqual(found, [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_link_nofollow(self):
    """Check that ``rel="nofollow"`` is reflected in ``Link.nofollow``.

    The first anchor is marked ``rel="nofollow"`` and must yield
    ``nofollow=True``; the second has no ``rel`` and must yield
    ``nofollow=False``.
    """
    html = """ <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a> <a href="about.html">About us</a> """
    page = HtmlResponse("http://example.org/page.html", body=html)
    extractor = SgmlLinkExtractor()

    # Materialise the result as a list before comparing.
    found = list(extractor.extract_links(page))

    self.assertEqual(found, [
        Link(
            url='http://example.org/page.html?action=print',
            text=u'Printer-friendly page',
            nofollow=True,
        ),
        Link(
            url='http://example.org/about.html',
            text=u'About us',
            nofollow=False,
        ),
    ])
def test_link_nofollow(self):
    """Check how the ``rel`` attribute maps onto ``Link.nofollow``.

    Three anchors are parsed: one with ``rel="nofollow"``, one without a
    ``rel`` attribute, and one with the multi-valued
    ``rel="external nofollow"``.  The first and third are expected to be
    flagged ``nofollow=True``, the second ``nofollow=False``.
    """
    html = """ <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a> <a href="about.html">About us</a> <a href="http://google.com/something" rel="external nofollow">Something</a> """
    page = HtmlResponse("http://example.org/page.html", body=html)
    extractor = SgmlLinkExtractor()

    # Materialise the result as a list before comparing.
    found = list(extractor.extract_links(page))

    self.assertEqual(found, [
        Link(
            url="http://example.org/page.html?action=print",
            text=u"Printer-friendly page",
            nofollow=True,
        ),
        Link(
            url="http://example.org/about.html",
            text=u"About us",
            nofollow=False,
        ),
        Link(
            url="http://google.com/something",
            text=u"Something",
            nofollow=True,
        ),
    ])
def parse(self, response):
    """Yield a ``Request`` for each Douban movie subject link in *response*.

    Links whose URL matches ``https://movie.douban.com/subject/<digits>``
    are extracted from *response*; for each one a ``Request`` is yielded
    with ``self.parse_item`` as its callback and ``self.headers`` as its
    headers.
    """
    # The dots in the host name are escaped so they match a literal "."
    # only.  Unescaped, "." matches any character, which makes the pattern
    # accept URLs that merely resemble the intended host.
    subject_links = SgmlLinkExtractor(
        allow=r'https://movie\.douban\.com/subject/\d+',
    )
    for link in subject_links.extract_links(response):
        yield Request(link.url, callback=self.parse_item, headers=self.headers)