def test_header(self):
    """A ``column`` extractor on column 1 of ``csvfeed3`` must yield two links."""
    expected_urls = [
        'http://www.example.com/path',
        'http://www.example.com/path2',
    ]
    extractor = create_linkextractor_from_specs({"type": "column", "value": 1})
    feed_response = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
    found = list(extractor.links_to_follow(feed_response))
    # Check the count first so a length mismatch is reported before any URL.
    self.assertEqual(len(found), 2)
    for link, expected in zip(found, expected_urls):
        self.assertEqual(link.url, expected)
def test_extra_params(self):
    """A ``column`` extractor given a ``"|"`` delimiter must yield two links
    from ``csvfeed2``."""
    extractor_specs = {
        "type": "column",
        "value": 1,
        "delimiter": "|",
    }
    extractor = create_linkextractor_from_specs(extractor_specs)
    feed_response = TextResponse(url="http://www.example.com/", body=csvfeed2)
    found = list(extractor.links_to_follow(feed_response))
    self.assertEqual(len(found), 2)
    first, second = found[0], found[1]
    self.assertEqual(first.url, "http://www.example.com/path")
    self.assertEqual(second.url, "http://www.example.com/path2")
def test_simple(self):
    """An ``html`` extractor must find the single link in the ``html`` fixture,
    exposing both its URL and its anchor text."""
    extractor = create_linkextractor_from_specs({"type": "html", "value": None})
    page_response = UTF8HtmlResponse(url='http://www.example.com/', body=html)
    found = list(extractor.links_to_follow(page_response))
    self.assertEqual(len(found), 1)
    only_link = found[0]
    self.assertEqual(only_link.url, 'http://www.example.com/path')
    self.assertEqual(only_link.text, 'Click here')
def test_custom_withargs(self):
    """A ``regex`` extractor with ``allowed_schemes=['http']`` must yield one link.

    The sample text holds an http and an https ``www.example.com`` URL that
    both match the pattern, plus an unrelated URL; the test expects only the
    http one to be returned.
    """
    specs = {
        "type": "regex",
        # Raw string: "\w" is not a recognised escape in a plain literal and
        # triggers an invalid-escape warning on modern Python. The runtime
        # value of the pattern is unchanged.
        "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
        'allowed_schemes': ['http'],
    }
    lextractor = create_linkextractor_from_specs(specs)
    text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
    response = UTF8TextResponse(url='http://www.example.com/', body=text)
    links = list(lextractor.links_to_follow(response))
    self.assertEqual(len(links), 1)
    self.assertEqual(links[0].url, 'http://www.example.com/path')
def test_default(self):
    """A ``regex`` extractor with an empty pattern must still pick up the two
    URLs in the sample text; the second is expected without its ``#tre?`` tail."""
    sample_text = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
    expected_urls = (
        'http://www.example.com/path',
        'https://aws.amazon.com/product?id=23',
    )
    extractor = create_linkextractor_from_specs({"type": "regex", "value": ''})
    text_response = UTF8TextResponse(url='http://www.example.com/', body=sample_text)
    found = list(extractor.links_to_follow(text_response))
    self.assertEqual(len(found), 2)
    for link, expected in zip(found, expected_urls):
        self.assertEqual(link.url, expected)
def test_simple(self):
    """A ``pagination`` extractor must return the single link of the ``html``
    fixture when the page headers carry ``n_items = 1``."""
    extractor = create_linkextractor_from_specs(
        {"type": "pagination", "value": None}
    )
    raw_response = HtmlResponse(url="http://www.example.com/", body=html)
    page = htmlpage_from_response(raw_response)
    # The header is set on the page object before extraction.
    page.headers["n_items"] = 1
    found = list(extractor.links_to_follow(page))
    self.assertEqual(len(found), 1)
    only_link = found[0]
    self.assertEqual(only_link.url, "http://www.example.com/path")
    self.assertEqual(only_link.text, "Click here")
def test_custom(self):
    """A ``regex`` extractor with a custom pattern must yield both matching links.

    The sample text holds an http and an https ``www.example.com`` URL that
    match the pattern, plus an unrelated URL that must not be returned.
    """
    specs = {
        "type": "regex",
        # Raw string: "\w" is not a recognised escape in a plain literal and
        # triggers an invalid-escape warning on modern Python. The runtime
        # value of the pattern is unchanged.
        "value": r"url: ((?:http|https)://www.example.com/[\w/]+)",
    }
    lextractor = create_linkextractor_from_specs(specs)
    text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
    response = TextResponse(url="http://www.example.com/", body=text)
    links = list(lextractor.links_to_follow(response))
    self.assertEqual(len(links), 2)
    self.assertEqual(links[0].url, "http://www.example.com/path")
    self.assertEqual(links[1].url, "https://www.example.com/path2")
def test_sitemap(self):
    """A ``sitemap`` extractor must handle both fixtures on ``self``:
    ``self.sitemap`` (three links) and ``self.sitemapindex`` (one link)."""
    extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

    # Fixture self.sitemap: three links are expected.
    sitemap_links = list(extractor.links_to_follow(self.sitemap))
    self.assertEqual(len(sitemap_links), 3)
    self.assertEqual(
        sitemap_links[0].url, 'http://www.accommodationforstudents.com/'
    )

    # Fixture self.sitemapindex: one link to a nested sitemap is expected.
    index_links = list(extractor.links_to_follow(self.sitemapindex))
    self.assertEqual(len(index_links), 1)
    self.assertEqual(index_links[0].url, 'http://www.example.com/sitemap1.xml.gz')
def _create_start_request_from_specs(self, info):
    """Build the start ``Request`` described by ``info``.

    ``info`` must contain ``"url"`` and may contain ``"link_extractor"``
    specs. Without specs, the request is handled by ``self.parse``. With
    specs, the response is passed to a link extractor built from them, and
    one ``Request`` per extracted link is issued, each with ``spider.parse``
    as its callback.
    """
    start_url = info["url"]
    extractor_specs = info.get("link_extractor")
    if not extractor_specs:
        return Request(url=start_url, callback=self.parse)

    extractor = create_linkextractor_from_specs(extractor_specs)

    def _callback(spider, response):
        # Generator: one follow-up request per link found in the response.
        for found in extractor.links_to_follow(response):
            yield Request(url=found.url, callback=spider.parse)

    return Request(url=start_url, callback=_callback)
def handle_xml(self, response, seen):
    """Yield the requests produced from links found in an XML ``response``.

    The extractor type is the content-type subtype up to the first ``+``.
    If ``create_linkextractor_from_specs`` raises ``ValueError`` for that
    type, an ``XmlLinkExtractor`` is used instead. Each link goes through
    ``self._filter_link(link, seen)``; falsy results are skipped.
    """
    xml_type = content_type(response).subtype.partition('+')[0]
    extractor_specs = {'type': xml_type, 'value': ''}
    try:
        extractor = create_linkextractor_from_specs(extractor_specs)
    except ValueError:
        extractor = XmlLinkExtractor()
    for found in extractor.links_to_follow(response):
        follow_up = self._filter_link(found, seen)
        if not follow_up:
            continue
        yield follow_up
def test_custom(self):
    """A ``regex`` extractor with a custom pattern must yield both matching links.

    The sample text holds an http and an https ``www.example.com`` URL that
    match the pattern, plus an unrelated URL that must not be returned.
    """
    specs = {
        "type": "regex",
        # Raw string: "\w" is not a recognised escape in a plain literal and
        # triggers an invalid-escape warning on modern Python. The runtime
        # value of the pattern is unchanged.
        "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
    }
    lextractor = create_linkextractor_from_specs(specs)
    text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
    response = TextResponse(url='http://www.example.com/', body=text)
    links = list(lextractor.links_to_follow(response))
    self.assertEqual(len(links), 2)
    self.assertEqual(links[0].url, 'http://www.example.com/path')
    self.assertEqual(links[1].url, 'https://www.example.com/path2')
def handle_xml(self, response, seen):
    """Yield the requests produced from links found in an XML ``response``.

    The extractor type comes from applying ``XML_APPLICATION_TYPE`` (defined
    elsewhere) to the ``Content-Type`` header; its result is used like a
    regex match. A falsy result means type ``'xml'``. If
    ``create_linkextractor_from_specs`` raises ``ValueError`` for the type,
    a ``SitemapLinkExtractor`` is used instead. Each link goes through
    ``self._filter_link(link, seen)``; falsy results are skipped.
    """
    header_match = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
    if header_match:
        xml_type = header_match.groupdict()['type']
    else:
        xml_type = 'xml'
    extractor_specs = {'type': xml_type, 'value': ''}
    try:
        extractor = create_linkextractor_from_specs(extractor_specs)
    except ValueError:
        extractor = SitemapLinkExtractor()
    for found in extractor.links_to_follow(response):
        follow_up = self._filter_link(found, seen)
        if not follow_up:
            continue
        yield follow_up
def test_start_urls(self):
    """A ``pagination`` extractor configured with ``start_urls`` must return
    the three ``www.spam.com`` links of the sample page, with their texts."""
    extractor_specs = {
        "type": "pagination",
        "value": None,
        "start_urls": [
            'http://www.spam.com/?p=1',
            'http://www.eggs.com/?page=0',
        ],
    }
    extractor = create_linkextractor_from_specs(extractor_specs)
    markup = (
        ' <a href="http://www.spam.com/?p=100">Click here 1</a>'
        ' <a href="http://www.spam.com/?p=200">Click here 2</a>'
        ' <a href="http://www.spam.com/?p=300">Click here 3</a> '
    )
    page = htmlpage_from_response(
        HtmlResponse(url='http://www.example.com/', body=markup))
    # Sort by URL so the assertions do not depend on extraction order.
    found = sorted(extractor.links_to_follow(page), key=lambda link: link.url)
    expected_urls = [
        "http://www.spam.com/?p=100",
        "http://www.spam.com/?p=200",
        "http://www.spam.com/?p=300",
    ]
    expected_texts = ['Click here 1', 'Click here 2', 'Click here 3']
    self.assertEqual(len(found), 3)
    for link, expected in zip(found, expected_urls):
        self.assertEqual(link.url, expected)
    for link, expected in zip(found, expected_texts):
        self.assertEqual(link.text, expected)
def test_start_urls(self):
    """A ``pagination`` extractor configured with ``start_urls`` must return
    the three ``www.spam.com`` links of the sample page (wrapped in a
    ``UTF8HtmlResponse``), with their texts."""
    start_urls = ['http://www.spam.com/?p=1', 'http://www.eggs.com/?page=0']
    extractor = create_linkextractor_from_specs(
        {"type": "pagination", "value": None, "start_urls": start_urls}
    )
    markup = """ <a href="http://www.spam.com/?p=100">Click here 1</a> <a href="http://www.spam.com/?p=200">Click here 2</a> <a href="http://www.spam.com/?p=300">Click here 3</a> """
    raw_response = UTF8HtmlResponse(url='http://www.example.com/', body=markup)
    page = htmlpage_from_response(raw_response)
    # Sort by URL so the assertions do not depend on extraction order.
    by_url = sorted(extractor.links_to_follow(page), key=lambda link: link.url)
    self.assertEqual(len(by_url), 3)
    first, second, third = by_url[0], by_url[1], by_url[2]
    self.assertEqual(first.url, "http://www.spam.com/?p=100")
    self.assertEqual(second.url, "http://www.spam.com/?p=200")
    self.assertEqual(third.url, "http://www.spam.com/?p=300")
    self.assertEqual(first.text, 'Click here 1')
    self.assertEqual(second.text, 'Click here 2')
    self.assertEqual(third.text, 'Click here 3')
def test_rss(self):
    """An ``rss`` extractor must find exactly one link in ``self.response``."""
    extractor = create_linkextractor_from_specs({"type": "rss", "value": ""})
    found = list(extractor.links_to_follow(self.response))
    self.assertEqual(len(found), 1)
    first_link = found[0]
    self.assertEqual(first_link.url, "http://www.wikipedia.org/")
def test_xml_remove_namespaces(self):
    """An ``xpath`` extractor with ``remove_namespaces`` enabled must find
    three ``//link/@href`` values in ``self.atom``."""
    extractor_specs = {
        "type": "xpath",
        "value": "//link/@href",
        "remove_namespaces": True,
    }
    extractor = create_linkextractor_from_specs(extractor_specs)
    found = list(extractor.links_to_follow(self.atom))
    self.assertEqual(len(found), 3)
    first_link = found[0]
    self.assertEqual(first_link.url, 'http://example.org/feed/')
def test_atom(self):
    """An ``atom`` extractor must find three links in ``self.atom``; the
    first one is checked by URL."""
    extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
    found = list(extractor.links_to_follow(self.atom))
    self.assertEqual(len(found), 3)
    first_link = found[0]
    self.assertEqual(first_link.url, 'http://example.org/feed/')
def test_xml(self):
    """An ``xpath`` extractor selecting ``//item/link/text()`` must find
    exactly one link in ``self.response``."""
    extractor = create_linkextractor_from_specs(
        {"type": "xpath", "value": "//item/link/text()"}
    )
    found = list(extractor.links_to_follow(self.response))
    self.assertEqual(len(found), 1)
    first_link = found[0]
    self.assertEqual(first_link.url, 'http://www.wikipedia.org/')