Python create_linkextractor_from_specs示例，slybot.linkextractor.create_linkextractor_from_specs Python示例

示例#1

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_header(self):
     """Check the links followed for a ``column`` spec with value 1.

     The extractor is run over a text response whose body is the
     ``csvfeed3`` fixture and must yield exactly two links.
     """
     extractor = create_linkextractor_from_specs({"type": "column", "value": 1})
     feed = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
     found = list(extractor.links_to_follow(feed))
     self.assertEqual(len(found), 2)
     first, second = found
     self.assertEqual(first.url, 'http://www.example.com/path')
     self.assertEqual(second.url, 'http://www.example.com/path2')

示例#2

0

显示文件

文件： test_linkextractors.py 项目： plafl/portia

 def test_extra_params(self):
     """Check a ``column`` spec that also passes a ``delimiter`` of ``|``.

     The extractor is run over the ``csvfeed2`` fixture and must yield
     exactly two links.
     """
     column_specs = {"type": "column", "value": 1, "delimiter": "|"}
     extractor = create_linkextractor_from_specs(column_specs)
     feed = TextResponse(url="http://www.example.com/", body=csvfeed2)
     found = list(extractor.links_to_follow(feed))
     self.assertEqual(len(found), 2)
     first, second = found
     self.assertEqual(first.url, "http://www.example.com/path")
     self.assertEqual(second.url, "http://www.example.com/path2")

示例#3

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_simple(self):
     """Check that an ``html`` spec yields one link with its URL and text."""
     extractor = create_linkextractor_from_specs({"type": "html", "value": None})
     page = UTF8HtmlResponse(url='http://www.example.com/', body=html)
     found = list(extractor.links_to_follow(page))
     self.assertEqual(len(found), 1)
     link = found[0]
     self.assertEqual(link.url, 'http://www.example.com/path')
     self.assertEqual(link.text, 'Click here')

示例#4

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_custom_withargs(self):
     """Check a ``regex`` spec combined with ``allowed_schemes``.

     The text contains three ``url: ...`` occurrences; with
     ``allowed_schemes`` set to ``['http']`` only the ``http://`` link on
     www.example.com is expected to be followed.
     """
     specs = {
         "type": "regex",
         # Raw string: ``\w`` in a normal literal is an invalid escape
         # sequence and triggers a warning on modern Python versions.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
         'allowed_schemes': ['http'],
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = UTF8TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')

示例#5

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_default(self):
     """Check a ``regex`` spec whose value is the empty string.

     Two links are expected from the sample text; the second expected URL
     does not include the trailing ``#tre?`` present in the input.
     """
     extractor = create_linkextractor_from_specs({"type": "regex", "value": ''})
     sample = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
     reply = UTF8TextResponse(url='http://www.example.com/', body=sample)
     found = list(extractor.links_to_follow(reply))
     self.assertEqual(len(found), 2)
     first, second = found
     self.assertEqual(first.url, 'http://www.example.com/path')
     self.assertEqual(second.url, 'https://aws.amazon.com/product?id=23')

示例#6

0

显示文件

文件： test_linkextractors.py 项目： plafl/portia

 def test_simple(self):
     """Check that a ``pagination`` spec yields one link with URL and text."""
     extractor = create_linkextractor_from_specs(
         {"type": "pagination", "value": None})
     raw_response = HtmlResponse(url="http://www.example.com/", body=html)
     page = htmlpage_from_response(raw_response)
     # NOTE(review): the extractor presumably reads this header; its exact
     # meaning is not visible from this test.
     page.headers["n_items"] = 1
     found = list(extractor.links_to_follow(page))
     self.assertEqual(len(found), 1)
     link = found[0]
     self.assertEqual(link.url, "http://www.example.com/path")
     self.assertEqual(link.text, "Click here")

示例#7

0

显示文件

文件： test_linkextractors.py 项目： plafl/portia

 def test_custom(self):
     """Check a ``regex`` spec with a custom capturing pattern.

     The text contains three ``url: ...`` occurrences; the two on
     www.example.com (http and https) are expected to be followed.
     """
     specs = {
         "type": "regex",
         # Raw string: ``\w`` in a normal literal is an invalid escape
         # sequence and triggers a warning on modern Python versions.
         "value": r"url: ((?:http|https)://www.example.com/[\w/]+)",
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url="http://www.example.com/", body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[1].url, "https://www.example.com/path2")

示例#8

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

    def test_sitemap(self):
        """Check a ``sitemap`` spec against both sitemap fixtures.

        ``self.sitemap`` must yield three links and ``self.sitemapindex``
        must yield one; the first URL of each is verified.
        """
        extractor = create_linkextractor_from_specs(
            {"type": "sitemap", "value": ""})

        page_links = list(extractor.links_to_follow(self.sitemap))
        self.assertEqual(len(page_links), 3)
        self.assertEqual(page_links[0].url,
                         'http://www.accommodationforstudents.com/')

        index_links = list(extractor.links_to_follow(self.sitemapindex))
        self.assertEqual(len(index_links), 1)
        self.assertEqual(index_links[0].url,
                         'http://www.example.com/sitemap1.xml.gz')

示例#9

0

显示文件

文件： spider.py 项目： Kola0o0/slybot

 def _create_start_request_from_specs(self, info):
     """Build the start ``Request`` described by ``info``.

     ``info["url"]`` is the URL to request. When ``info`` carries a truthy
     ``"link_extractor"`` entry, the request's callback runs a link
     extractor built from it and yields one ``Request`` per followed link;
     otherwise the callback is ``self.parse``.
     """
     start_url = info["url"]
     extractor_specs = info.get("link_extractor")
     if not extractor_specs:
         return Request(url=start_url, callback=self.parse)

     extractor = create_linkextractor_from_specs(extractor_specs)

     # NOTE(review): this callback takes (spider, response); confirm the
     # caller invokes it with the spider as the first argument.
     def _callback(spider, response):
         for link in extractor.links_to_follow(response):
             yield Request(url=link.url, callback=spider.parse)

     return Request(url=start_url, callback=_callback)

示例#10

0

显示文件

文件： test_linkextractors.py 项目： torome/portia

    def test_sitemap(self):
        """Check a ``sitemap`` spec against both sitemap fixtures.

        ``self.sitemap`` must yield three links and ``self.sitemapindex``
        must yield one; the first URL of each is verified.
        """
        extractor = create_linkextractor_from_specs(
            {"type": "sitemap", "value": ""})

        page_links = list(extractor.links_to_follow(self.sitemap))
        self.assertEqual(len(page_links), 3)
        first_page_link = page_links[0]
        self.assertEqual(first_page_link.url,
                         'http://www.accommodationforstudents.com/')

        index_links = list(extractor.links_to_follow(self.sitemapindex))
        self.assertEqual(len(index_links), 1)
        first_index_link = index_links[0]
        self.assertEqual(first_index_link.url,
                         'http://www.example.com/sitemap1.xml.gz')

示例#11

0

显示文件

文件： annotations.py 项目： fakegit/portia

 def handle_xml(self, response, seen):
     """Yield the requests accepted by ``_filter_link`` for ``response``.

     The extractor type is the response content subtype up to the first
     ``+``. If ``create_linkextractor_from_specs`` raises ``ValueError``
     for that type, an ``XmlLinkExtractor`` is used instead.
     """
     subtype = content_type(response).subtype.split('+')[0]
     extractor_specs = {'type': subtype, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = XmlLinkExtractor()
     for link in extractor.links_to_follow(response):
         candidate = self._filter_link(link, seen)
         if not candidate:
             continue
         yield candidate

示例#12

0

显示文件

文件： test_linkextractors.py 项目： torome/portia

 def test_custom(self):
     """Check a ``regex`` spec with a custom capturing pattern.

     The text contains three ``url: ...`` occurrences; the two on
     www.example.com (http and https) are expected to be followed.
     """
     specs = {
         "type": "regex",
         # Raw string: ``\w`` in a normal literal is an invalid escape
         # sequence and triggers a warning on modern Python versions.
         "value": r'url: ((?:http|https)://www.example.com/[\w/]+)'
     }
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[1].url, 'https://www.example.com/path2')

示例#13

0

显示文件

文件： spider.py 项目： semutter/portia

    def _create_start_request_from_specs(self, info):
        """Build the start ``Request`` described by ``info``.

        ``info["url"]`` is the URL to request. When ``info`` carries a
        truthy ``"link_extractor"`` entry, the request's callback runs a
        link extractor built from it and yields one ``Request`` per
        followed link; otherwise the callback is ``self.parse``.
        """
        start_url = info["url"]
        extractor_specs = info.get("link_extractor")
        if not extractor_specs:
            return Request(url=start_url, callback=self.parse)

        extractor = create_linkextractor_from_specs(extractor_specs)

        # NOTE(review): this callback takes (spider, response); confirm the
        # caller invokes it with the spider as the first argument.
        def _callback(spider, response):
            for link in extractor.links_to_follow(response):
                yield Request(url=link.url, callback=spider.parse)

        return Request(url=start_url, callback=_callback)

示例#14

0

显示文件

文件： annotations.py 项目： tomzhang/portia

 def handle_xml(self, response, seen):
     """Yield the requests accepted by ``_filter_link`` for ``response``.

     The extractor type comes from applying ``XML_APPLICATION_TYPE`` to
     the ``Content-Type`` header, defaulting to ``'xml'`` when that gives a
     falsy result. If ``create_linkextractor_from_specs`` raises
     ``ValueError`` for the type, a ``SitemapLinkExtractor`` is used.
     """
     header_value = response.headers.get('Content-Type', '')
     matched = XML_APPLICATION_TYPE(header_value)
     if matched:
         xml_type = matched.groupdict()['type']
     else:
         xml_type = 'xml'
     extractor_specs = {'type': xml_type, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = SitemapLinkExtractor()
     for link in extractor.links_to_follow(response):
         candidate = self._filter_link(link, seen)
         if not candidate:
             continue
         yield candidate

示例#15

0

显示文件

文件： annotations.py 项目： BenJamesbabala/portia

 def handle_xml(self, response, seen):
     """Yield the requests accepted by ``_filter_link`` for ``response``.

     The extractor type comes from applying ``XML_APPLICATION_TYPE`` to
     the ``Content-Type`` header, defaulting to ``'xml'`` when that gives a
     falsy result. If ``create_linkextractor_from_specs`` raises
     ``ValueError`` for the type, a ``SitemapLinkExtractor`` is used.
     """
     header_value = response.headers.get('Content-Type', '')
     matched = XML_APPLICATION_TYPE(header_value)
     if matched:
         xml_type = matched.groupdict()['type']
     else:
         xml_type = 'xml'
     extractor_specs = {'type': xml_type, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = SitemapLinkExtractor()
     for link in extractor.links_to_follow(response):
         candidate = self._filter_link(link, seen)
         if not candidate:
             continue
         yield candidate

示例#16

0

显示文件

文件： annotations.py 项目： monocleman1/dd

 def handle_xml(self, response, seen):
     """Yield the requests accepted by ``_filter_link`` for ``response``.

     The extractor type is the response content subtype up to the first
     ``+``. If ``create_linkextractor_from_specs`` raises ``ValueError``
     for that type, an ``XmlLinkExtractor`` is used instead.
     """
     subtype = content_type(response).subtype.split('+')[0]
     extractor_specs = {'type': subtype, 'value': ''}
     try:
         extractor = create_linkextractor_from_specs(extractor_specs)
     except ValueError:
         extractor = XmlLinkExtractor()
     for link in extractor.links_to_follow(response):
         candidate = self._filter_link(link, seen)
         if not candidate:
             continue
         yield candidate

示例#17

0

显示文件

文件： test_linkextractors.py 项目： 01-/portia

 def test_start_urls(self):
     """Check a ``pagination`` spec that is given ``start_urls``.

     The page holds three anchors pointing at www.spam.com; all three are
     expected back, with URL and anchor text verified after sorting by URL.
     """
     pagination_specs = {
         "type": "pagination",
         "value": None,
         "start_urls": [
             'http://www.spam.com/?p=1',
             'http://www.eggs.com/?page=0',
         ],
     }
     extractor = create_linkextractor_from_specs(pagination_specs)
     markup = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = HtmlResponse(url='http://www.example.com/', body=markup)
     page = htmlpage_from_response(raw_response)
     # Sort by URL so the assertions do not depend on extraction order.
     found = sorted(extractor.links_to_follow(page),
                    key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     expected = [
         ("http://www.spam.com/?p=100", 'Click here 1'),
         ("http://www.spam.com/?p=200", 'Click here 2'),
         ("http://www.spam.com/?p=300", 'Click here 3'),
     ]
     # URLs are checked first, then anchor texts.
     for link, (url, _text) in zip(found, expected):
         self.assertEqual(link.url, url)
     for link, (_url, text) in zip(found, expected):
         self.assertEqual(link.text, text)

示例#18

0

显示文件

 def test_start_urls(self):
     """Check a ``pagination`` spec that is given ``start_urls``.

     The page holds three anchors pointing at www.spam.com; all three are
     expected back, with URL and anchor text verified after sorting by URL.
     """
     pagination_specs = {
         "type": "pagination",
         "value": None,
         "start_urls": [
             'http://www.spam.com/?p=1',
             'http://www.eggs.com/?page=0',
         ],
     }
     extractor = create_linkextractor_from_specs(pagination_specs)
     markup = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     raw_response = UTF8HtmlResponse(url='http://www.example.com/',
                                     body=markup)
     page = htmlpage_from_response(raw_response)
     # Sort by URL so the assertions do not depend on extraction order.
     found = sorted(extractor.links_to_follow(page),
                    key=lambda link: link.url)
     self.assertEqual(len(found), 3)
     expected = [
         ("http://www.spam.com/?p=100", 'Click here 1'),
         ("http://www.spam.com/?p=200", 'Click here 2'),
         ("http://www.spam.com/?p=300", 'Click here 3'),
     ]
     # URLs are checked first, then anchor texts.
     for link, (url, _text) in zip(found, expected):
         self.assertEqual(link.url, url)
     for link, (_url, text) in zip(found, expected):
         self.assertEqual(link.text, text)

示例#19

0

显示文件

文件： test_linkextractors.py 项目： plafl/portia

 def test_rss(self):
     """Check that an ``rss`` spec yields one link from ``self.response``."""
     extractor = create_linkextractor_from_specs({"type": "rss", "value": ""})
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     only_link = found[0]
     self.assertEqual(only_link.url, "http://www.wikipedia.org/")

示例#20

0

显示文件

文件： test_linkextractors.py 项目： r2k0/slybot

 def test_xml_remove_namespaces(self):
     """Check an ``xpath`` spec with ``remove_namespaces`` on ``self.atom``.

     ``//link/@href`` must yield three links from the fixture.
     """
     xpath_specs = {
         "type": "xpath",
         "value": "//link/@href",
         "remove_namespaces": True,
     }
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_link = found[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')

示例#21

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_xml_remove_namespaces(self):
     """Check an ``xpath`` spec with ``remove_namespaces`` on ``self.atom``.

     ``//link/@href`` must yield three links from the fixture.
     """
     xpath_specs = {
         "type": "xpath",
         "value": "//link/@href",
         "remove_namespaces": True,
     }
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_link = found[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')

示例#22

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_atom(self):
     """Check that an ``atom`` spec yields three links from ``self.atom``."""
     extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_link = found[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')

示例#23

0

显示文件

 def test_xml(self):
     """Check that an ``xpath`` spec yields one link from ``self.response``."""
     xpath_specs = {"type": "xpath", "value": "//item/link/text()"}
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     only_link = found[0]
     self.assertEqual(only_link.url, 'http://www.wikipedia.org/')

示例#24

0

显示文件

文件： test_linkextractors.py 项目： daqv/portia-dashboard

 def test_xml(self):
     """Check that an ``xpath`` spec yields one link from ``self.response``."""
     xpath_specs = {"type": "xpath", "value": "//item/link/text()"}
     extractor = create_linkextractor_from_specs(xpath_specs)
     found = list(extractor.links_to_follow(self.response))
     self.assertEqual(len(found), 1)
     only_link = found[0]
     self.assertEqual(only_link.url, 'http://www.wikipedia.org/')

示例#25

0

显示文件

 def test_atom(self):
     """Check that an ``atom`` spec yields three links from ``self.atom``."""
     extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
     found = list(extractor.links_to_follow(self.atom))
     self.assertEqual(len(found), 3)
     first_link = found[0]
     self.assertEqual(first_link.url, 'http://example.org/feed/')