Пример #1
0
 def test_url_is_from_spider_class_attributes(self):
     """Check url_is_from_spider against a spider *class* that only sets ``name``.

     URLs on example.com (including subdomains) are expected to match;
     URLs on example.org and example.net are expected not to.
     """
     class MySpider(Spider):
         name = 'example.com'

     matching_urls = (
         'http://www.example.com/some/page.html',
         'http://sub.example.com/some/page.html',
     )
     foreign_urls = (
         'http://www.example.org/some/page.html',
         'http://www.example.net/some/page.html',
     )
     for url in matching_urls:
         self.assertTrue(url_is_from_spider(url, MySpider))
     for url in foreign_urls:
         self.assertFalse(url_is_from_spider(url, MySpider))
Пример #2
0
 def test_url_is_from_spider(self):
     """Check url_is_from_spider against a spider instance named 'example.com'.

     Each entry pairs a URL with the assertion its result must satisfy.
     """
     spider = Spider(name='example.com')
     expectations = (
         ('http://www.example.com/some/page.html', self.assertTrue),
         ('http://sub.example.com/some/page.html', self.assertTrue),
         ('http://www.example.org/some/page.html', self.assertFalse),
         ('http://www.example.net/some/page.html', self.assertFalse),
     )
     for url, check in expectations:
         check(url_is_from_spider(url, spider))
Пример #3
0
    def test_url_is_from_spider_with_allowed_domains(self):
        """Check url_is_from_spider when the spider declares ``allowed_domains``.

        The first spider is exercised with a list of domains; the later
        spiders repeat one positive check with a set and a tuple.
        """
        spider = Spider(
            name='example.com', allowed_domains=['example.org', 'example.net'])
        accepted_urls = (
            'http://www.example.com/some/page.html',
            'http://sub.example.com/some/page.html',
            'http://example.com/some/page.html',
            'http://www.example.org/some/page.html',
            'http://www.example.net/some/page.html',
        )
        for url in accepted_urls:
            self.assertTrue(url_is_from_spider(url, spider))
        self.assertFalse(
            url_is_from_spider('http://www.example.us/some/page.html', spider))

        # Same positive check with allowed_domains given as a set, then a tuple.
        domain_containers = (
            set(('example.com', 'example.net')),
            ('example.com', 'example.net'),
        )
        for domains in domain_containers:
            spider = Spider(name='example.com', allowed_domains=domains)
            self.assertTrue(
                url_is_from_spider('http://www.example.com/some/page.html', spider))
Пример #4
0
    def test_url_is_from_spider_class_attributes(self):
        """Check url_is_from_spider with a spider class defining only ``name``."""
        class MySpider(Spider):
            name = 'example.com'

        # (url, expected result) pairs, checked in this order.
        cases = (
            ('http://www.example.com/some/page.html', True),
            ('http://sub.example.com/some/page.html', True),
            ('http://www.example.org/some/page.html', False),
            ('http://www.example.net/some/page.html', False),
        )
        for url, should_match in cases:
            outcome = url_is_from_spider(url, MySpider)
            if should_match:
                self.assertTrue(outcome)
            else:
                self.assertFalse(outcome)
Пример #5
0
 def process_response(self, request, response, spider):
     """Optionally replace the response with a request for its canonical URL.

     Only acts when ``response`` is a non-empty ``HtmlResponse`` and the
     spider has a truthy ``follow_canonical_links`` attribute.  The first
     link returned by ``self._extractor`` is used as the canonical URL; if
     it differs from ``request.url`` and ``url_is_from_spider`` accepts it,
     a copy of the request pointing at that URL is returned.  In every
     other case the original ``response`` is returned unchanged.
     """
     eligible = (
         isinstance(response, HtmlResponse)
         and response.body
         and getattr(spider, 'follow_canonical_links', False)
     )
     if not eligible:
         return response

     links = self._extractor.extract_links(response)
     if not links:
         return response

     canonical_url = links[0].url
     if canonical_url != request.url and url_is_from_spider(canonical_url, spider):
         message = "Redirecting (rel=\"canonical\") to %s from %s" % (canonical_url, request)
         log.msg(message, level=log.DEBUG, spider=spider)
         # The callback keeps the canonical page only on HTTP 200; otherwise
         # it falls back to the response already in hand.
         return request.replace(
             url=canonical_url,
             callback=lambda r: r if r.status == 200 else response)
     return response
Пример #6
0
 def process_response(self, request, response, spider):
     """Return a request for the canonical URL when one should be followed.

     The response is passed through untouched unless it is an
     ``HtmlResponse`` with a body, the spider sets a truthy
     ``follow_canonical_links``, ``self._extractor`` yields at least one
     link, and that link's URL both differs from ``request.url`` and
     passes ``url_is_from_spider`` for this spider.
     """
     wants_canonical = getattr(spider, 'follow_canonical_links', False)
     if not (isinstance(response, HtmlResponse) and response.body and wants_canonical):
         return response

     extracted = self._extractor.extract_links(response)
     if not extracted:
         return response

     target = extracted[0].url
     if target == request.url or not url_is_from_spider(target, spider):
         return response

     log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (target, request),
             level=log.DEBUG, spider=spider)
     # On anything other than HTTP 200 the callback hands back the response
     # we already have instead of the canonical one.
     return request.replace(
         url=target,
         callback=lambda r: r if r.status == 200 else response)
Пример #7
0
    def test_url_is_from_spider_with_allowed_domains(self):
        """Check url_is_from_spider for spiders that declare ``allowed_domains``.

        Covers allowed_domains supplied as a list, a set and a tuple.
        """
        spider = Spider(name="example.com", allowed_domains=["example.org", "example.net"])
        expected_matches = (
            "http://www.example.com/some/page.html",
            "http://sub.example.com/some/page.html",
            "http://example.com/some/page.html",
            "http://www.example.org/some/page.html",
            "http://www.example.net/some/page.html",
        )
        for url in expected_matches:
            self.assertTrue(url_is_from_spider(url, spider))
        self.assertFalse(url_is_from_spider("http://www.example.us/some/page.html", spider))

        # Non-list containers must be accepted as well.
        for domains in (set(("example.com", "example.net")), ("example.com", "example.net")):
            spider = Spider(name="example.com", allowed_domains=domains)
            self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))
Пример #8
0
    def test_url_is_from_spider_with_allowed_domains(self):
        """Check url_is_from_spider when ``allowed_domains`` is a list, set or tuple."""
        sample_url = 'http://www.example.com/some/page.html'

        spider = Spider(name='example.com', allowed_domains=['example.org', 'example.net'])
        # (url, expected result) pairs for the list-based spider.
        cases = (
            (sample_url, True),
            ('http://sub.example.com/some/page.html', True),
            ('http://example.com/some/page.html', True),
            ('http://www.example.org/some/page.html', True),
            ('http://www.example.net/some/page.html', True),
            ('http://www.example.us/some/page.html', False),
        )
        for url, should_match in cases:
            outcome = url_is_from_spider(url, spider)
            if should_match:
                self.assertTrue(outcome)
            else:
                self.assertFalse(outcome)

        spider = Spider(name='example.com', allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(url_is_from_spider(sample_url, spider))

        spider = Spider(name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(url_is_from_spider(sample_url, spider))
Пример #9
0
 def fromurl(self, url):
     """Return the spider that should handle ``url``, or None if none matches.

     Lookup order:

     1. If ``self.force_domain`` is set, return ``self._spiders.get()`` of
        that domain (which may be None) without looking at ``url``.
     2. Otherwise take the URL's hostname, drop a single leading ``www.``
        label, and look that domain up directly in ``self._spiders``.
     3. Failing that, return the first spider in ``self._spiders`` for
        which ``url_is_from_spider(url, spider)`` is true.

     URLs without a hostname (relative or malformed) yield None.
     """
     if self.force_domain:
         return self._spiders.get(self.force_domain)

     # ``hostname`` is None when the URL has no network location.  Do not
     # pass it through str(): that would produce the truthy string 'None'
     # and defeat the emptiness check below.
     hostname = urlparse.urlparse(url).hostname or ''

     # Strip only a *leading* "www." label; a blanket replace() would also
     # mangle hosts such as "example.www.com".
     domain = hostname[4:] if hostname.startswith('www.') else hostname
     if not domain:
         return None

     if domain in self._spiders:         # try first locating by domain
         return self._spiders[domain]

     # else search spider by spider
     for spider in self._spiders.values():
         if url_is_from_spider(url, spider):
             return spider
     return None
     def process_response(self, request, response, spider):
         """Swap the response for a request to its canonical URL when applicable.

         Preconditions for acting: ``response`` is an ``HtmlResponse`` with a
         non-empty body and the spider has a truthy ``follow_canonical_links``
         attribute.  The URL of the first link from ``self._extractor`` is
         followed only if it differs from ``request.url`` and
         ``url_is_from_spider`` accepts it; otherwise ``response`` is
         returned as-is.
         """
         should_inspect = (
             isinstance(response, HtmlResponse)
             and response.body
             and getattr(spider, 'follow_canonical_links', False)
         )
         if not should_inspect:
             return response

         candidates = self._extractor.extract_links(response)
         if not candidates:
             return response

         canonical = candidates[0].url
         if canonical != request.url and url_is_from_spider(canonical, spider):
             log.msg(
                 "Redirecting (rel=\"canonical\") to %s from %s" % (canonical, request),
                 level=log.DEBUG,
                 spider=spider)
             # Keep the canonical page only on HTTP 200; otherwise fall back
             # to the response we already have.
             return request.replace(
                 url=canonical,
                 callback=lambda r: r if r.status == 200 else response)
         return response
  
 # Snippet imported from snippets.scrapy.org (which no longer works)
 # author: pablo
 # date  : Aug 27, 2010
Пример #11
0
 def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
     """Check url_is_from_spider for a spider class with ``allowed_domains``.

     The class sets both ``name`` and a tuple of ``allowed_domains``; URLs
     on any of those domains must match, example.us must not.
     """
     class MySpider(Spider):
         name = 'example.com'
         allowed_domains = ('example.org', 'example.net')

     accepted_urls = (
         'http://www.example.com/some/page.html',
         'http://sub.example.com/some/page.html',
         'http://example.com/some/page.html',
         'http://www.example.org/some/page.html',
         'http://www.example.net/some/page.html',
     )
     for url in accepted_urls:
         self.assertTrue(url_is_from_spider(url, MySpider))
     self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))
Пример #12
0
    def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
        """Check url_is_from_spider for a BaseSpider subclass with ``allowed_domains``."""
        class MySpider(BaseSpider):
            name = "example.com"
            allowed_domains = ["example.org", "example.net"]

        # (url, assertion) pairs, evaluated in order.
        expectations = (
            ("http://www.example.com/some/page.html", self.assertTrue),
            ("http://sub.example.com/some/page.html", self.assertTrue),
            ("http://example.com/some/page.html", self.assertTrue),
            ("http://www.example.org/some/page.html", self.assertTrue),
            ("http://www.example.net/some/page.html", self.assertTrue),
            ("http://www.example.us/some/page.html", self.assertFalse),
        )
        for url, check in expectations:
            check(url_is_from_spider(url, MySpider))
Пример #13
0
 def handles_request(cls, request):
     """Return ``url_is_from_spider(request.url, cls)`` for the given request."""
     return url_is_from_spider(request.url, cls)
Пример #14
0
 def handles_request(cls, request):
     """
     Determine whether the URL of the given request belongs to this spider
     (allowed_domains).
     """
     return url_is_from_spider(request.url, cls)
Пример #15
0
 def find_by_request(self, request):
     """Collect the names of all registered spiders matching ``request.url``.

     Iterates ``self._spiders`` (name -> spider) and keeps each name whose
     spider satisfies ``url_is_from_spider``.  Returns a list, possibly empty.
     """
     matching_names = []
     for spider_name, candidate in self._spiders.iteritems():
         if url_is_from_spider(request.url, candidate):
             matching_names.append(spider_name)
     return matching_names
Пример #16
0
 def test_url_is_from_spider(self):
     """Check url_is_from_spider for a spider instance that only has a name."""
     spider = Spider(name='example.com')

     for url in ('http://www.example.com/some/page.html',
                 'http://sub.example.com/some/page.html'):
         self.assertTrue(url_is_from_spider(url, spider))

     for url in ('http://www.example.org/some/page.html',
                 'http://www.example.net/some/page.html'):
         self.assertFalse(url_is_from_spider(url, spider))
Пример #17
0
 def handles_request(cls, request):
     """Report whether ``request.url`` passes ``url_is_from_spider`` for ``cls``."""
     return url_is_from_spider(request.url, cls)
Пример #18
0
 def test_url_is_from_spider(self):
     """Check url_is_from_spider for a BaseSpider instance named "example.com"."""
     spider = BaseSpider(name="example.com")
     # (url, expected result) pairs, checked in this order.
     cases = (
         ("http://www.example.com/some/page.html", True),
         ("http://sub.example.com/some/page.html", True),
         ("http://www.example.org/some/page.html", False),
         ("http://www.example.net/some/page.html", False),
     )
     for url, should_match in cases:
         outcome = url_is_from_spider(url, spider)
         if should_match:
             self.assertTrue(outcome)
         else:
             self.assertFalse(outcome)