def test_robotstxt_error(self):
    # a DNS error while fetching robots.txt must be logged via _logerror
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)
    err = error.DNSLookupError('Robotstxt address not found')

    def return_failure(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.errback, failure.Failure(err))
        return deferred
    self.crawler.engine.download.side_effect = return_failure

    middleware = RobotsTxtMiddleware(self.crawler)
    middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
    deferred = middleware.process_request(Request('http://site.local'), None)
    deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
    return deferred
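# assertIgnored / assertNotIgnored, used throughout the tests below, are not
# part of this excerpt. A minimal sketch of what they are assumed to do:
# process_request resolves to None for allowed requests and fails with
# IgnoreRequest for disallowed ones (maybeDeferred is from
# twisted.internet.defer, IgnoreRequest from scrapy.exceptions).
def assertNotIgnored(self, request, middleware):
    # an allowed request passes through the middleware untouched
    spider = None  # the middleware does not use the spider argument here
    d = maybeDeferred(middleware.process_request, request, spider)
    d.addCallback(self.assertIsNone)
    return d

def assertIgnored(self, request, middleware):
    # a disallowed request must errback with IgnoreRequest
    spider = None
    return self.assertFailure(
        maybeDeferred(middleware.process_request, request, spider),
        IgnoreRequest)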
def test_robotstxt_empty_response(self):
    # empty response should equal 'allow all'
    middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
    return DeferredList([
        self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
        self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
        self.assertNotIgnored(Request('http://site.local/static/'), middleware),
    ], fireOnOneErrback=True)
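# _get_emptybody_crawler is not shown in this excerpt; a plausible sketch,
# assuming it mirrors the other crawler factories by mocking the engine's
# download of robots.txt to return a response with an empty body
# (TextResponse is from scrapy.http):
def _get_emptybody_crawler(self):
    crawler = self.crawler
    crawler.settings.set('ROBOTSTXT_OBEY', True)
    response = TextResponse('http://site.local/robots.txt', body=b'')

    def return_response(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.callback, response)
        return deferred
    crawler.engine.download.side_effect = return_response
    return crawler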
def test_robotstxt_immediate_error(self):
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)
    err = error.DNSLookupError('Robotstxt address not found')

    def immediate_failure(request, spider):
        # errback synchronously, before the middleware gets a chance to
        # attach its own errback to the download deferred
        deferred = Deferred()
        deferred.errback(failure.Failure(err))
        return deferred
    self.crawler.engine.download.side_effect = immediate_failure

    middleware = RobotsTxtMiddleware(self.crawler)
    return self.assertNotIgnored(Request('http://site.local'), middleware)
def test_robotstxt_meta(self):
    middleware = RobotsTxtMiddleware(self._get_successful_crawler())
    meta = {'dont_obey_robotstxt': True}
    return DeferredList([
        self.assertNotIgnored(
            Request('http://site.local/allowed', meta=meta), middleware),
        self.assertNotIgnored(
            Request('http://site.local/admin/main', meta=meta), middleware),
        self.assertNotIgnored(
            Request('http://site.local/static/', meta=meta), middleware),
    ], fireOnOneErrback=True)
def test_robotstxt_garbage(self):
    # garbage response should be discarded, equal 'allow all'
    middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
    return DeferredList([
        self.assertNotIgnored(Request('http://site.local'), middleware),
        self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
        self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
        self.assertNotIgnored(Request('http://site.local/static/'), middleware),
    ], fireOnOneErrback=True)
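# _get_garbage_crawler is also outside this excerpt; a sketch under the same
# assumptions, serving an undecodable binary body instead of a robots.txt
# (the exact garbage bytes are an assumption; Response is from scrapy.http):
def _get_garbage_crawler(self):
    crawler = self.crawler
    crawler.settings.set('ROBOTSTXT_OBEY', True)
    response = Response('http://site.local/robots.txt',
                        body=b'GIF89a\xd3\x00\xfe\x00\xa2')

    def return_response(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.callback, response)
        return deferred
    crawler.engine.download.side_effect = return_response
    return crawler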
def test_ignore_robotstxt_request(self):
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)

    def ignore_request(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
        return deferred
    self.crawler.engine.download.side_effect = ignore_request

    middleware = RobotsTxtMiddleware(self.crawler)
    mw_module_logger.error = mock.MagicMock()

    d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
    # an IgnoreRequest raised while fetching robots.txt is not an error
    # and must not be logged
    d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
    return d
def test_robotstxt(self):
    middleware = RobotsTxtMiddleware(self._get_successful_crawler())
    return DeferredList([
        self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
        self.assertIgnored(Request('http://site.local/admin/main'), middleware),
        self.assertIgnored(Request('http://site.local/static/'), middleware),
        self.assertIgnored(
            Request('http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:'), middleware),
        self.assertIgnored(Request('http://site.local/wiki/Käyttäjä:'), middleware),
    ], fireOnOneErrback=True)
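# _get_successful_crawler is not shown either; a sketch consistent with the
# paths asserted above (to_bytes is assumed to come from scrapy.utils.python):
def _get_successful_crawler(self):
    crawler = self.crawler
    crawler.settings.set('ROBOTSTXT_OBEY', True)
    ROBOTS = to_bytes("""
User-Agent: *
Disallow: /admin/
Disallow: /static/
Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
Disallow: /wiki/Käyttäjä:
""")
    response = TextResponse('http://site.local/robots.txt', body=ROBOTS)

    def return_response(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.callback, response)
        return deferred
    crawler.engine.download.side_effect = return_response
    return crawler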
def _get_middleware(self):
    crawler = self._get_crawler()
    return RobotsTxtMiddleware(crawler)
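# _get_crawler is likewise assumed rather than shown; a minimal sketch that
# builds the mocked crawler the other tests rely on (Settings is from
# scrapy.settings):
def _get_crawler(self):
    crawler = mock.MagicMock()
    crawler.settings = Settings()
    crawler.settings.set('ROBOTSTXT_OBEY', True)
    crawler.engine.download = mock.MagicMock()
    return crawler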
def test_robotstxt_ready_parser(self):
    middleware = RobotsTxtMiddleware(self._get_successful_crawler())
    d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
    # the second check runs once the parser is already cached, so it must
    # take the synchronous path
    d.addCallback(lambda _: self.assertNotIgnored(
        Request('http://site.local/allowed'), middleware))
    return d