def test_request_extractor(self):
    """Check the requests produced by a single SgmlRequestExtractor.

    Builds a RequestGenerator with one extractor and an empty processor
    list, runs it over ``self.response`` and compares the outcome with
    ``self.requests`` via ``self._equal_requests_list``.
    """
    extractors = [SgmlRequestExtractor()]

    # extract all requests (the processor list is empty)
    reqgen = RequestGenerator(extractors, [], callback=self.deferred)
    requests = reqgen.generate_requests(self.response)

    # assertTrue is used instead of the deprecated failUnless alias,
    # which was removed from unittest in Python 3.12.
    self.assertTrue(self._equal_requests_list(requests, self.requests))
def test_request_extractor(self):
    """Verify request extraction with only an SgmlRequestExtractor.

    A RequestGenerator is created with a single extractor and an empty
    processor list; the requests it generates from ``self.response`` are
    checked against ``self.requests`` using ``self._equal_requests_list``.
    """
    extractors = [SgmlRequestExtractor()]

    # extract all requests (the processor list is empty)
    generator = RequestGenerator(extractors, [], callback=self.deferred)
    generated = generator.generate_requests(self.response)

    # failUnless is a deprecated alias (removed in Python 3.12);
    # assertTrue is the supported spelling of the same check.
    self.assertTrue(self._equal_requests_list(generated, self.requests))
def test_request_processor(self):
    """Check how processor chains change the generated requests.

    Four chains are run against ``self.response``, all sharing the same
    extractor list:

    * ``Canonicalize`` + ``FilterDupes``: result must match
      ``self.requests``.
    * plus ``FilterDomain(deny='example.org')``: result must be empty.
    * plus ``FilterUrl(deny=(r'about', r'othercat'))``: the item 12 page
      and the site root are expected.
    * plus ``FilterUrl(allow=r'/somepage/')``: only the item 12 page is
      expected.

    The deprecated ``failUnless``/``failUnlessEqual`` aliases (removed
    in Python 3.12) are replaced by ``assertTrue``/``assertEqual``.
    """
    extractors = [SgmlRequestExtractor()]

    def generate(processors):
        # Every scenario builds a new generator over the same extractors
        # and the same response; only the processor chain differs.
        reqgen = RequestGenerator(extractors, processors,
                                  callback=self.deferred)
        return reqgen.generate_requests(self.response)

    processors = [
        Canonicalize(),
        FilterDupes(),
    ]
    requests = generate(processors)
    self.assertTrue(self._equal_requests_list(requests, self.requests))

    # filter domain
    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterDomain(deny='example.org'),
    ]
    requests = generate(processors)
    self.assertEqual(list(requests), [])

    # filter url
    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterUrl(deny=(r'about', r'othercat')),
    ]
    requests = generate(processors)
    self.assertTrue(self._equal_requests_list(requests, [
        Request('http://example.org/somepage/item/12.html',
                meta={'link_text': 'Item 12'}),
        Request('http://example.org/',
                meta={'link_text': ''}),
    ]))

    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterUrl(allow=r'/somepage/'),
    ]
    requests = generate(processors)
    self.assertTrue(self._equal_requests_list(requests, [
        Request('http://example.org/somepage/item/12.html',
                meta={'link_text': 'Item 12'}),
    ]))
def test_basic(self):
    """Check that a generator without extractors yields no requests.

    A RequestGenerator built with empty extractor and processor lists is
    run over ``self.response``; the materialized result must be empty.
    """
    reqgen = RequestGenerator([], [], callback=self.deferred)

    # returns generator
    requests = reqgen.generate_requests(self.response)

    # assertEqual replaces the deprecated failUnlessEqual alias, which
    # was removed from unittest in Python 3.12.
    self.assertEqual(list(requests), [])
def test_request_processor(self):
    """Exercise request processors applied after extraction.

    Each scenario runs a new RequestGenerator over ``self.response``
    with the same extractor list and a different processor chain:

    * ``Canonicalize`` + ``FilterDupes`` must reproduce
      ``self.requests``.
    * adding ``FilterDomain(deny='example.org')`` must leave nothing.
    * adding ``FilterUrl(deny=(r'about', r'othercat'))`` must leave the
      item 12 page and the site root.
    * adding ``FilterUrl(allow=r'/somepage/')`` must leave only the
      item 12 page.

    ``assertTrue``/``assertEqual`` are used in place of the deprecated
    ``failUnless``/``failUnlessEqual`` aliases removed in Python 3.12.
    """
    extractors = [SgmlRequestExtractor()]

    def run_chain(chain):
        # Only the processor chain varies between scenarios; extractors,
        # callback and response stay the same.
        generator = RequestGenerator(extractors, chain,
                                     callback=self.deferred)
        return generator.generate_requests(self.response)

    item_12 = 'http://example.org/somepage/item/12.html'

    # baseline: canonicalize and drop duplicates only
    processors = [
        Canonicalize(),
        FilterDupes(),
    ]
    requests = run_chain(processors)
    self.assertTrue(self._equal_requests_list(requests, self.requests))

    # filter domain
    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterDomain(deny='example.org'),
    ]
    requests = run_chain(processors)
    self.assertEqual(list(requests), [])

    # filter url (deny patterns)
    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterUrl(deny=(r'about', r'othercat')),
    ]
    requests = run_chain(processors)
    expected = [
        Request(item_12, meta={'link_text': 'Item 12'}),
        Request('http://example.org/', meta={'link_text': ''}),
    ]
    self.assertTrue(self._equal_requests_list(requests, expected))

    # filter url (allow pattern)
    processors = [
        Canonicalize(),
        FilterDupes(),
        FilterUrl(allow=r'/somepage/'),
    ]
    requests = run_chain(processors)
    expected = [
        Request(item_12, meta={'link_text': 'Item 12'}),
    ]
    self.assertTrue(self._equal_requests_list(requests, expected))
def test_basic(self):
    """Verify that no extractors and no processors produce no requests.

    ``generate_requests`` is called on ``self.response`` with empty
    extractor and processor lists, and its result is converted to a list
    that must be empty.
    """
    generator = RequestGenerator([], [], callback=self.deferred)

    # returns generator
    generated = generator.generate_requests(self.response)

    # failUnlessEqual is a deprecated alias (removed in Python 3.12);
    # assertEqual performs the same comparison.
    self.assertEqual(list(generated), [])