def test_extraction(self):
    # Default arguments
    lx = RegexLinkExtractor()
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
    ])
def test_extraction(self):
    # Default arguments
    lx = RegexLinkExtractor()
    # Note that RegexLinkExtractor returns links in arbitrary order,
    # so we need to sort them for comparison
    self.assertEqual(sorted(lx.extract_links(self.response), key=lambda x: x.url), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        Link(url='http://www.google.com/something', text=u''),
    ])
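Both versions of the test rely on a self.response fixture containing the three links being asserted. A minimal sketch of such a fixture, with a hypothetical HTML body (the real test HTML lives in the test suite and evidently differs slightly between the two versions, note the sample 3 link text):

from scrapy.http import HtmlResponse

def setUp(self):
    # Hypothetical fixture body; only the three anchors matter for the assertions above.
    body = b"""
    <html><body>
      <a href="/sample2.html">sample 2</a>
      <a href="/sample3.html">sample 3 text</a>
      <a href="http://www.google.com/something"></a>
    </body></html>
    """
    self.response = HtmlResponse(url='http://example.com/index.html',
                                 body=body, encoding='utf-8')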
class BaseSpider(CrawlSpider):
    name = "Base"
    site__id = None
    extractors = None
    allowed_domains = []
    start_urls = []
    session = None

    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        for extractor in self.extractors:
            values = {
                'URL_PROD': response.url,
            }
            extract = {}
            for e in extractor(response):
                extract.update(e)
            # TODO: check relevance if overwriting
            for k, v in extract.iteritems():
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                yield ScraperItem(name=name, site=self.site__id, values=values.iteritems())
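BaseSpider.extractors is never populated in this snippet; the parse_item() loop implies each entry is a callable that takes a response and yields mappings from field names to lists of candidate values (hence the v[0] indexing). A hypothetical extractor under that assumption, with illustrative XPaths and the NAME_PROD/PRICE_PROD field names as placeholders:

def product_extractor(response):
    # Hypothetical field extractor: yields dicts of field name -> list of values,
    # matching how parse_item() merges them and takes the first value per key.
    yield {'NAME_PROD': response.xpath('//h1/text()').extract()}
    yield {'PRICE_PROD': response.xpath('//span[@class="price"]/text()').extract()}

A concrete spider would then subclass BaseSpider and set extractors = [product_extractor] along with a real site__id.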
class MySpider(CrawlSpider):
    name = 'example'

    rules = [
        Rule(FallbackLinkExtractor([
            LxmlLinkExtractor(),
            SgmlLinkExtractor(),
            RegexLinkExtractor(),
        ]), callback='parse_page', follow=True)
    ]

    def parse_page(self, response):
        pass

    parse_start_url = parse_page
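FallbackLinkExtractor itself is not defined in these snippets; the idea is a wrapper that walks its child extractors in order and returns the links from the first one that yields anything. A minimal sketch under that assumption (the error handling and the exact fallback rule are guesses, not the real implementation):

class FallbackLinkExtractor(object):
    def __init__(self, extractors):
        self.extractors = extractors

    def extract_links(self, response):
        # Try each wrapped extractor in turn; fall back to the next one
        # if the current one raises or finds no links.
        for extractor in self.extractors:
            try:
                links = extractor.extract_links(response)
            except Exception:
                continue
            if links:
                return links
        return []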
class MySpider(CrawlSpider):
    name = 'example'
    start_urls = ['http://scrapinghub.com']
    callback_calls = 0

    rules = [Rule(FallbackLinkExtractor([
        LxmlLinkExtractor(),
        RegexLinkExtractor(),
    ]), callback='parse_page', follow=True)]

    def parse_page(self, response):
        self.callback_calls += 1

    def parse_nothing(self, response):
        pass

    parse_start_url = parse_nothing
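callback_calls is only useful if something inspects it after the crawl; a hedged sketch of how that might be checked with CrawlerProcess (this actually fetches the start URL, so it is a live-crawl check rather than a unit test):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'LOG_ENABLED': False})
crawler = process.create_crawler(MySpider)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes
assert crawler.spider.callback_calls > 0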
class MySpider(CrawlSpider):
    name = 'recorder'
    start_urls = [
        'http://' + DOMAIN,
    ]
    allowed_domains = [DOMAIN]

    rules = [
        Rule(FallbackLinkExtractor([
            LxmlLinkExtractor(allow=ALLOWED_RE),
            SgmlLinkExtractor(allow=ALLOWED_RE),
            RegexLinkExtractor(allow=ALLOWED_RE),
        ]), callback='parse_page', follow=True)
    ]

    def parse_page(self, response):
        pass
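DOMAIN and ALLOWED_RE are assumed to be defined elsewhere in the recording script; hypothetical placeholder values might look like this:

# Hypothetical module-level constants for the 'recorder' spider above;
# the real values depend on the site being recorded.
DOMAIN = 'example.com'
ALLOWED_RE = r'/products/.+'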