def test_extract_repeated_field(self):
    """Check repeated-item extraction against two stored fixtures.

    Part one assembles a sample from ``html_page`` and the annotations in
    ``so_annotations.json``.  Every parsed item must expose the same sorted
    key tuple, the first, 53rd and last items must equal the stored
    results, and 96 items are expected overall.

    Part two loads the spider, page and results via
    ``open_spider_page_and_results('autoevolution.json')`` and requires the
    parsed items to equal the stored results.

    ``Request`` objects yielded by ``spider.parse`` are dropped in both
    parts before comparing.
    """
    # Drop whatever matches the annotation-attribute pattern from the body.
    unannotated_body = re.sub(
        'data-scrapy-annotate=".*"', '', html_page._body)

    spec = open_spec('so_annotations.json')
    annotations = spec['annos']
    item_schemas = spec['items']
    expected = spec['results']

    sample = {
        'plugins': {'annotations-plugin': {'extracts': annotations}},
        'url': 'https://stackoverflow.com',
        'original_body': unannotated_body,
        'scrapes': 'default',
        'page_id': '507f520c3bf361f4c5cd55c44307a271bccb2218',
        'version': '0.13.0',
    }

    spider = IblSpider(
        'so', make_spider(sample=sample), item_schemas, {}, Settings())
    response = HtmlResponse(
        'http://url', body=sample['original_body'], encoding='utf-8')

    # Keep only scraped items; follow-up requests are not under test here.
    scraped = []
    for output in spider.parse(response):
        if isinstance(output, Request):
            continue
        scraped.append(output)

    expected_key_tuples = {
        (u'_index', u'_template', u'_type', u'answered', u'tags',
         u'title', 'url'),
    }
    seen_key_tuples = {tuple(sorted(item.keys())) for item in scraped}
    self.assertEqual(seen_key_tuples, expected_key_tuples)
    self.assertEqual([scraped[0], scraped[52], scraped[-1]], expected)
    self.assertEqual(len(scraped), 96)

    # Second fixture: the whole item list must match the stored results.
    spider, response, expected = open_spider_page_and_results(
        'autoevolution.json')
    scraped = [
        output for output in spider.parse(response)
        if not isinstance(output, Request)
    ]
    self.assertEqual(scraped, expected)
def test_extract_multiple_item_types(self):
    """Parse the first xceed template and compare the leading outputs.

    The first six objects produced by ``spider.parse`` must equal
    ``xceed_spider['results']``.
    """
    spider = IblSpider(
        'xceed', xceed_spider, xceed_spider['items'], {}, Settings())

    first_template = xceed_spider['templates'][0]
    response = HtmlResponse(
        'http://url',
        body=first_template['original_body'],
        encoding='utf-8',
    )

    parsed = list(spider.parse(response))
    leading_six = parsed[:6]
    self.assertEqual(leading_six, xceed_spider['results'])
def test_extract_multiple_item_types(self):
    """Parse the first xceed template and compare all non-Request outputs.

    Everything ``spider.parse`` produces is collected first; ``Request``
    instances are then discarded and the remainder must equal
    ``xceed_spider['results']``.
    """
    spider = IblSpider(
        'xceed', xceed_spider, xceed_spider['items'], {}, Settings())

    template_body = xceed_spider['templates'][0]['original_body']
    response = HtmlResponse(
        'http://url', body=template_body, encoding='utf-8')
    parsed = list(spider.parse(response))

    # Only extracted items are compared; follow-up requests are skipped.
    extracted = []
    for output in parsed:
        if isinstance(output, Request):
            continue
        extracted.append(output)

    self.assertEqual(extracted, xceed_spider['results'])
def test_extract_multiple_item_types(self):
    """Check the first six parse outputs for the xceed spider.

    A response is built from the first template's ``original_body`` and
    fed to ``spider.parse``; the first six produced objects must match
    ``xceed_spider['results']``.
    """
    item_schemas = xceed_spider['items']
    spider = IblSpider('xceed', xceed_spider, item_schemas, {}, Settings())

    page_source = xceed_spider['templates'][0]['original_body']
    page = HtmlResponse('http://url', body=page_source, encoding='utf-8')

    outputs = list(spider.parse(page))
    self.assertEqual(outputs[:6], xceed_spider['results'])
def test_extract_multiple_item_types(self):
    """Parse the first xceed template and compare items in a fixed order.

    ``Request`` outputs are discarded.  The remaining items are ordered by
    the position of their ``'_type'`` value in
    ``('ticket', 'venue', 'event')`` and must then equal
    ``xceed_spider['results']``.
    """
    type_order = ('ticket', 'venue', 'event')

    def type_rank(item):
        # Raises ValueError for a '_type' outside type_order.
        return type_order.index(item['_type'])

    spider = IblSpider(
        'xceed', xceed_spider, xceed_spider['items'], {}, Settings())

    template_body = xceed_spider['templates'][0]['original_body']
    response = HtmlResponse(
        'http://url', body=template_body, encoding='utf-8')
    parsed = list(spider.parse(response))

    extracted = [
        output for output in parsed if not isinstance(output, Request)
    ]
    # Stable in-place sort: items sharing a type keep their parse order.
    extracted.sort(key=type_rank)

    self.assertEqual(extracted, xceed_spider['results'])