def _open_sample_and_page(name):
    """Load the sample spec *name* and build its page pair.

    Returns a 2-tuple ``(annotated_page, original_page)`` where the first
    page's body has the sample's cleaned annotations applied and the second
    is the untouched original body; both share the sample's url.
    """
    spec = _open_spec(name)
    extracts = spec['plugins']['annotations-plugin']['extracts']
    original_body = spec['original_body']
    annotated_body = apply_annotations(_clean_annotation_data(extracts),
                                       original_body)
    page_url = spec['url']
    annotated_page = HtmlPage(url=page_url, body=annotated_body)
    original_page = HtmlPage(url=page_url, body=original_body)
    return (annotated_page, original_page)
class ExtractorTest(TestCase):
    """Tests for slybot field extractors: regex extractors, type extractors
    and per-annotation extractor chains applied through SlybotIBLExtractor.
    """

    # Template/page pair 1: one annotated row carrying a 'gender' value.
    # Quotes inside the data-scrapy-annotate attribute value are HTML-escaped
    # as &quot; — a raw '"' would terminate the attribute early and truncate
    # the embedded annotation JSON.
    annotated = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    _target = u"""
<table>
<tr>
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    # Template/page pair 2: 'name' and 'gender' annotations; the target page
    # has an empty <span> so the gender regex extracts nothing.
    annotated2 = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">
<th class="item-key">Name</th>
<td >John</td></tr>
<span data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">Male</span>
</table>"""
    _target2 = u"""
<body>
<tr>
<th class="item-key">Name</th><td>Olivia</td></tr>
<span></span>
</body>"""

    # Annotation set for the per-annotation extractor test: two child
    # annotations contained in a single 'body' container. Extractor ids
    # ('1'..'8') refer to the extractors dict built inside
    # test_per_annotation_extractors.
    annotations = _clean_annotation_data([{
        'id': 'annotation',
        'selector': 'td > a',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'title',
                'required': False, 'extractors': []},
            2: {'attribute': 'content', 'field': 'name',
                'required': False, 'extractors': ['3']},
            3: {'attribute': 'href', 'field': 'url',
                'required': False, 'extractors': ['1', '2']}
        }
    }, {
        'id': 'annotation',
        'selector': 'span',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'price',
                'required': False, 'extractors': ['8', '4', '5', '6']},
            2: {'attribute': 'content', 'field': 'date',
                'required': False, 'extractors': ['4', '7']}
        }
    }, {
        'id': 'parent', 'item_container': True, 'selector': 'body'
    }])
    target3 = u"""
<html>
<body>
<tr>
<th class="item-key">Name</th>
<td>
<a href="/olivia.html">Name: Olivia</a>
</td>
</tr><span>2016-03-17 20:25</span>
</body></html>"""

    template = HtmlPage(url="http://www.test.com/", body=annotated)
    target = HtmlPage(url="http://www.test.com/", body=_target)
    template2 = HtmlPage(url="http://www.test.com/", body=annotated2)
    target2 = HtmlPage(url="http://www.test.com/a", body=_target2)
    template3 = HtmlPage(url="http://www.test.com/a",
                         body=apply_annotations(annotations, target3))
    # Rebind target3 from the raw HTML string to the page built from it.
    target3 = HtmlPage(url="http://www.test.com/a", body=target3)

    def test_regex_extractor(self):
        """A regex extractor joins all capture groups into one value."""
        extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
        extracted = extractor(
            u"The price of this product is <div>45</div> "
            u"</div class='small'>.50</div> pounds")
        self.assertEqual(extracted, u"45.50")
        processor = TextFieldTypeProcessor()
        self.assertEqual(processor.adapt(extracted, None), u"45.50")

    def test_raw_type_w_regex(self):
        """'raw' fields keep the matched markup verbatim."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'<td >Male</td>'])

    def test_negative_hit_w_regex(self):
        """A regex hit that fails the field's type check yields no item."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)

    def test_text_type_w_regex(self):
        """'text' fields hold the captured group of the regex."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_type_extractor(self):
        """Multiple extractors chain: type extractor then regex."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"type_extractor": "text"},
            2: {"regular_expression": r"Gender\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_default_type_extractor(self):
        """Fields absent from the schema fall back to the default type."""
        schema = {
            'fields': {}
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_text_type_w_regex_and_no_groups(self):
        """A regex without capture groups returns the whole match."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "Gender"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Gender'])

    def test_extractor_w_empty_string_extraction(self):
        """A regex extracting nothing must not block other required fields."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "([0-9]+)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template2, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'],
                         [u'Name Olivia'])

    def test_per_annotation_extractors(self):
        """Extractors attached to individual annotations are applied in the
        order listed in each annotation's 'extractors' list."""
        schema = {
            'fields': {
                'url': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        extractors = {
            '1': {'type_extractor': 'url'},
            '2': {'regular_expression': r'(.*)\.html'},
            '3': {'regular_expression': 'Name: (.*)'},
            '4': {'type_extractor': 'text'},
            '5': {'type_extractor': 'price'},
            '6': {'type_extractor': 'number'},
            '7': {'type_extractor': 'date'},
            '8': {'regular_expression': r'(\d+)-'}
        }
        descriptors = {'#default': create_slybot_item_descriptor(schema)}
        add_extractors_to_descriptors(descriptors, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template3, descriptors, '0.13.0')
        ])
        result = {'name': [u'Olivia'],
                  'url': [u'http://www.test.com/olivia'],
                  'title': [u'Name: Olivia'],
                  'price': [u'2016'],
                  'date': [datetime(2016, 3, 17, 20, 25)]}
        data = ibl_extractor.extract(self.target3)[0][0]
        # '_template' records which template matched; not under test here.
        del data['_template']
        self.assertEqual(data, result)
<div><p>Text {idx} Text {idx}</p><p>Text {idx} Text {idx}</p></div>
</li>
""".format
# NOTE(review): the text above is the tail of a triple-quoted `item_template`
# string whose opening (and the `item_template = ` assignment) lies in an
# earlier, unseen chunk of this file — verify against the full source.

# Ten <li> items; even indexes get an empty rank (i % 2 is falsy), odd
# indexes get their own number as rank — TODO confirm intent.
html = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                           for i in range(1, 11)))
# Three child annotations inside a repeated 'li' container, which itself
# sits inside a 'ul' container.
annotations = _clean_annotation_data([
    {'id': 'annotation1',
     'selector': 'li > div > h3 > a',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'title',
                  'required': False, 'extractors': []},
              2: {'attribute': 'href', 'field': 'url',
                  'required': False, 'extractors': ['1', '2']}}},
    {'id': 'annotation2',
     'selector': 'li > div > span',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'rank',
                  'required': True, 'extractors': []}}},
    {'id': 'annotation3',
     'selector': 'li > div:nth-child(2)',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'description',
                  'required': True, 'extractors': []}}},
    {'id': 'parent', 'item_container': True, 'selector': 'ul'},
    {'id': 'repeated_parent', 'item_container': True, 'container_id': 'parent',
     'selector': 'li', 'repeated': True}])
schemas = {
    '#default': {'name': 'default_item', 'fields': {}},
    'data': {
        'name': 'data_item',
        'fields': {
            'title': {'required': False, 'vary': False, 'type': 'text'},
            'url': {'required': False, 'vary': False, 'type': 'url'},
            # NOTE(review): chunk ends here mid-dictionary; the remaining
            # fields and closing braces are in a later, unseen chunk.
# NOTE(review): this chunk opens mid-statement — `items = json.load(f)` is the
# body of a `with open(...)` block whose header precedes this chunk; the
# reconstructed indentation below must be re-checked against the full file.
    items = json.load(f)
# Descriptor for the '#default' item built from the loaded schema.
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
# Same template parsed without descriptors, for tests of unvalidated data.
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
# Positional split of the created extractors: one uncontained annotation,
# a root container, a child container, then the remaining child annotations.
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

# Fixture: 411 listing sample — annotated/original page pair.
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_411 = HtmlPage(url=sample['url'], body=annotated)
page_411 = HtmlPage(url=sample['url'], body=sample['original_body'])

# Fixture: daft listing sample — annotated/original page pair.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_daft = HtmlPage(url=sample['url'], body=annotated)
page_daft = HtmlPage(url=sample['url'], body=sample['original_body'])
# Relax the daft annotations: mark every annotated attribute as optional.
for annotation in annotations:
    for attribute in annotation.get('data', {}).values():
        attribute['required'] = False
# NOTE(review): this chunk opens mid-method — the statements below are the
# tail of a csv-feed link-extractor test whose `def` header and the
# `specs`/`csvfeed3` fixtures precede this chunk; the reconstructed
# indentation must be re-checked against the full file.
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        # Expect exactly the two urls from the feed's link column.
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')

# Module-level fixtures for the HTML link-extractor tests below.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Build the daft sample's annotated body once at import time.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
annotations = daft_sample['plugins']['annotations-plugin']['extracts']
daft_body = apply_annotations(_clean_annotation_data(annotations),
                              daft_sample['original_body'])
daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        """The 'html' extractor yields each anchor's url and text."""
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')
# NOTE(review): this chunk appears to be an overlapping duplicate of the
# previous one and also opens mid-method — the statements below are the body
# of a 'column' csv-feed link-extractor test whose `def` header and the
# `csvfeed3` fixture precede this chunk; indentation is reconstructed and
# must be re-checked against the full file.
        specs = {"type": "column", "value": 1}
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        # Expect exactly the two urls from column 1 of the feed.
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')

# Module-level fixtures for the HTML link-extractor tests below.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Build the daft sample's annotated body once at import time.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
annotations = daft_sample['plugins']['annotations-plugin']['extracts']
daft_body = apply_annotations(_clean_annotation_data(annotations),
                              daft_sample['original_body'])
daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        """The 'html' extractor yields each anchor's url and text."""
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')