def build_extraction_tree(self, template, type_descriptor, trace=True):
    """Build a tree of region extractors corresponding to the template.

    Starts from per-annotation basic extractors, then runs each
    higher-level extractor class over the result.  When *trace* is true a
    TraceExtractor pass is interleaved after every stage so extraction can
    be debugged.
    """
    attrs = type_descriptor.attribute_map if type_descriptor else None
    tree = BasicTypeExtractor.create(template.annotations, attrs)
    if trace:
        tree = TraceExtractor.apply(template, tree)
    for extractor_cls in (DeptaExtractor,):
        tree = extractor_cls.apply(template, tree)
        if trace:
            tree = TraceExtractor.apply(template, tree)
    return TemplatePageExtractor(template, tree)
def build_extraction_tree(self, template, type_descriptor, trace=True):
    """Build a tree of region extractors corresponding to the template """
    # No descriptor means no attribute typing information is available.
    attribute_map = type_descriptor.attribute_map if type_descriptor else None
    extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
    if trace:
        extractors = TraceExtractor.apply(template, extractors)
    # Run each higher-level extractor stage over the accumulated extractors;
    # with trace enabled, a TraceExtractor pass follows every stage.
    for cls in (DeptaExtractor, ):
        extractors = cls.apply(template, extractors)
        if trace:
            extractors = TraceExtractor.apply(template, extractors)
    return TemplatePageExtractor(template, extractors)
# Module-level test fixtures: parse the stack_overflow sample page and
# build templates/extractors that the test cases below share.
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})
td = TokenDict()
# open_spec is a project helper; presumably it returns the raw bytes of a
# file under the test data dir — TODO confirm against its definition.
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
# Same page parsed without descriptors, to exercise the unvalidated path.
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
# The sample template is laid out so the first extractor is the free
# annotation, followed by the container hierarchy, then its children.
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]
sample_411, page_411 = open_sample_and_page('411_list.json')
xceed_spider = open_spec('xceed.json')


def _annotation_tag_to_dict(tag):
    """Snapshot the interesting attributes of an annotation tag as a dict.

    Missing attributes default to a fresh ``object()`` sentinel, so two
    tags that both lack an attribute still compare unequal.
    """
    return {attr: getattr(tag, attr, object())
            for attr in ['annotation_text', 'end_index', 'metadata',
                         'start_index', 'surrounds_attribute',
                         'tag_attributes', 'variant_id']}
from scrapely.extraction.pageparsing import parse_extraction_page
from scrapely.htmlpage import HtmlTagType

_PATH = dirname(__file__)

# Module-level test fixtures: load sample pages/templates once and share
# them across the test cases in this module.
td = TokenDict()
with open('%s/data/templates/stack_overflow.html' % _PATH) as f:
    html_page = HtmlPage(body=f.read().decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
# Same page parsed without descriptors, to exercise the unvalidated path.
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
# The sample template is laid out so the first extractor is the free
# annotation, followed by the container hierarchy, then its children.
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]
# Build an annotated and an un-annotated HtmlPage pair from a saved sample.
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_411 = HtmlPage(url=sample['url'], body=annotated)
page_411 = HtmlPage(url=sample['url'], body=sample['original_body'])
# NOTE(review): `sample` is rebound here; its use continues past the
# visible portion of this file.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    sample = json.load(f)