def __init__(self, template_descriptor_pairs, trace=False,
             apply_extrarequired=True):
    self.token_dict = TokenDict()
    parsed_templates = []
    template_versions = []
    for template, descriptors, version in template_descriptor_pairs:
        parsed = parse_template(self.token_dict, template, descriptors)
        parsed_templates.append(parsed)
        template_versions.append(version)
        if _annotation_count(parsed):
            parse_extraction_page(self.token_dict, template)
    for parsed in parsed_templates:
        default_schema = getattr(parsed, '_default_schema', None)
        descriptor = parsed.descriptors.get(default_schema)
        if descriptor is not None and apply_extrarequired:
            descriptor = descriptor.copy()
            parsed.descriptors[default_schema] = descriptor
            parsed.descriptors['#default'] = descriptor
    # Templates with more attributes are considered first. Sort the
    # (template, version) pairs together so each version stays aligned
    # with its template when the trees are built below.
    sorted_pairs = sorted(
        zip(parsed_templates, template_versions),
        key=lambda pair: _annotation_count(pair[0]), reverse=True
    )
    self.extraction_trees = [
        self.build_extraction_tree(parsed, None, trace,
                                   legacy=version < '0.13.0')
        for parsed, version in sorted_pairs
    ]
def __init__(self, template_descriptor_pairs, trace=False,
             apply_extrarequired=True):
    self.token_dict = TokenDict()
    parsed_templates = []
    template_versions = []
    for template, descriptors, version in template_descriptor_pairs:
        parsed = parse_template(self.token_dict, template, descriptors)
        parsed_templates.append(parsed)
        template_versions.append(version)
        if _annotation_count(parsed):
            parse_extraction_page(self.token_dict, template)
    for parsed in parsed_templates:
        default_schema = getattr(parsed, '_default_schema', None)
        descriptor = parsed.descriptors.get(default_schema)
        if descriptor is not None and apply_extrarequired:
            descriptor = descriptor.copy()
            parsed.descriptors[default_schema] = descriptor
            parsed.descriptors['#default'] = descriptor
    # templates with more attributes are considered first
    parsed_templates = sorted(
        parsed_templates, key=_annotation_count, reverse=True
    )
    self.extraction_trees = [
        self.build_extraction_tree(parsed, None, trace)
        for parsed in parsed_templates
    ]
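# A minimal usage sketch for the constructor above. It assumes these methods
# hang off a class in the style of slybot's SlybotIBLExtractor; the names
# `template_page`, `old_template_page` and `descriptors` are hypothetical
# placeholders for an annotated HtmlPage sample and its item descriptors
# built elsewhere.
pairs = [
    (template_page, descriptors, '0.13.1'),      # current-format sample
    (old_template_page, descriptors, '0.12.2'),  # < 0.13.0: legacy tree
]
extractor = SlybotIBLExtractor(pairs)
# Each pair contributes one extraction tree; the sample with the most
# annotations is matched against a page first.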
def extract(self, html, pref_template_id=None):
    """Extract data from an html page.

    If pref_template_id is specified, the template with that id will be
    tried first.
    """
    extraction_page = parse_extraction_page(self.token_dict, html)
    extraction_trees = self.extraction_trees
    if pref_template_id is not None:
        # A False key (id matches) sorts before True, so the preferred
        # template's tree moves to the front of the list.
        extraction_trees = sorted(
            self.extraction_trees,
            key=lambda x: x.template.id != pref_template_id)
    for extraction_tree in extraction_trees:
        template_id = extraction_tree.template.id
        extracted = extraction_tree.extract(extraction_page)
        correctly_extracted = []
        for item in extracted:
            if u'_type' in item or not hasattr(self, 'validated'):
                correctly_extracted.append(item)
            else:
                validated = self.validated[template_id]([item])
                if validated:
                    correctly_extracted.append(validated)
        if correctly_extracted:
            return correctly_extracted, extraction_tree.template
    return None, None
def extract(self, html, pref_template_id=None):
    """Extract data from an html page.

    If pref_template_id is specified, the template with that id will be
    tried first.
    """
    extraction_page = parse_extraction_page(self.token_dict, html)
    extraction_trees = self.extraction_trees
    if pref_template_id is not None:
        extraction_trees = sorted(
            self.extraction_trees,
            key=lambda x: x.template.id != pref_template_id)
    for extraction_tree in extraction_trees:
        template_id = extraction_tree.template.id
        extracted = extraction_tree.extract(extraction_page)
        correctly_extracted = []
        for item in extracted:
            if (isinstance(item, ItemProcessor) or
                    not hasattr(self, 'validated')):
                correctly_extracted.append(item)
            else:
                validated = self.validated[template_id]([item])
                if validated:
                    correctly_extracted.append(validated)
        if correctly_extracted:
            return correctly_extracted, extraction_tree.template
    return None, None
def extract(self, html, pref_template_id=None):
    """Extract data from an html page.

    If pref_template_id is specified, the template with that id will be
    tried first.
    """
    extraction_page = parse_extraction_page(self.token_dict, html)
    extraction_trees = self.extraction_trees
    if pref_template_id is not None:
        extraction_trees = sorted(
            self.extraction_trees,
            key=lambda x: x.template.id != pref_template_id)
    sel = Selector(text=html.body)
    for extraction_tree in extraction_trees:
        template_id = extraction_tree.template.id
        extracted = extraction_tree.extract(extraction_page)
        correctly_extracted = []
        for item in extracted:
            if (isinstance(item, ItemProcessor) or
                    not hasattr(self, 'validated')):
                # ItemProcessor items post-process themselves against a
                # Selector built from the raw page body.
                if hasattr(item, 'process'):
                    item = item.process(sel)
            else:
                item = self.validated[template_id]([item])
            if item:
                correctly_extracted.append(item)
        if correctly_extracted:
            return correctly_extracted, extraction_tree.template
    return None, None
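# A minimal sketch of calling extract(), reusing the hypothetical
# `extractor` from the earlier sketch plus an assumed `target_page`
# HtmlPage to scrape.
items, matched_template = extractor.extract(target_page)
if items is not None:
    # pref_template_id only reorders the trees, so every template is
    # still tried if the preferred one fails to match.
    items, _ = extractor.extract(target_page,
                                 pref_template_id=matched_template.id)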
# Test fixtures. apply_annotations, base_page, item_template, open_spec,
# open_sample_and_page, annotations, html, schemas and PATH are helpers
# and data defined elsewhere in the test module.
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})

td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]
sample_411, page_411 = open_sample_and_page('411_list.json')
xceed_spider = open_spec('xceed.json')
import json
from os.path import dirname

from slybot.extractors import add_extractors_to_descriptors
from slybot.item import create_slybot_item_descriptor
from slybot.plugins.scrapely_annotations.builder import (
    apply_annotations, _clean_annotation_data
)
from scrapely.extraction.pageobjects import TokenDict
from scrapely.extraction.pageparsing import (
    parse_extraction_page, parse_template
)
from scrapely.extraction.regionextract import BasicTypeExtractor
from scrapely.htmlpage import HtmlPage, HtmlTagType

_PATH = dirname(__file__)

td = TokenDict()
with open('%s/data/templates/stack_overflow.html' % _PATH) as f:
    html_page = HtmlPage(body=f.read().decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
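# A hedged sketch of exercising the fixtures above end to end with
# scrapely's stock InstanceBasedLearningExtractor, which takes two-tuple
# (template, descriptor) pairs rather than the three-tuples used by the
# slybot subclass earlier in this section. Passing None as the descriptor
# skips item validation.
from scrapely.extraction import InstanceBasedLearningExtractor

ibl = InstanceBasedLearningExtractor([(html_page, None)])
data, matched = ibl.extract(html_page)  # a template should match itself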