Example #1
0
File: depta.py  Project: BuGoNee/pydepta
    def build_extraction_tree(self, template, type_descriptor, trace=True):
        """Build the tree of region extractors for *template*.

        The pipeline starts from the basic per-annotation extractors,
        optionally wraps each stage with tracing, and finishes by
        packaging everything into a page-level extractor.

        :param template: the annotated template page.
        :param type_descriptor: optional descriptor supplying an
            ``attribute_map``; ``None``/falsy means no mapping.
        :param trace: when True, wrap every stage in a TraceExtractor.
        :return: a TemplatePageExtractor over the built extractors.
        """
        # A falsy descriptor (not just None) yields no attribute map.
        attribute_map = None
        if type_descriptor:
            attribute_map = type_descriptor.attribute_map

        extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
        if trace:
            extractors = TraceExtractor.apply(template, extractors)

        # Run each region-extractor stage, tracing after each when requested.
        for extractor_cls in (DeptaExtractor,):
            extractors = extractor_cls.apply(template, extractors)
            if trace:
                extractors = TraceExtractor.apply(template, extractors)

        return TemplatePageExtractor(template, extractors)
Example #2
0
    def build_extraction_tree(self, template, type_descriptor, trace=True):
        """Build a tree of region extractors corresponding to the
        template.

        :param template: the annotated template page.
        :param type_descriptor: optional descriptor providing an
            ``attribute_map``; a falsy value means no mapping.
        :param trace: when True, every stage is wrapped in tracing.
        :return: a TemplatePageExtractor over the assembled extractors.
        """
        def _traced(extractor_list):
            # Wrap the current extractors with tracing when requested.
            if trace:
                return TraceExtractor.apply(template, extractor_list)
            return extractor_list

        attribute_map = type_descriptor.attribute_map if type_descriptor else None
        extractors = _traced(
            BasicTypeExtractor.create(template.annotations, attribute_map))

        # Apply each region-extraction stage, tracing after each pass.
        for extractor_cls in (DeptaExtractor, ):
            extractors = _traced(extractor_cls.apply(template, extractors))

        return TemplatePageExtractor(template, extractors)
# Build slybot item descriptors for every schema and attach extractors.
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})


# Parse the stack_overflow sample page into a tokenized extraction page.
td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
# NOTE(review): PATH is not defined in this chunk (a later section uses
# _PATH) — confirm where PATH comes from.
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
# Two template parses: one validated against the descriptors, one not.
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
# Fixture extractors: first is uncontained, then the container hierarchy,
# with the remainder being annotations nested inside the child container.
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

# Additional sample fixtures used elsewhere in the tests.
sample_411, page_411 = open_sample_and_page('411_list.json')
xceed_spider = open_spec('xceed.json')


def _annotation_tag_to_dict(tag):
    return {attr: getattr(tag, attr, object())
            for attr in ['annotation_text', 'end_index', 'metadata',
                         'start_index', 'surrounds_attribute',
                         'tag_attributes', 'variant_id']}
from scrapely.extraction.pageparsing import parse_extraction_page
from scrapely.htmlpage import HtmlTagType

# Directory containing this test module; used to locate data fixtures.
_PATH = dirname(__file__)
# Parse the stack_overflow sample page into a tokenized extraction page.
td = TokenDict()
with open('%s/data/templates/stack_overflow.html' % _PATH) as f:
    html_page = HtmlPage(body=f.read().decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
# Two template parses: one validated against the descriptors, one not.
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
# Fixture extractors: first is uncontained, then the container hierarchy,
# with the remainder being annotations nested inside the child container.
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

# Build the 411_list fixture: an annotated page and its original page.
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_411 = HtmlPage(url=sample['url'], body=annotated)
page_411 = HtmlPage(url=sample['url'],
                    body=sample['original_body'])
# daft_list sample loaded next; processing continues past this chunk.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    sample = json.load(f)