Exemplo n.º 1
0
def load_annotations(body):
    """Build the slybot ``annotations-plugin`` structure from annotated html.

    Two kinds of entries are collected, in this order:

    * one per element carrying a ``data-scrapy-annotate`` attribute whose
      (url-unquoted) value parses as JSON;
    * one per element carrying any attribute listed in ``IGNORE_ATTRIBUTES``.

    Every entry ends up with an ``id``; missing ones are generated with
    ``gen_id`` so that they do not clash with ids already seen.

    Returns a dict of the form
    ``{'annotations-plugin': {'extracts': [...]}}``; the list is empty when
    ``body`` is falsy.
    """
    extracts = []
    result = {'annotations-plugin': {'extracts': extracts}}
    if not body:
        return result

    selector = Selector(text=add_tagids(body))
    seen_ids = set()

    # Pass 1: explicit annotations serialised as JSON in the attribute.
    for node in selector.xpath('//*[@data-scrapy-annotate]'):
        attrs = node.root.attrib
        try:
            # Malformed JSON is silently skipped.
            parsed = json.loads(unquote(attrs['data-scrapy-annotate']))
        except ValueError:
            continue
        is_ins = (isinstance(node.root, _Element) and
                  node.root.tag.lower() == 'ins')
        if is_ins:
            # <ins> elements hold generated annotations.
            parsed.update(find_generated_annotation(node))
        else:
            parsed['tagid'] = attrs.get('data-tagid')
        if 'id' not in parsed:
            parsed['id'] = gen_id(disallow=seen_ids)
        seen_ids.add(parsed['id'])
        extracts.append(parsed)

    # Pass 2: elements flagged with one of the ignore attributes.
    ignore_query = '//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)
    for node in selector.xpath(ignore_query):
        attrs = node.root.attrib
        # Pick the first ignore attribute present on this element.
        for ignore_attr in IGNORE_ATTRIBUTES:
            if ignore_attr in attrs:
                break
        # The entry key is the attribute name without its 'data-scrapy-'
        # prefix.
        entry = {ignore_attr[len('data-scrapy-'):]: True}
        if 'id' not in entry:
            entry['id'] = gen_id(disallow=seen_ids)
        seen_ids.add(entry['id'])
        extracts.append(entry)
    return result
Exemplo n.º 2
0
def load_annotations(body):
    """Extract slybot annotations embedded in annotated html.

    Elements with a ``data-scrapy-annotate`` attribute contribute the JSON
    object stored (url-quoted) in that attribute; elements with one of the
    ``IGNORE_ATTRIBUTES`` contribute an ignore entry. Entries without an
    ``id`` receive one from ``gen_id``.

    Returns ``{'annotations-plugin': {'extracts': <list of entries>}}``.
    A falsy ``body`` yields an empty list of entries.
    """
    collected = []
    if body:
        document = Selector(text=add_tagids(body))
        used_ids = set()

        for element in document.xpath('//*[@data-scrapy-annotate]'):
            node_attrs = element.root.attrib
            try:
                # Entries whose JSON payload cannot be decoded are dropped.
                data = json.loads(
                    unquote(node_attrs['data-scrapy-annotate']))
            except ValueError:
                continue
            root = element.root
            if isinstance(root, _Element) and root.tag.lower() == 'ins':
                # Generated annotations live in <ins> elements.
                data.update(find_generated_annotation(element))
            else:
                data['tagid'] = node_attrs.get('data-tagid')
            if 'id' not in data:
                data['id'] = gen_id(disallow=used_ids)
            used_ids.add(data['id'])
            collected.append(data)

        for element in document.xpath(
                '//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
            node_attrs = element.root.attrib
            # Stop at the first ignore attribute found on the element.
            for name in IGNORE_ATTRIBUTES:
                if name in node_attrs:
                    break
            # Drop the 'data-scrapy-' prefix to obtain the entry key.
            ignored = {name[len('data-scrapy-'):]: True}
            if 'id' not in ignored:
                ignored['id'] = gen_id(disallow=used_ids)
            used_ids.add(ignored['id'])
            collected.append(ignored)
    return {'annotations-plugin': {'extracts': collected}}
Exemplo n.º 3
0
def html4annotation(htmlpage, baseurl=None, proxy_resources=None):
    """Prepare an html document for display in the annotation UI.

    The page is first passed through ``add_tagids`` and the result is handed
    to ``descriptify`` together with ``baseurl`` and ``proxy_resources``
    (forwarded as ``proxy``). The value returned by ``descriptify`` is
    returned unchanged.
    """
    return descriptify(add_tagids(htmlpage), baseurl, proxy=proxy_resources)
Exemplo n.º 4
0
def html4annotation(htmlpage, baseurl=None, proxy_resources=None):
    """Convert ``htmlpage`` into the form used by the annotation UI.

    Tag ids are added with ``add_tagids``; the tagged page is then cleaned
    by ``descriptify``, which also receives ``baseurl`` and, as ``proxy``,
    ``proxy_resources``.
    """
    tagged_page = add_tagids(htmlpage)
    return descriptify(tagged_page, baseurl, proxy=proxy_resources)
Exemplo n.º 5
0
def apply_annotations(annotations, target_page, legacy=False):
    """Write ``annotations`` into ``target_page`` and return the new html.

    The page is numbered with ``add_tagids``, parsed with ``parse_html`` and
    streamed element by element into an output buffer. Annotations are
    grouped by their ``'tagid'`` and visited in ascending numeric order; for
    each group the stream is advanced to the matching tag, annotation
    attributes are set on it and any generated annotations are rendered via
    ``_get_generated_annotation``.

    :param annotations: annotations to apply; split by
        ``_filter_annotations`` into selector based and tagid based ones.
    :param target_page: html source of the page to annotate.
    :param legacy: forwarded unchanged to ``_gen_annotation_info`` and
        ``_get_generated_annotation``.
    :returns: the joined output passed through ``remove_tagids``.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    # Pending html keyed by tag id; filled by _get_generated_annotation and
    # flushed below when the tag with that id is popped from the stack.
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        # Selector annotations are converted and then handled together with
        # the tagid based ones.
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    # NOTE(review): next() is called outside the try block below, so a page
    # that parses to no elements would raise StopIteration here - confirm.
    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    # Group annotations by the tag id they refer to.
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    # Sentinel entry with a very large id so that it sorts last.
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Anything that is not an HtmlTag, and every <ins> tag, is
                # copied verbatim from the numbered source.
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                # Opening/unpaired tags: remember their tag id.
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                # Closing/unpaired tags: emit the tag, then flush any html
                # queued in `inserts` for the tag id popped from the stack.
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        # '__added' marks elements already written so they
                        # are not emitted twice.
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Element has no `attributes`; keep last_id.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                # Still before the tag this annotation group refers to:
                # emit the current element (once) and keep advancing.
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming the
                # main stream.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts,
                    legacy)
            if next_generated:
                inner_data, target = tee(target)
                # Unpaired tags have no matching close tag to look for.
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)

            # Serialize the (possibly modified) tag rather than copying the
            # source slice, so attributes set above are written out.
            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                # Copy elements until the next text fragment, which is
                # replaced by the generated section.
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # All annotation groups handled: copy whatever is left of the page.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
Exemplo n.º 6
0
 def numbered_html(self):
     """Return ``self.html`` processed by ``add_tagids``.

     The result is computed on first use and stored on the instance as
     ``_numbered_html``; later calls return the stored value.
     """
     if not hasattr(self, '_numbered_html'):
         self._numbered_html = add_tagids(self.html)
     return self._numbered_html
Exemplo n.º 7
0
 def numbered_html(self):
     """Lazily compute and cache the tag-id numbered version of the html.

     ``add_tagids(self.html)`` is evaluated only when the instance has no
     ``_numbered_html`` attribute yet; the cached attribute is returned.
     """
     already_numbered = hasattr(self, '_numbered_html')
     if not already_numbered:
         self._numbered_html = add_tagids(self.html)
     return self._numbered_html
Exemplo n.º 8
0
def port_sample(sample, schemas=None, extractors=None):
    """Port a pre-0.13 slybot sample to the current format.

    :param sample: sample dict; it is modified in place and also returned.
    :param schemas: known schemas, passed to ``guess_schema`` (defaults to
        an empty dict).
    :param extractors: known extractors, passed to ``port_standard``
        (defaults to an empty dict).
    :returns: ``(sample, schemas)``. Samples whose ``version`` already
        equals ``SLYBOT_VERSION`` are returned untouched.
    """
    schemas = {} if schemas is None else schemas
    extractors = {} if extractors is None else extractors
    if sample.get('version') == SLYBOT_VERSION:
        return sample, schemas

    container_id = gen_predictable_id(sample.get('id', 1), sample['page_id'])
    schema_id, schemas = guess_schema(sample, schemas)
    # Used whenever the sample turns out to have no annotations at all.
    fallback_extracts = [
        _create_container('body', container_id, schema_id=schema_id)
    ]

    has_plugins = bool(sample.get('plugins'))
    if not has_plugins and not sample.get('annotated_body'):
        sample['plugins'] = {
            'annotations-plugin': {'extracts': fallback_extracts}
        }
        return sample, schemas
    if has_plugins:
        repair_ids(sample)
    else:
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # Fall back to the annotated body when there is no original body.
        annotated_body = sample.get('annotated_body', u'')
        sample['original_body'] = annotated_body
        try:
            tagged_body = add_tagids(annotated_body)
        except KeyError:
            tagged_body = u''
        sel = Selector(text=tagged_body)
    sample.pop('annotated_body', None)

    annotations = port_standard(annotations, sel, sample, extractors)
    standard_annos, generated_annos, variant_annos = [], [], []
    for annotation in annotations:
        if annotation.get('generated'):
            bucket = generated_annos
        elif annotation.get('variants', 0) > 0:
            bucket = variant_annos
        else:
            bucket = standard_annos
        bucket.append(annotation)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {'extracts': fallback_extracts}
        }
        return sample, schemas

    # The container is placed on the parent of the element that is the
    # common ancestor of every annotated element (or on that element itself
    # when it has no parent).
    common = find_element(annotations[0], sel)
    for other in annotations[1:]:
        common = find_common_parent(common, find_element(other, sel))
    parent = common.getparent()
    container_element = common if parent is None else parent
    container = _create_container(container_element,
                                  container_id,
                                  selector=sel,
                                  schema_id=schema_id)

    new_annotations = [container]
    for annotation in standard_annos:
        annotation.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))

    for annotation in new_annotations:
        if (not annotation.get('item_container') or
                not annotation.get('container_id')):
            if container_id == annotation.get('id'):
                # The top level container is left as it is.
                continue
            annotation['container_id'] = container_id
        # 'data-tagid' is only removed when no truthy 'tagid' was removed.
        if not annotation.pop('tagid', None):
            annotation.pop('data-tagid', None)

    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample, schemas
Exemplo n.º 9
0
def port_sample(sample, schemas=None, extractors=None):
    """Convert slybot samples made before slybot 0.13 to new format.

    :param sample: sample dict; it is modified in place and also returned.
        A missing ``url`` is filled with ``'http://example.com'``.
    :param schemas: known schemas, passed to ``guess_schema`` (defaults to
        an empty dict).
    :param extractors: known extractors, passed to ``port_standard``
        (defaults to an empty dict).
    :returns: ``(sample, schemas)``. Samples whose ``version`` already
        equals ``SLYBOT_VERSION`` are returned untouched.
    :raises KeyError: if the sample has neither ``page_id`` nor ``name``.
    """
    if schemas is None:
        schemas = {}
    if extractors is None:
        extractors = {}
    if sample.get('version') == SLYBOT_VERSION:
        return sample, schemas
    if 'url' not in sample:
        sample['url'] = 'http://example.com'
    # Look up 'name' only when 'page_id' is absent. Passing sample['name']
    # as the default of dict.get() evaluates it eagerly and raises KeyError
    # for samples that have a 'page_id' but no 'name'.
    page_id = sample['page_id'] if 'page_id' in sample else sample['name']
    container_id = gen_predictable_id(sample.get('id', 1), page_id)
    schema_id, schemas = guess_schema(sample, schemas)
    # Used whenever the sample turns out to have no annotations at all.
    default_annotations = [_create_container('body', container_id,
                                             schema_id=schema_id)]
    if not sample.get('annotated_body') and not sample.get('plugins'):
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    if not sample.get('plugins'):
        sample['plugins'] = load_annotations(sample.get('annotated_body', u''))
    else:
        repair_ids(sample)

    # Group annotations by type
    annotations = sample['plugins']['annotations-plugin']['extracts']
    try:
        sel = Selector(text=add_tagids(sample['original_body']))
    except KeyError:
        # Fall back to the annotated body when there is no original body.
        annotated = sample.get('annotated_body', u'')
        sample['original_body'] = annotated
        try:
            tagged = add_tagids(annotated)
        except KeyError:
            tagged = u''
        sel = Selector(text=tagged)
    sample.pop('annotated_body', None)
    annotations = port_standard(annotations, sel, sample, extractors)
    if not annotations:
        sample['plugins'] = {
            'annotations-plugin': {
                'extracts': default_annotations
            }
        }
        return sample, schemas
    standard_annos, generated_annos, variant_annos = [], [], []
    for a in annotations:
        if a.get('generated'):
            generated_annos.append(a)
        elif a.get('variants', 0) > 0:
            variant_annos.append(a)
        else:
            standard_annos.append(a)
    # The container is placed on the parent of the element that is the
    # common ancestor of every annotated element (or on that element itself
    # when it has no parent).
    a = find_element(annotations[0], sel)
    for b in annotations[1:]:
        b = find_element(b, sel)
        a = find_common_parent(a, b)
    parent = a.getparent()
    container = _create_container(
        a if parent is None else parent, container_id, selector=sel,
        schema_id=schema_id)
    new_annotations = [container]
    for a in standard_annos:
        a.pop('variant', None)
    new_annotations.extend(standard_annos)
    new_annotations.extend(port_generated(generated_annos, sel))
    new_annotations.extend(port_variants(variant_annos, sel))
    for a in new_annotations:
        if not (a.get('item_container') and a.get('container_id')):
            if container_id == a.get('id'):
                # The top level container is left as it is.
                continue
            a['container_id'] = container_id
        # NOTE(review): because of the `or`, 'data-tagid' is only removed
        # when no truthy 'tagid' was removed - confirm this is intended.
        a.pop('tagid', None) or a.pop('data-tagid', None)
    # Update annotations
    sample['plugins']['annotations-plugin']['extracts'] = new_annotations
    sample['version'] = SLYBOT_VERSION
    return sample, schemas