示例#1
0
    def _handle_unpaired_tag(self, html_tag):
        """Process ``html_tag`` as a tag that occupies a single tag index.

        Steps, in order:

        * ``ignore`` on an ``img`` tag records the one-tag ignored region
          ``(index, index + 1)``; otherwise ``ignore-beneath`` records an
          open-ended region ``(index, None)``.
        * When ``_read_template_annotation`` returns annotation data, any
          pending entry on ``unpairedtag_stack`` is closed first, then an
          ``AnnotationTag`` covering ``(index, index + 1)`` is built from
          that data and appended to ``self.annotations``.
        * ``next_tag_index`` is always advanced by one.

        The annotation dict is consumed in place: the ``annotations``,
        ``text-content``, ``required``, ``variant`` and ``generated`` keys
        are popped and whatever remains becomes ``annotation.metadata``.

        Raises:
            AssertionError: if the annotation data has a ``generated`` value
                that does not compare equal to ``False`` (only when
                assertions are enabled).
        """
        if (self._read_bool_template_attribute(html_tag, "ignore")
                and html_tag.tag == "img"):
            self.ignored_regions.append(
                (self.next_tag_index, self.next_tag_index + 1))
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        jannotation = self._read_template_annotation(html_tag)
        if jannotation:
            if self.unpairedtag_stack:
                # A previous unpaired annotation is still pending; close it
                # before starting a new one.
                self._close_unpaired_tag()

            annotation = AnnotationTag(self.next_tag_index,
                                       self.next_tag_index + 1)
            attribute_annotations = jannotation.pop('annotations', {})
            content_key = jannotation.pop('text-content', 'content')
            for extract_attribute, tag_value in attribute_annotations.items():
                if extract_attribute == content_key:
                    # The content key maps to the field extracted from what
                    # follows the tag, so the annotation is kept pending on
                    # the unpaired-tag stack.
                    annotation.surrounds_attribute = tag_value
                    self.unpairedtag_stack.append(annotation)
                else:
                    annotation.tag_attributes.append(
                        (extract_attribute, tag_value))
            self.annotations.append(annotation)

            self.extra_required_attrs.extend(jannotation.pop('required', []))
            variant_id = jannotation.pop('variant', 0)
            if variant_id > 0:
                annotation.variant_id = variant_id

            # Pop outside the assert: with ``python -O`` assert statements
            # are stripped, and the key must still be removed so it does not
            # end up in the metadata. The ``== False`` comparison is kept
            # exactly as before so the accepted values do not change.
            generated = jannotation.pop("generated", False)
            assert generated == False, \
                "generated annotations are not supported on unpaired tags"
            annotation.metadata = jannotation

        self.next_tag_index += 1
示例#2
0
    def _handle_unpaired_tag(self, html_tag):
        """Process ``html_tag`` as a tag that occupies a single tag index.

        Steps, in order:

        * ``ignore`` on an ``img`` tag records the one-tag ignored region
          ``(index, index + 1)``; otherwise ``ignore-beneath`` records an
          open-ended region ``(index, None)``.
        * When ``_read_template_annotation`` returns annotation data, any
          pending entry on ``unpairedtag_stack`` is closed first, then an
          ``AnnotationTag`` covering ``(index, index + 1)`` is built from
          that data and appended to ``self.annotations``.
        * ``next_tag_index`` is always advanced by one.

        The annotation dict is consumed in place: the ``annotations``,
        ``text-content``, ``required``, ``variant`` and ``generated`` keys
        are popped and whatever remains becomes ``annotation.metadata``.

        Raises:
            AssertionError: if the annotation data has a ``generated`` value
                that does not compare equal to ``False`` (only when
                assertions are enabled).
        """
        if (self._read_bool_template_attribute(html_tag, "ignore")
                and html_tag.tag == "img"):
            self.ignored_regions.append(
                (self.next_tag_index, self.next_tag_index + 1))
        elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        jannotation = self._read_template_annotation(html_tag)
        if jannotation:
            if self.unpairedtag_stack:
                # A previous unpaired annotation is still pending; close it
                # before starting a new one.
                self._close_unpaired_tag()

            annotation = AnnotationTag(self.next_tag_index,
                                       self.next_tag_index + 1)
            attribute_annotations = jannotation.pop('annotations', {})
            content_key = jannotation.pop('text-content', 'content')
            for extract_attribute, tag_value in attribute_annotations.items():
                if extract_attribute == content_key:
                    # The content key maps to the field extracted from what
                    # follows the tag, so the annotation is kept pending on
                    # the unpaired-tag stack.
                    annotation.surrounds_attribute = tag_value
                    self.unpairedtag_stack.append(annotation)
                else:
                    annotation.tag_attributes.append(
                        (extract_attribute, tag_value))
            self.annotations.append(annotation)

            self.extra_required_attrs.extend(jannotation.pop('required', []))
            variant_id = jannotation.pop('variant', 0)
            if variant_id > 0:
                annotation.variant_id = variant_id

            # Pop outside the assert: with ``python -O`` assert statements
            # are stripped, and the key must still be removed so it does not
            # end up in the metadata. The ``== False`` comparison is kept
            # exactly as before so the accepted values do not change.
            generated = jannotation.pop("generated", False)
            assert generated == False, \
                "generated annotations are not supported on unpaired tags"
            annotation.metadata = jannotation

        self.next_tag_index += 1
示例#3
0
 def build_annotation(self, jannotation, is_open=True):
     """Build a one-tag ``AnnotationTag`` from annotation data.

     The tag spans ``(next_tag_index, next_tag_index + 1)``.  The
     ``text-content`` and ``annotations`` keys are popped from
     ``jannotation`` (the dict is modified in place) and the remainder is
     stored as the tag's ``metadata``.

     The entry of ``annotations`` named by ``text-content`` (default
     ``'content'``) is removed from the attribute mapping; it becomes
     ``surrounds_attribute`` only when ``is_open`` is true and the value is
     truthy.  All remaining entries become ``tag_attributes`` as a list of
     ``(attribute, value)`` pairs.
     """
     start = self.next_tag_index
     tag = AnnotationTag(start, start + 1)

     text_key = jannotation.pop('text-content', 'content')
     attributes = jannotation.pop('annotations', {})

     # The content entry is always taken out of the attribute mapping,
     # whether or not it ends up being used below.
     surrounded = attributes.pop(text_key, None)
     if is_open:
         if surrounded:
             tag.surrounds_attribute = surrounded

     tag.tag_attributes = [(name, value)
                           for name, value in attributes.items()]
     tag.metadata = jannotation
     return tag
示例#4
0
 def build_annotation(self, jannotation, is_open=True):
     """Build a one-tag ``AnnotationTag`` from annotation data.

     The tag spans ``(next_tag_index, next_tag_index + 1)``.  The
     ``text-content`` and ``annotations`` keys are popped from
     ``jannotation`` (the dict is modified in place) and the remainder is
     stored as the tag's ``metadata``.

     The entry of ``annotations`` named by ``text-content`` (default
     ``'content'``) is removed from the attribute mapping; it becomes
     ``surrounds_attribute`` only when ``is_open`` is true and the value is
     truthy.  All remaining entries become ``tag_attributes`` as a list of
     ``(attribute, value)`` pairs.
     """
     start = self.next_tag_index
     tag = AnnotationTag(start, start + 1)

     text_key = jannotation.pop('text-content', 'content')
     attributes = jannotation.pop('annotations', {})

     # The content entry is always taken out of the attribute mapping,
     # whether or not it ends up being used below.
     surrounded = attributes.pop(text_key, None)
     if is_open:
         if surrounded:
             tag.surrounds_attribute = surrounded

     tag.tag_attributes = [(name, value)
                           for name, value in attributes.items()]
     tag.metadata = jannotation
     return tag
示例#5
0
 def _find_siblings(self, template, containers, container_contents):
     """Return the ``(parent, child)`` annotation pair for a container.

     The child is the annotation of the first entry in ``containers``
     under the ``container_id`` read from the first item of
     ``container_contents``; it is also stored on ``self.annotation``.
     The parent is looked up through ``_find_annotation`` using the
     ``container_id`` in ``self.annotation.metadata``.

     When the child's metadata has a positive ``siblings`` count, the end
     index is recomputed with ``_find_siblings_end``.  Unless the resulting
     end index is ``None``, the returned child is a new ``AnnotationTag``
     that copies the original child's fields, uses that end index and
     shares the original's metadata object.
     """
     container_id = container_contents[0].annotation.metadata['container_id']
     child = containers[container_id][0].annotation
     self.annotation = child
     parent = self._find_annotation(
         template, self.annotation.metadata.get('container_id'))
     sibling_count = child.metadata.get('siblings', 0)

     if sibling_count > 0:
         last_index = self._find_siblings_end(
             template, child.end_index + 1, parent.end_index, sibling_count)
     else:
         last_index = child.end_index

     if last_index is None:
         # No end index available: hand back the child unchanged.
         return parent, child

     widened = AnnotationTag(child.start_index, last_index,
                             child.surrounds_attribute,
                             child.annotation_text,
                             child.tag_attributes)
     widened.metadata = child.metadata
     return (parent, widened)
示例#6
0
 def _find_siblings(self, template, containers, container_contents):
     """Return the ``(parent, child)`` annotation pair for a container.

     The child is the annotation of the first entry in ``containers``
     under the ``container_id`` read from the first item of
     ``container_contents``; it is also stored on ``self.annotation``.
     The parent is looked up through ``_find_annotation`` using the
     ``container_id`` in ``self.annotation.metadata``.

     When the child's metadata has a positive ``siblings`` count, the end
     index is recomputed with ``_find_siblings_end``.  Unless the resulting
     end index is ``None``, the returned child is a new ``AnnotationTag``
     that copies the original child's fields, uses that end index and
     shares the original's metadata object.
     """
     container_id = container_contents[0].annotation.metadata['container_id']
     child = containers[container_id][0].annotation
     self.annotation = child
     parent = self._find_annotation(
         template, self.annotation.metadata.get('container_id'))
     sibling_count = child.metadata.get('siblings', 0)

     if sibling_count > 0:
         last_index = self._find_siblings_end(
             template, child.end_index + 1, parent.end_index, sibling_count)
     else:
         last_index = child.end_index

     if last_index is None:
         # No end index available: hand back the child unchanged.
         return parent, child

     widened = AnnotationTag(child.start_index, last_index,
                             child.surrounds_attribute,
                             child.annotation_text,
                             child.tag_attributes)
     widened.metadata = child.metadata
     return (parent, widened)
示例#7
0
 def __init__(self, extractors, template_tokens):
     """Set up a RecordExtractor over a group of region extractors.

     Stores ``extractors`` and ``template_tokens``, starts with an empty
     ``template_ignored_regions`` list, and creates ``self.annotation`` as
     an ``AnnotationTag`` running from the smallest ``start_index`` to the
     largest ``end_index`` among the extractors' annotations.
     """
     self.extractors = extractors
     self.template_tokens = template_tokens
     self.template_ignored_regions = []

     # The overall span is the envelope of every extractor's annotation.
     first_index = min(
         extractor.annotation.start_index for extractor in extractors)
     last_index = max(
         extractor.annotation.end_index for extractor in extractors)
     self.annotation = AnnotationTag(first_index, last_index)
示例#8
0
    def _handle_open_tag(self, html_tag):
        """Process an opening tag and register any template annotation on it.

        In order, this records ignored regions requested through the
        ``ignore`` / ``ignore-beneath`` template attributes, applies a
        ``data-scrapy-replacement`` substitution, updates the pending
        unpaired-tag stack, and builds an ``AnnotationTag`` when
        ``_read_template_annotation`` returns annotation data for the tag.

        ``next_tag_index`` is advanced by one on the un-annotated path; on
        the annotated path it is advanced only while
        ``annotation.annotation_text`` is None (this method assigns it only
        for annotations flagged ``generated``).
        """
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                # img: the ignored region is just this one tag
                self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
            else:
                # open-ended region (end is None); presumably completed when
                # the matching close tag is handled -- TODO confirm
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)

        elif self.ignored_tag_stacks.get(html_tag.tag):
            # same tag name opened while an ignored one is on the stack:
            # push a placeholder
            self.ignored_tag_stacks[html_tag.tag].append(None)
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        # note: pop() removes the attribute from the tag's attribute mapping
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            # swap the most recent token for one built from the replacement
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            self.replacement_stacks[html_tag.tag].append(None)

        if self.unpairedtag_stack:
            # an unpaired-tag annotation is pending: tags listed in
            # _END_UNPAIREDTAG_TAGS close it, any other tag name is pushed
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                self.unpairedtag_stack.append(html_tag.tag)

        tagname = replacement or self._update_replacement_stack(html_tag)
        self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if tagname in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[tagname].append(None)
            self.next_tag_index += 1
            return

        # end index is not known yet, so it starts out as None
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # generated annotation: drop the last token and start the
            # annotation one index earlier
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                # shift the start of the most recently recorded ignored
                # region back by one as well
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0]-1, ignored[1]))

        self.extra_required_attrs.extend(jannotation.pop('required', []))

        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))

        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            # a surrounding annotation pushes its variant onto the stack;
            # otherwise the variant is set on the annotation directly
            if annotation.surrounds_attribute is not None:
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id

        # whatever keys were not popped above are kept as metadata
        annotation.metadata = jannotation

        if annotation.annotation_text is None:
            self.next_tag_index += 1
        if self.variant_stack and annotation.variant_id is None:
            # take the variant from the top of the variant stack; the string
            # '0' is mapped to None
            variant_id = self.variant_stack[-1]
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id

        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[tagname].append(annotation)
        else:
            # nothing to surround: the annotation covers a single index and
            # is recorded immediately
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)
示例#9
0
    def _handle_open_tag(self, html_tag):
        """Process an opening tag and register any template annotation on it.

        In order, this records ignored regions requested through the
        ``ignore`` / ``ignore-beneath`` template attributes, applies a
        ``data-scrapy-replacement`` substitution, updates the pending
        unpaired-tag stack, and builds an ``AnnotationTag`` when
        ``_read_template_annotation`` returns annotation data for the tag.

        ``next_tag_index`` is advanced by one on the un-annotated path; on
        the annotated path it is advanced only while
        ``annotation.annotation_text`` is None (this method assigns it only
        for annotations flagged ``generated``).
        """
        if self._read_bool_template_attribute(html_tag, "ignore"):
            if html_tag.tag == "img":
                # img: the ignored region is just this one tag
                self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
            else:
                # open-ended region (end is None); presumably completed when
                # the matching close tag is handled -- TODO confirm
                self.ignored_regions.append((self.next_tag_index, None))
                self.ignored_tag_stacks[html_tag.tag].append(html_tag)

        elif self.ignored_tag_stacks.get(html_tag.tag):
            # same tag name opened while an ignored one is on the stack:
            # push a placeholder
            self.ignored_tag_stacks[html_tag.tag].append(None)
        if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            self.ignored_regions.append((self.next_tag_index, None))

        # note: pop() removes the attribute from the tag's attribute mapping
        replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
        if replacement:
            # swap the most recent token for one built from the replacement
            self.token_list.pop()
            self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
            self.replacement_stacks[html_tag.tag].append(replacement)
        elif html_tag.tag in self.replacement_stacks:
            self.replacement_stacks[html_tag.tag].append(None)

        if self.unpairedtag_stack:
            # an unpaired-tag annotation is pending: tags listed in
            # _END_UNPAIREDTAG_TAGS close it, any other tag name is pushed
            if html_tag.tag in _END_UNPAIREDTAG_TAGS:
                self._close_unpaired_tag()
            else:
                self.unpairedtag_stack.append(html_tag.tag)

        tagname = replacement or self._update_replacement_stack(html_tag)
        self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

        jannotation = self._read_template_annotation(html_tag)
        if not jannotation:
            if tagname in self.labelled_tag_stacks:
                # add this tag to the stack to match correct end tag
                self.labelled_tag_stacks[tagname].append(None)
            self.next_tag_index += 1
            return

        # end index is not known yet, so it starts out as None
        annotation = AnnotationTag(self.next_tag_index, None)
        if jannotation.pop('generated', False):
            # generated annotation: drop the last token and start the
            # annotation one index earlier
            self.token_list.pop()
            annotation.start_index -= 1
            if self.previous_element_class == HtmlTag:
                annotation.annotation_text = AnnotationText('')
            else:
                annotation.annotation_text = AnnotationText(self.prev_data)
            if self._read_bool_template_attribute(html_tag, "ignore") \
                    or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
                # shift the start of the most recently recorded ignored
                # region back by one as well
                ignored = self.ignored_regions.pop()
                self.ignored_regions.append((ignored[0]-1, ignored[1]))

        self.extra_required_attrs.extend(jannotation.pop('required', []))

        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                annotation.surrounds_attribute = tag_value
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))

        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            # a surrounding annotation pushes its variant onto the stack;
            # otherwise the variant is set on the annotation directly
            if annotation.surrounds_attribute is not None:
                self.variant_stack.append(variant_id)
            else:
                annotation.variant_id = variant_id

        # whatever keys were not popped above are kept as metadata
        annotation.metadata = jannotation

        if annotation.annotation_text is None:
            self.next_tag_index += 1
        if self.variant_stack and annotation.variant_id is None:
            # take the variant from the top of the variant stack; the string
            # '0' is mapped to None
            variant_id = self.variant_stack[-1]
            if variant_id == '0':
                variant_id = None
            annotation.variant_id = variant_id

        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[tagname].append(annotation)
        else:
            # nothing to surround: the annotation covers a single index and
            # is recorded immediately
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)
示例#10
0
 def annotation(self):
     """Return the stored ``_annotation``, or a default tag when unset.

     If reading ``self._annotation`` raises ``AttributeError``, a new
     ``AnnotationTag(1, 1)`` is returned instead; that fallback is built on
     each call and is not stored on the instance.
     """
     try:
         stored = self._annotation
     except AttributeError:
         return AnnotationTag(1, 1)
     else:
         return stored