def _handle_unpaired_tag(self, html_tag):
    """Process an unpaired tag of the template and record its annotation.

    An ignored region is registered when the tag carries the "ignore"
    attribute and is an ``img`` (a one-tag region), or when it carries
    "ignore-beneath" (a region whose end is left as ``None``).

    If the tag carries a template annotation, an ``AnnotationTag`` covering
    exactly this tag is built from it and appended to ``self.annotations``.
    The annotation dict is consumed in place: the 'annotations',
    'text-content', 'required', 'variant' and 'generated' keys are popped
    and whatever is left becomes ``annotation.metadata``.

    ``self.next_tag_index`` is advanced by one in every case.

    html_tag: the tag being parsed; only ``html_tag.tag`` is read directly
        here, everything else goes through the ``_read_*`` helpers.

    Raises AssertionError if the annotation is flagged as "generated"
    (when assertions are enabled).
    """
    if self._read_bool_template_attribute(html_tag, "ignore") and html_tag.tag == "img":
        self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
    elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    jannotation = self._read_template_annotation(html_tag)
    if jannotation:
        # A pending unpaired-tag annotation is closed before a new one starts.
        if self.unpairedtag_stack:
            self._close_unpaired_tag()

        annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
        attribute_annotations = jannotation.pop('annotations', {}).items()
        content_key = jannotation.pop('text-content', 'content')
        for extract_attribute, tag_value in attribute_annotations:
            if extract_attribute == content_key:
                # The content annotation is kept on the unpaired-tag stack
                # until _close_unpaired_tag() is called.
                annotation.surrounds_attribute = tag_value
                self.unpairedtag_stack.append(annotation)
            else:
                annotation.tag_attributes.append((extract_attribute, tag_value))
        self.annotations.append(annotation)

        self.extra_required_attrs.extend(jannotation.pop('required', []))
        variant_id = jannotation.pop('variant', 0)
        if variant_id > 0:
            annotation.variant_id = variant_id

        # Pop outside of the assert: assert statements are removed under
        # ``python -O``, which previously left the 'generated' key behind in
        # the metadata. The equality test is kept as-is so that exactly the
        # same values pass the check as before.
        generated = jannotation.pop("generated", False)
        assert generated == False, \
            "generated annotations are not supported on unpaired tags"  # noqa: E712
        annotation.metadata = jannotation
    self.next_tag_index += 1
def build_annotation(self, jannotation, is_open=True):
    """Create an AnnotationTag for the current tag from an annotation dict.

    The tag spans ``self.next_tag_index`` to ``self.next_tag_index + 1``.
    ``jannotation`` is modified in place: its 'text-content' and
    'annotations' entries are removed, and the remaining dict is stored as
    the tag's ``metadata``.

    jannotation: annotation dict read from the template.
    is_open: when true (the default), a truthy content annotation is stored
        as ``surrounds_attribute``; otherwise it is dropped.

    Returns the new AnnotationTag.
    """
    tag_index = self.next_tag_index
    tag = AnnotationTag(tag_index, tag_index + 1)

    text_key = jannotation.pop('text-content', 'content')
    attributes = jannotation.pop('annotations', {})

    # The entry stored under the content key never ends up in
    # tag_attributes, whether or not it is used as surrounds_attribute.
    text_value = attributes.pop(text_key, None)
    if is_open and text_value:
        tag.surrounds_attribute = text_value

    tag.tag_attributes = [(name, value) for name, value in attributes.items()]
    tag.metadata = jannotation
    return tag
def _find_siblings(self, template, containers, container_contents):
    """Return the ``(parent, child)`` annotations for a repeated container.

    The child is the annotation of the first entry of the container named by
    the first element of ``container_contents``; it is also stored as
    ``self.annotation``. The parent is looked up through
    ``self._find_annotation`` using the child's own 'container_id'.

    When the child's metadata declares a positive 'siblings' count, the end
    of the child region is recomputed with ``self._find_siblings_end``.
    Whenever the resulting end is not ``None`` a new AnnotationTag with that
    end (sharing the child's metadata) is returned in place of the child.
    """
    container_id = container_contents[0].annotation.metadata['container_id']
    child = containers[container_id][0].annotation
    self.annotation = child

    parent = self._find_annotation(
        template, self.annotation.metadata.get('container_id'))

    sibling_count = child.metadata.get('siblings', 0)
    region_end = child.end_index
    if sibling_count > 0:
        region_end = self._find_siblings_end(
            template, child.end_index + 1, parent.end_index, sibling_count)

    # Without a usable end index the child annotation is returned untouched.
    if region_end is None:
        return parent, child

    widened = AnnotationTag(
        child.start_index,
        region_end,
        child.surrounds_attribute,
        child.annotation_text,
        child.tag_attributes,
    )
    widened.metadata = child.metadata
    return parent, widened
def __init__(self, extractors, template_tokens):
    """Build a record extractor wrapping the given region extractors.

    extractors: extractors, each exposing an ``annotation`` with
        ``start_index`` and ``end_index``.
    template_tokens: the template tokens, stored unchanged.

    ``self.annotation`` is set to an AnnotationTag spanning from the
    smallest start index to the largest end index among the extractors.
    """
    self.extractors = extractors
    self.template_tokens = template_tokens
    self.template_ignored_regions = []

    # Two separate passes, first over the starts and then over the ends.
    starts = [extractor.annotation.start_index for extractor in extractors]
    first_index = min(starts)
    ends = [extractor.annotation.end_index for extractor in extractors]
    last_index = max(ends)

    self.annotation = AnnotationTag(first_index, last_index)
def _handle_open_tag(self, html_tag):
    """Process an opening tag of the template.

    In order, this updates the ignored regions, the replacement stacks and
    the unpaired-tag stack, and then handles the tag's template annotation.
    When the tag has no annotation only ``self.next_tag_index`` is advanced
    (plus a placeholder on the labelled tag stack, if one exists for the
    tag name) and the method returns early.

    html_tag: the tag being parsed. Its ``tag``, ``tag_type``, ``start``,
        ``end`` and ``attributes`` are read; the "data-scrapy-replacement"
        attribute is removed from ``html_tag.attributes``.

    Returns None.
    """
    # --- ignored regions -------------------------------------------------
    if self._read_bool_template_attribute(html_tag, "ignore"):
        if html_tag.tag == "img":
            # An ignored img is recorded as a region of exactly one tag.
            self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1))
        else:
            # The end of the region is left as None here and the tag is
            # remembered on the per-tag-name stack.
            # NOTE(review): presumably the end is filled in when the matching
            # close tag is handled - that code is not part of this method.
            self.ignored_regions.append((self.next_tag_index, None))
            self.ignored_tag_stacks[html_tag.tag].append(html_tag)
    elif self.ignored_tag_stacks.get(html_tag.tag):
        # The stack for this tag name is non-empty: push a None placeholder
        # for this (not ignored) tag.
        self.ignored_tag_stacks[html_tag.tag].append(None)
    if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    # --- tag replacement -------------------------------------------------
    replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
    if replacement:
        # Swap the most recently added token for one built from the
        # replacement name, keeping the tag's type and position.
        # NOTE(review): assumes the last token belongs to this tag - confirm.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
        self.replacement_stacks[html_tag.tag].append(replacement)
    elif html_tag.tag in self.replacement_stacks:
        self.replacement_stacks[html_tag.tag].append(None)

    # --- pending unpaired tag --------------------------------------------
    if self.unpairedtag_stack:
        if html_tag.tag in _END_UNPAIREDTAG_TAGS:
            self._close_unpaired_tag()
        else:
            self.unpairedtag_stack.append(html_tag.tag)

    # The replacement name wins; otherwise the name comes from the helper.
    tagname = replacement or self._update_replacement_stack(html_tag)
    self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)

    # --- annotation --------------------------------------------------------
    jannotation = self._read_template_annotation(html_tag)
    if not jannotation:
        if tagname in self.labelled_tag_stacks:
            # add this tag to the stack to match correct end tag
            self.labelled_tag_stacks[tagname].append(None)
        self.next_tag_index += 1
        return

    # end_index stays None until it is set at the bottom of this method
    # (non-content annotations only).
    annotation = AnnotationTag(self.next_tag_index, None)
    if jannotation.pop('generated', False):
        # A generated annotation drops the last token and starts one tag
        # index earlier; its annotation_text is the previous data, or an
        # empty text when the previous element was an HtmlTag.
        self.token_list.pop()
        annotation.start_index -= 1
        if self.previous_element_class == HtmlTag:
            annotation.annotation_text = AnnotationText('')
        else:
            annotation.annotation_text = AnnotationText(self.prev_data)
        if self._read_bool_template_attribute(html_tag, "ignore") \
                or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Move the start of the most recently added ignored region back
            # by one as well.
            # NOTE(review): assumes that region is the one appended above
            # for this tag - confirm.
            ignored = self.ignored_regions.pop()
            self.ignored_regions.append((ignored[0]-1, ignored[1]))

    self.extra_required_attrs.extend(jannotation.pop('required', []))

    # The entry stored under the content key ('content' unless overridden by
    # 'text-content') becomes surrounds_attribute; every other entry is an
    # attribute annotation.
    attribute_annotations = jannotation.pop('annotations', {}).items()
    content_key = jannotation.pop('text-content', 'content')
    for extract_attribute, tag_value in attribute_annotations:
        if extract_attribute == content_key:
            annotation.surrounds_attribute = tag_value
        else:
            annotation.tag_attributes.append((extract_attribute, tag_value))

    variant_id = jannotation.pop('variant', 0)
    if variant_id > 0:
        if annotation.surrounds_attribute is not None:
            # Content annotations push their variant on the stack instead of
            # setting it directly; it is picked up from the stack below.
            self.variant_stack.append(variant_id)
        else:
            annotation.variant_id = variant_id

    # Whatever was not popped above is kept as metadata.
    annotation.metadata = jannotation

    # Only annotations without annotation_text (i.e. not generated above)
    # advance the tag index.
    if annotation.annotation_text is None:
        self.next_tag_index += 1
    if self.variant_stack and annotation.variant_id is None:
        # Take the variant on top of the stack; the string '0' is mapped
        # to None.
        variant_id = self.variant_stack[-1]
        if variant_id == '0':
            variant_id = None
        annotation.variant_id = variant_id

    # look for a closing tag if the content is important
    if annotation.surrounds_attribute:
        self.labelled_tag_stacks[tagname].append(annotation)
    else:
        # No content to capture: the annotation covers just this tag and is
        # complete right away.
        annotation.end_index = annotation.start_index + 1
        self.annotations.append(annotation)
def annotation(self):
    """Return the stored ``_annotation`` of this object.

    When no ``_annotation`` attribute has been set, a new
    ``AnnotationTag(1, 1)`` is returned instead (it is not stored).
    """
    try:
        stored = self._annotation
    except AttributeError:
        stored = AnnotationTag(1, 1)
    return stored