def _handle_unpaired_tag(self, html_tag):
    """Register ignore regions and annotations for a tag handled as unpaired.

    An ``img`` tag flagged "ignore" yields an ignored region covering just
    this tag index; an ``img`` tag flagged "ignore-beneath" yields an
    open-ended region (end ``None``).  If the tag carries a template
    annotation, an ``AnnotationTag`` spanning exactly this tag index is
    built from it.  ``next_tag_index`` is always advanced by one.
    """
    # The helper calls are kept in their original order and short-circuit
    # shape: the second lookup only happens when the first branch is not
    # taken.
    ignored_region = None
    if self._read_bool_template_attribute(html_tag, "ignore") \
            and html_tag.tag == "img":
        region_start = self.next_tag_index
        ignored_region = (region_start, region_start + 1)
    elif self._read_bool_template_attribute(html_tag, "ignore-beneath") \
            and html_tag.tag == "img":
        ignored_region = (self.next_tag_index, None)
    if ignored_region is not None:
        self.ignored_regions.append(ignored_region)

    template_annotation = self._read_template_annotation(html_tag)
    if not template_annotation:
        # Nothing to annotate: this tag only consumes an index.
        self.next_tag_index += 1
        return

    # A new annotated unpaired tag ends whatever unpaired tag was pending.
    if self.unpairedtag_stack:
        self._close_unpaired_tag()

    start = self.next_tag_index
    tag_annotation = AnnotationTag(start, start + 1)

    attribute_map = template_annotation.pop('annotations', {})
    for attr_name, field in attribute_map.items():
        if attr_name != 'content':
            tag_annotation.tag_attributes.append((attr_name, field))
            # NOTE(review): this runs once per non-'content' attribute, so
            # the same annotation can be listed several times -- confirm
            # that is intended.
            self.annotations.append(tag_annotation)
            continue
        tag_annotation.surrounds_attribute = field
        self.unpairedtag_stack.append(tag_annotation)

    if template_annotation.pop('common_prefix', False):
        tag_annotation.match_common_prefix = True
    self.extra_required_attrs.extend(template_annotation.pop('required', []))
    # Whatever keys were not consumed above are kept as metadata.
    tag_annotation.metadata = template_annotation
    self.next_tag_index += 1
def _handle_unpaired_tag(self, html_tag):
    """Register ignore regions and annotations for a tag handled as unpaired.

    An ``img`` tag flagged "ignore" yields an ignored region covering just
    this tag index; otherwise any tag flagged "ignore-beneath" yields an
    open-ended region (end ``None``).  If the tag carries a template
    annotation, an ``AnnotationTag`` spanning exactly this tag index is
    built from it.  ``next_tag_index`` is always advanced by one.
    """
    # The helper calls are kept in their original order and short-circuit
    # shape: "ignore-beneath" is only consulted when the "ignore"/img
    # branch is not taken.
    ignored_region = None
    if self._read_bool_template_attribute(html_tag, "ignore") \
            and html_tag.tag == "img":
        region_start = self.next_tag_index
        ignored_region = (region_start, region_start + 1)
    elif self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        ignored_region = (self.next_tag_index, None)
    if ignored_region is not None:
        self.ignored_regions.append(ignored_region)

    template_annotation = self._read_template_annotation(html_tag)
    if not template_annotation:
        # Nothing to annotate: this tag only consumes an index.
        self.next_tag_index += 1
        return

    # A new annotated unpaired tag ends whatever unpaired tag was pending.
    if self.unpairedtag_stack:
        self._close_unpaired_tag()

    start = self.next_tag_index
    tag_annotation = AnnotationTag(start, start + 1)

    attribute_map = template_annotation.pop('annotations', {})
    for attr_name, field in attribute_map.items():
        if attr_name != 'content':
            tag_annotation.tag_attributes.append((attr_name, field))
            # NOTE(review): this runs once per non-'content' attribute, so
            # the same annotation can be listed several times -- confirm
            # that is intended.
            self.annotations.append(tag_annotation)
            continue
        tag_annotation.surrounds_attribute = field
        self.unpairedtag_stack.append(tag_annotation)

    self.extra_required_attrs.extend(template_annotation.pop('required', []))
    # Whatever keys were not consumed above are kept as metadata.
    tag_annotation.metadata = template_annotation
    self.next_tag_index += 1
def __init__(self, extractors, template_tokens):
    """Build a RecordExtractor wrapping a group of region extractors.

    The record's own annotation runs from the smallest ``start_index`` to
    the largest ``end_index`` found among the annotations of
    ``extractors``.
    """
    self.template_ignored_regions = []
    self.template_tokens = template_tokens
    self.extractors = extractors
    # Two separate passes over ``extractors``: first the lower bound, then
    # the upper bound of the enclosing span.
    self.annotation = AnnotationTag(
        min(member.annotation.start_index for member in extractors),
        max(member.annotation.end_index for member in extractors),
    )
def _handle_open_tag(self, html_tag):
    """Process an opening tag of an annotated template.

    In order, this records ignored regions requested by the tag, applies
    a "data-scrapy-replacement" token substitution, updates the pending
    unpaired-tag state, closes a still-open labelled ``p``, and finally
    builds an ``AnnotationTag`` when the tag carries a template
    annotation.  Returns ``None``; all results are stored on ``self``.
    """
    if self._read_bool_template_attribute(html_tag, "ignore"):
        if html_tag.tag == "img":
            # An ignored img covers only its own tag index.
            self.ignored_regions.append(
                (self.next_tag_index, self.next_tag_index + 1))
        else:
            # Open-ended region (end None); the tag is remembered on the
            # per-tag-name stack -- presumably so the matching close tag
            # can fill in the end (verify in the close-tag handler).
            self.ignored_regions.append((self.next_tag_index, None))
            self.ignored_tag_stacks[html_tag.tag].append(html_tag)
    elif self.ignored_tag_stacks.get(html_tag.tag):
        # A tag of the same name is already being ignored: push a None
        # placeholder for this nested, non-ignored occurrence.
        self.ignored_tag_stacks[html_tag.tag].append(None)
    if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    # Note: this removes the attribute from html_tag.attributes.
    replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
    if replacement:
        # Swap the most recently added token for the replacement one.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type, html_tag.start,
                        html_tag.end)
        self.replacement_stacks[html_tag.tag].append(replacement)
    elif html_tag.tag in self.replacement_stacks:
        # Placeholder for a same-named tag that has no replacement.
        self.replacement_stacks[html_tag.tag].append(None)

    if self.unpairedtag_stack:
        if html_tag.tag in _END_UNPAIREDTAG_TAGS:
            self._close_unpaired_tag()
        else:
            # Only the tag name (not an annotation) is pushed here.
            self.unpairedtag_stack.append(html_tag.tag)

    # can't be a p inside another p. Also, an open p element closes
    # a previous open p element.
    if html_tag.tag == "p" and html_tag.tag in self.labelled_tag_stacks:
        # The whole "p" stack is removed; only its first entry is closed
        # and recorded.
        annotation = self.labelled_tag_stacks.pop(html_tag.tag)[0]
        annotation.end_index = self.next_tag_index
        self.annotations.append(annotation)

    jannotation = self._read_template_annotation(html_tag)
    if not jannotation:
        if html_tag.tag in self.labelled_tag_stacks:
            # add this tag to the stack to match correct end tag
            self.labelled_tag_stacks[html_tag.tag].append(None)
        self.next_tag_index += 1
        return

    # End index is unknown until the tag is closed (or set below).
    annotation = AnnotationTag(self.next_tag_index, None)
    if jannotation.pop('generated', False):
        # A "generated" tag drops the token just added and starts one
        # index earlier.
        self.token_list.pop()
        annotation.start_index -= 1
        if self.previous_element_class == HtmlTag:
            annotation.annotation_text = AnnotationText('')
        else:
            annotation.annotation_text = AnnotationText(self.prev_data)
        if self._read_bool_template_attribute(html_tag, "ignore") \
                or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Shift the most recently recorded ignored region back by one
            # index as well, matching the start_index adjustment above.
            ignored = self.ignored_regions.pop()
            self.ignored_regions.append((ignored[0]-1, ignored[1]))

    self.extra_required_attrs.extend(jannotation.pop('required', []))

    attribute_annotations = jannotation.pop('annotations', {}).items()
    for extract_attribute, tag_value in attribute_annotations:
        if extract_attribute == 'content':
            annotation.surrounds_attribute = tag_value
        else:
            annotation.tag_attributes.append((extract_attribute, tag_value))

    variant_id = jannotation.pop('variant', 0)
    if variant_id > 0:
        if annotation.surrounds_attribute is not None:
            # The variant is pushed on the stack instead of being set on
            # this annotation directly.
            self.variant_stack.append(variant_id)
        else:
            annotation.variant_id = variant_id

    # Whatever keys were not popped above are kept as metadata.
    annotation.metadata = jannotation

    # annotation_text is only assigned in the 'generated' branch above;
    # assumes AnnotationTag initialises it to None -- TODO confirm.
    if annotation.annotation_text is None:
        self.next_tag_index += 1
    if self.variant_stack and annotation.variant_id is None:
        # Inherit the variant currently on top of the stack.
        variant_id = self.variant_stack[-1]
        # NOTE(review): compared against the string '0' although the
        # values pushed in this method passed `variant_id > 0` -- confirm
        # where a '0' entry would come from.
        if variant_id == '0':
            variant_id = None
        annotation.variant_id = variant_id

    # look for a closing tag if the content is important
    if annotation.surrounds_attribute:
        self.labelled_tag_stacks[html_tag.tag].append(annotation)
    else:
        # No content to capture: the annotation ends right after its
        # start and is recorded immediately.
        annotation.end_index = annotation.start_index + 1
        self.annotations.append(annotation)
def _handle_open_tag(self, html_tag):
    """Process an opening tag of an annotated template.

    In order, this records ignored regions requested by the tag, applies
    a "data-scrapy-replacement" token substitution, updates the pending
    unpaired-tag state, closes a still-open labelled ``p``, and finally
    builds an ``AnnotationTag`` when the tag carries a template
    annotation.  Returns ``None``; all results are stored on ``self``.
    """
    if self._read_bool_template_attribute(html_tag, "ignore"):
        if html_tag.tag == "img":
            # An ignored img covers only its own tag index.
            self.ignored_regions.append(
                (self.next_tag_index, self.next_tag_index + 1))
        else:
            # Open-ended region (end None); the tag is remembered on the
            # per-tag-name stack -- presumably so the matching close tag
            # can fill in the end (verify in the close-tag handler).
            self.ignored_regions.append((self.next_tag_index, None))
            self.ignored_tag_stacks[html_tag.tag].append(html_tag)
    elif self.ignored_tag_stacks.get(html_tag.tag):
        # A tag of the same name is already being ignored: push a None
        # placeholder for this nested, non-ignored occurrence.
        self.ignored_tag_stacks[html_tag.tag].append(None)
    if self._read_bool_template_attribute(html_tag, "ignore-beneath"):
        self.ignored_regions.append((self.next_tag_index, None))

    # Note: this removes the attribute from html_tag.attributes.
    replacement = html_tag.attributes.pop("data-scrapy-replacement", None)
    if replacement:
        # Swap the most recently added token for the replacement one.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type, html_tag.start,
                        html_tag.end)
        self.replacement_stacks[html_tag.tag].append(replacement)
    elif html_tag.tag in self.replacement_stacks:
        # Placeholder for a same-named tag that has no replacement.
        self.replacement_stacks[html_tag.tag].append(None)

    if self.unpairedtag_stack:
        if html_tag.tag in _END_UNPAIREDTAG_TAGS:
            self._close_unpaired_tag()
        else:
            # Only the tag name (not an annotation) is pushed here.
            self.unpairedtag_stack.append(html_tag.tag)

    # can't be a p inside another p. Also, an open p element closes
    # a previous open p element.
    if html_tag.tag == "p" and html_tag.tag in self.labelled_tag_stacks:
        # The whole "p" stack is removed; only its first entry is closed
        # and recorded.
        annotation = self.labelled_tag_stacks.pop(html_tag.tag)[0]
        annotation.end_index = self.next_tag_index
        self.annotations.append(annotation)

    jannotation = self._read_template_annotation(html_tag)
    if not jannotation:
        if html_tag.tag in self.labelled_tag_stacks:
            # add this tag to the stack to match correct end tag
            self.labelled_tag_stacks[html_tag.tag].append(None)
        self.next_tag_index += 1
        return

    # End index is unknown until the tag is closed (or set below).
    annotation = AnnotationTag(self.next_tag_index, None)
    if jannotation.pop('generated', False):
        # A "generated" tag drops the token just added and starts one
        # index earlier.
        self.token_list.pop()
        annotation.start_index -= 1
        if self.previous_element_class == HtmlTag:
            annotation.annotation_text = AnnotationText('')
        else:
            annotation.annotation_text = AnnotationText(self.prev_data)
        if self._read_bool_template_attribute(html_tag, "ignore") \
                or self._read_bool_template_attribute(html_tag, "ignore-beneath"):
            # Shift the most recently recorded ignored region back by one
            # index as well, matching the start_index adjustment above.
            ignored = self.ignored_regions.pop()
            self.ignored_regions.append((ignored[0] - 1, ignored[1]))

    self.extra_required_attrs.extend(jannotation.pop('required', []))

    attribute_annotations = jannotation.pop('annotations', {}).items()
    for extract_attribute, tag_value in attribute_annotations:
        if extract_attribute == 'content':
            annotation.surrounds_attribute = tag_value
        else:
            annotation.tag_attributes.append(
                (extract_attribute, tag_value))

    variant_id = jannotation.pop('variant', 0)
    if variant_id > 0:
        if annotation.surrounds_attribute is not None:
            # The variant is pushed on the stack instead of being set on
            # this annotation directly.
            self.variant_stack.append(variant_id)
        else:
            annotation.variant_id = variant_id

    # Whatever keys were not popped above are kept as metadata.
    annotation.metadata = jannotation

    # annotation_text is only assigned in the 'generated' branch above;
    # assumes AnnotationTag initialises it to None -- TODO confirm.
    if annotation.annotation_text is None:
        self.next_tag_index += 1
    if self.variant_stack and annotation.variant_id is None:
        # Inherit the variant currently on top of the stack.
        variant_id = self.variant_stack[-1]
        # NOTE(review): compared against the string '0' although the
        # values pushed in this method passed `variant_id > 0` -- confirm
        # where a '0' entry would come from.
        if variant_id == '0':
            variant_id = None
        annotation.variant_id = variant_id

    # look for a closing tag if the content is important
    if annotation.surrounds_attribute:
        self.labelled_tag_stacks[html_tag.tag].append(annotation)
    else:
        # No content to capture: the annotation ends right after its
        # start and is recorded immediately.
        annotation.end_index = annotation.start_index + 1
        self.annotations.append(annotation)