def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Run ``extractors`` over ``page`` tokens bounded by ``start_index``
    and ``end_index`` (both inclusive).

    Returns ``(pindex, sindex, extracted_data)``: the page-token region
    matched for the first extractor plus all accumulated extracted data.

    Raises:
        MissingRequiredError: when nothing was extracted and the first
            extractor's annotation declares a ``required`` attribute.
    """
    # NOTE(review): this variant does not actually reorder ``extractors``
    # into nested/ignored groups (unlike sibling variants); the two lists
    # below are used exactly as the caller supplied them.
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_extractor, following_extractors = extractors[0], extractors[1:]
    lelem = labelled_element
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = lelem(first_extractor)
    score, pindex, sindex = \
        similar_region(
            page.page_tokens, self.template_tokens, labelled,
            start_index, end_region, self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it during extraction.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens, i, start,
                                         sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = first_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
        if following_extractors:
            # Continue after the matched region; fall back to start_index
            # when sindex is falsy.
            _, _, following_data = self._doextract(page, following_extractors,
                                                   sindex or start_index,
                                                   end_index, **kwargs)
            extracted_data += following_data
    elif following_extractors:
        # First extractor failed to match: extract the rest first, then
        # retry the first one inside the narrowed region preceding them.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_extractor], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    # Enforce "required" attributes: if nothing at all was extracted for an
    # annotation that carries a required attribute, abort the extraction.
    if (not extracted_data and hasattr(first_extractor, 'annotation') and
            first_extractor.annotation):
        annotation = first_extractor.annotation or []
        content = annotation.surrounds_attribute or []
        attributes = annotation.tag_attributes
        attrs = chain(content, *(a for _, a in attributes))
        if (any(isinstance(k, dict) and k.get('required') for k in attrs)):
            raise MissingRequiredError()
    return pindex, sindex, extracted_data
def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Carry out extraction of records using the given annotations
    in the page tokens bounded by start_index and end_index.

    Returns ``(pindex, sindex, extracted_data)``: the page-token region
    matched for the current extractor plus all accumulated data.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    current_extractor, following_extractors = extractors[0], extractors[1:]
    # Pull out every following extractor whose region starts before the
    # current one ends: AnnotationTags (and regions nested inside the last
    # nested region) are treated as nested, anything else as ignored.
    while (following_extractors and
           _int_cmp(labelled_element(following_extractors[0]).start_index,
                    'lt', labelled_element(current_extractor).end_index)):
        ex = following_extractors.pop(0)
        labelled = labelled_element(ex)
        if (isinstance(labelled, AnnotationTag) or
                (nested_regions and
                 _int_cmp(labelled_element(nested_regions[-1]).start_index,
                          'lt', labelled.start_index) and
                 _int_cmp(labelled.start_index, 'lt',
                          labelled_element(nested_regions[-1]).end_index))):
            nested_regions.append(ex)
        else:
            ignored_regions.append(ex)
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_index_exclusive = None if end_index is None else end_index + 1
    labelled = labelled_element(current_extractor)
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_index_exclusive,
                       self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens, i, start,
                                         sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = current_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
            if extracted_data:
                # Tag data belonging to a template variant with its id.
                if current_extractor.annotation.variant_id:
                    extracted_data = [(current_extractor.annotation.variant_id,
                                       extracted_data)]
        if nested_regions:
            # Nested regions are searched inside the matched span.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex, **kwargs)
            extracted_data += nested_data
        if following_extractors:
            _, _, following_data = self._doextract(page, following_extractors,
                                                   sindex or start_index,
                                                   end_index, **kwargs)
            extracted_data += following_data
    elif following_extractors:
        # Current extractor failed to match: extract the rest first, then
        # retry the current one inside the region preceding them.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [current_extractor], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    elif nested_regions:
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index, **kwargs)
        extracted_data += nested_data
    return pindex, sindex, extracted_data
def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Run ``extractors`` over ``page`` tokens bounded by ``start_index``
    and ``end_index`` (both inclusive).

    Returns ``(pindex, sindex, extracted_data)``: the page-token region
    matched for the first extractor plus all accumulated extracted data.

    Raises:
        MissingRequiredError: when the first extractor's annotation declares
            a ``required`` attribute whose id is absent from the ids seen in
            ``extracted_data``.
    """
    # NOTE(review): this variant does not actually reorder ``extractors``
    # into nested/ignored groups (unlike sibling variants); the two lists
    # below are used exactly as the caller supplied them.
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_extractor, following_extractors = extractors[0], extractors[1:]
    lelem = labelled_element
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = lelem(first_extractor)
    score, pindex, sindex = \
        similar_region(
            page.page_tokens, self.template_tokens, labelled,
            start_index, end_region, self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(
                    page.page_tokens, self.template_tokens, i, start,
                    sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = first_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
        if following_extractors:
            # Continue after the matched region; fall back to start_index
            # when sindex is falsy.
            _, _, following_data = self._doextract(
                page, following_extractors, sindex or start_index,
                end_index, **kwargs)
            extracted_data += following_data
    elif following_extractors:
        # First extractor failed to match: extract the rest first, then
        # retry the first one inside the region preceding them.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_extractor], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs
            )
        extracted_data += following_data
    # Enforce "required" attributes: an annotation attribute marked
    # required must have its id among the ids actually extracted.
    if (hasattr(first_extractor, 'annotation') and
            first_extractor.annotation):
        annotation = first_extractor.annotation or []
        content = annotation.surrounds_attribute or []
        attributes = annotation.tag_attributes
        attrs = chain(content, *(a for _, a in attributes))
        # Ids already present in the extracted annotation dicts.
        # NOTE(review): assumes each extracted_data entry unpacks as
        # ``(annotations, value)`` — confirm against the extractors.
        extracted_ids = {a['id'] for annos, _ in extracted_data
                         for a in annos
                         if isinstance(a, dict) and 'id' in a}
        if (any(isinstance(k, dict) and k.get('required') and
                k.get('id') not in extracted_ids for k in attrs)):
            raise MissingRequiredError()
    return pindex, sindex, extracted_data
def _doextract(self, page, region_elements, start_index, end_index,
               nested_regions=None, ignored_regions=None):
    """Carry out extraction of records using the given annotations
    in the page tokens bounded by start_index and end_index.

    Returns ``(pindex, sindex, extracted_data)``: the page-token region
    matched for the first region element plus all accumulated data.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_region, following_regions = region_elements[0], region_elements[1:]
    # Pull out every following region that starts before the first one
    # ends: AnnotationTags (and regions nested inside the last nested
    # region) are treated as nested, anything else as ignored.
    while following_regions and _labelled(following_regions[0]).start_index \
            < _labelled(first_region).end_index:
        region = following_regions.pop(0)
        labelled = _labelled(region)
        if isinstance(labelled, AnnotationTag) or (nested_regions and \
                _labelled(nested_regions[-1]).start_index < labelled.start_index \
                < _labelled(nested_regions[-1]).end_index):
            nested_regions.append(region)
        else:
            ignored_regions.append(region)
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = _labelled(first_region)
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_region)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                                         i, start, sindex)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = first_region.extract(page, pindex, sindex,
                                                  similar_ignored_regions)
            if extracted_data:
                # Tag data belonging to a template variant with its id.
                if first_region.annotation.variant_id:
                    extracted_data = [(first_region.annotation.variant_id,
                                       extracted_data)]
        if nested_regions:
            # Nested regions are searched inside the matched span.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex)
            extracted_data += nested_data
        if following_regions:
            _, _, following_data = self._doextract(page, following_regions,
                                                   sindex or start_index,
                                                   end_index)
            extracted_data += following_data
    elif following_regions:
        # First region failed to match: extract the rest first, then retry
        # the first one inside the region preceding them.
        end_index, _, following_data = self._doextract(page, following_regions,
                                                       start_index, end_index)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_region], start_index, end_index - 1,
                nested_regions, ignored_regions)
        extracted_data += following_data
    elif nested_regions:
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index)
        extracted_data += nested_data
    return pindex, sindex, extracted_data
def _doextract(self, page, region_elements, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Carry out extraction of records using the given annotations
    in the page tokens bounded by start_index and end_index.

    Same algorithm as the kwargs-less sibling, but threads ``**kwargs``
    through every ``similar_region``/``extract`` call.

    Returns ``(pindex, sindex, extracted_data)``.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_region, following_regions = region_elements[0], region_elements[1:]
    # Pull out every following region that starts before the first one
    # ends: AnnotationTags (and regions nested inside the last nested
    # region) are treated as nested, anything else as ignored.
    while following_regions and _labelled(following_regions[0]).start_index \
            < _labelled(first_region).end_index:
        region = following_regions.pop(0)
        labelled = _labelled(region)
        if isinstance(labelled, AnnotationTag) or (nested_regions and \
                _labelled(nested_regions[-1]).start_index < labelled.start_index \
                < _labelled(nested_regions[-1]).end_index):
            nested_regions.append(region)
        else:
            ignored_regions.append(region)
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = _labelled(first_region)
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_region, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                                         i, start, sindex, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = first_region.extract(page, pindex, sindex,
                                                  similar_ignored_regions,
                                                  **kwargs)
            if extracted_data:
                # Tag data belonging to a template variant with its id.
                if first_region.annotation.variant_id:
                    extracted_data = [(first_region.annotation.variant_id,
                                       extracted_data)]
        if nested_regions:
            # Nested regions are searched inside the matched span.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex, **kwargs)
            extracted_data += nested_data
        if following_regions:
            _, _, following_data = self._doextract(page, following_regions,
                                                   sindex or start_index,
                                                   end_index, **kwargs)
            extracted_data += following_data
    elif following_regions:
        # First region failed to match: extract the rest first, then retry
        # the first one inside the region preceding them.
        end_index, _, following_data = self._doextract(page,
                                                       following_regions,
                                                       start_index, end_index,
                                                       **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_region], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    elif nested_regions:
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index, **kwargs)
        extracted_data += nested_data
    return pindex, sindex, extracted_data
def extract(self, page, start_index=0, end_index=None, ignored_regions=None,
            **kwargs):
    """
    Find a region surrounding repeated data and run extractors on the
    data in that region.
    """
    # Widen the search window by one token on each side, clamped to the
    # bounds of the page's token index list.
    token_count = len(page.token_page_indexes)
    lo = max(0, start_index - 1)
    hi = token_count if end_index is None else min(token_count, end_index + 1)
    match = Region(*similar_region(page.page_tokens, self.template_tokens,
                                   self.annotation, lo, hi,
                                   self.best_match, **kwargs))
    # Anything below a perfect score means the region was not found.
    if match.score < 1:
        return []
    enclosing = element_from_page_index(page, lo)
    extracted = self._extract_items_from_region(match, page, ignored_regions,
                                                enclosing, **kwargs)
    anchor_tag = element_from_page_index(page, match.start_index)
    adapted = [self._validate_and_adapt_item(item, page, anchor_tag)
               for item in extracted]
    # Multi-valued extractors return every item; otherwise merge into one.
    return adapted if self.many else self._merge_items(adapted)
def extract(self, page, start_index=0, end_index=None, ignored_regions=None,
            **kwargs):
    """
    Find a region surrounding repeated data and run extractors on the
    data in that region.
    """
    # Expand the window one token outward, clamped to the page bounds.
    start_index = max(0, start_index - 1)
    limit = len(page.token_page_indexes)
    end_index = limit if end_index is None else min(limit, end_index + 1)
    candidate = similar_region(page.page_tokens, self.template_tokens,
                               self.annotation, start_index, end_index,
                               self.best_match, **kwargs)
    found = Region(*candidate)
    # A score under 1 means no matching region; nothing to extract.
    if found.score < 1:
        return []
    container = element_from_page_index(page, start_index)
    raw_items = self._extract_items_from_region(found, page, ignored_regions,
                                                container, **kwargs)
    first_tag = element_from_page_index(page, found.start_index)
    validated = [self._validate_and_adapt_item(it, page, first_tag)
                 for it in raw_items]
    # Single-valued extractors merge everything into one item.
    if not self.many:
        return self._merge_items(validated)
    return validated
def extract(self, page, start_index=0, end_index=None, ignored_regions=None,
            **kwargs):
    """
    Find a region surrounding repeated data and run extractors on the
    data in that region.
    """
    # Clamp the window to the page's token bounds (one token of slack at
    # the start; note the None case uses the last valid index).
    start_index = max(0, start_index - 1)
    token_count = len(page.token_page_indexes)
    end_index = (token_count - 1 if end_index is None
                 else min(token_count, end_index + 1))
    region = Region(*similar_region(page.page_tokens, self.template_tokens,
                                    self.annotation, start_index, end_index,
                                    self.best_match, **kwargs))
    # A score under 1 means the surrounding region was not found.
    if region.score < 1:
        return []

    def _run(one_extractor):
        # Retry through a SlybotRecordExtractor built from the same parts
        # when the extractor's ``extract`` signature rejects these args.
        try:
            return one_extractor.extract(page, region.start_index,
                                         region.end_index, ignored_regions,
                                         **kwargs)
        except TypeError:
            fallback = SlybotRecordExtractor(one_extractor.extractors,
                                             one_extractor.template_tokens)
            return fallback.extract(page, region.start_index,
                                    region.end_index, ignored_regions,
                                    **kwargs)

    collected = []
    for extractor in self.extractors:
        result = _run(extractor)
        # Repeated containers (and list results) contribute many items.
        if isinstance(extractor, RepeatedContainerExtractor) or \
                isinstance(result, list):
            collected.extend(result)
        else:
            collected.append(result)
    return [self._validate_and_adapt_item(i, page) for i in collected]
def extract(self, page, start_index=0, end_index=None, ignored_regions=None,
            **kwargs):
    """
    Find a region surrounding repeated data and run extractors on the
    data in that region.
    """
    # Clamp the window to the page's token bounds (one token of slack at
    # the start; note the None case uses the last valid index).
    start_index = max(0, start_index - 1)
    upper = len(page.token_page_indexes)
    end_index = upper - 1 if end_index is None else min(upper, end_index + 1)
    region = Region(*similar_region(page.page_tokens, self.template_tokens,
                                    self.annotation, start_index, end_index,
                                    self.best_match, **kwargs))
    # A score under 1 means the surrounding region was not found.
    if region.score < 1:
        return []
    collected = []
    for extractor in self.extractors:
        try:
            try:
                result = extractor.extract(page, region.start_index,
                                           region.end_index, ignored_regions,
                                           **kwargs)
            except TypeError:
                # Signature mismatch: retry through a SlybotRecordExtractor
                # built from the same parts.
                wrapped = SlybotRecordExtractor(extractor.extractors,
                                                extractor.template_tokens)
                result = wrapped.extract(page, region.start_index,
                                         region.end_index, ignored_regions,
                                         **kwargs)
        except MissingRequiredError:
            # A required annotation was missing: void the whole extraction.
            return []
        flatten = (isinstance(extractor, RepeatedContainerExtractor) or
                   isinstance(result, list))
        # Only flatten when the result is a non-empty list of dicts.
        if flatten and result and isinstance(result[0], dict):
            collected.extend(result)
        else:
            collected.append(result)
    return [self._validate_and_adapt_item(i, page) for i in collected]
def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Run ``extractors`` over ``page`` tokens bounded by ``start_index``
    and ``end_index``.

    Returns ``(pindex, sindex, extracted_data)``: the page-token region
    matched for the first extractor plus all accumulated extracted data.

    Raises:
        MissingRequiredError: when the first extractor's annotation declares
            a ``required`` attribute whose id is absent from the ids seen in
            ``extracted_data``.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_extractor, following_extractors = extractors[0], extractors[1:]
    # Pull out every following extractor whose region starts before the
    # first one ends: AnnotationTags (and regions nested inside the last
    # nested region) are treated as nested, anything else as ignored.
    while (following_extractors and _int_cmp(
            labelled_element(following_extractors[0]).start_index,
            'lt', labelled_element(first_extractor).end_index)):
        ex = following_extractors.pop(0)
        labelled = labelled_element(ex)
        if (isinstance(labelled, AnnotationTag) or
                (nested_regions and _int_cmp(
                    labelled_element(nested_regions[-1]).start_index,
                    'lt', labelled.start_index) and _int_cmp(
                    labelled.start_index, 'lt',
                    labelled_element(nested_regions[-1]).end_index))):
            nested_regions.append(ex)
        else:
            ignored_regions.append(ex)
    lelem = labelled_element
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive;
    # the start is also widened by one token in this variant.
    end_region = None if end_index is None else end_index + 1
    start_region = None if start_index is None else start_index - 1
    labelled = lelem(first_extractor)
    try:
        score, pindex, sindex = similar_region(page.page_tokens,
                                               self.template_tokens, labelled,
                                               start_region, end_region,
                                               self.best_match, **kwargs)
    except IndexError:
        # The widened bounds overran the token arrays: retry with the
        # original, unwidened bounds.
        start_region, end_region = start_index, end_index
        score, pindex, sindex = similar_region(page.page_tokens,
                                               self.template_tokens, labelled,
                                               start_region, end_region,
                                               self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Locate each ignored region inside the matched span so the
            # extractor can skip it.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens, i, start,
                                         sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # ``e`` may be falsy (open-ended match): keep old start
                    start = e or start
            extracted_data = first_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
        if following_extractors:
            # Record where the previous extraction ended so downstream
            # calls can pick up after it via kwargs['previous'].
            previous_extraction = start_region or sindex
            if previous_extraction:
                kwargs['previous'] = previous_extraction + 1
            _, _, following_data = self._doextract(
                page, following_extractors, sindex or start_region,
                end_index, **kwargs)
            extracted_data += following_data
        if nested_regions:
            # Nested regions are searched inside the matched span.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex, **kwargs)
            extracted_data += nested_data
    elif following_extractors:
        # First extractor failed to match: extract the rest first, then
        # retry the first one inside the region preceding them.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_extractor], start_region, end_index,
                nested_regions, ignored_regions, **kwargs)
            if extracted_data and sindex:
                kwargs['previous'] = sindex + 1
        extracted_data += following_data
    elif nested_regions:
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index, **kwargs)
        extracted_data += nested_data
    # Enforce "required" attributes: an annotation attribute marked
    # required must have its id among the ids actually extracted.
    if (hasattr(first_extractor, 'annotation') and
            first_extractor.annotation):
        annotation = first_extractor.annotation or []
        content = annotation.surrounds_attribute or []
        attributes = annotation.tag_attributes
        attrs = chain(content, *(a for _, a in attributes))
        # Ids already present in the extracted annotation dicts.
        # NOTE(review): assumes each extracted_data entry unpacks as
        # ``(annotations, value)`` — confirm against the extractors.
        extracted_ids = {
            a['id'] for annos, _ in extracted_data for a in annos
            if isinstance(a, dict) and 'id' in a
        }
        if (any(
                isinstance(k, dict) and k.get('required') and
                k.get('id') not in extracted_ids for k in attrs)):
            raise MissingRequiredError()
    return pindex, sindex, extracted_data