示例#1
0
    def _doextract(self,
                   page,
                   extractors,
                   start_index,
                   end_index,
                   nested_regions=None,
                   ignored_regions=None,
                   **kwargs):
        """Recursively extract records from ``page`` using ``extractors``.

        Searches the page token range [start_index, end_index] (inclusive)
        for a region similar to the first extractor's labelled template
        region, runs that extractor there, then recurses on the remaining
        extractors.

        Returns a ``(pindex, sindex, extracted_data)`` tuple, where pindex
        and sindex bound the page region matched for the first extractor
        and extracted_data is the accumulated list of extracted values.

        Raises MissingRequiredError when nothing was extracted but the
        first extractor's annotation marks an attribute as required.
        """
        # reorder extractors leaving nested ones for the end and separating
        # ignore regions
        nested_regions = nested_regions or []
        ignored_regions = ignored_regions or []
        first_extractor, following_extractors = extractors[0], extractors[1:]
        lelem = labelled_element
        extracted_data = []
        # end_index is inclusive, but similar_region treats it as exclusive
        end_region = None if end_index is None else end_index + 1
        labelled = lelem(first_extractor)
        # Locate the page region most similar to the labelled template region.
        score, pindex, sindex = \
            similar_region(
                page.page_tokens, self.template_tokens, labelled, start_index,
                end_region, self.best_match, **kwargs)
        if score > 0:
            if isinstance(labelled, AnnotationTag):
                # Project each template ignored region onto the matched page
                # region so the extractor can skip over them.
                similar_ignored_regions = []
                start = pindex
                for i in ignored_regions:
                    s, p, e = similar_region(page.page_tokens,
                                             self.template_tokens, i, start,
                                             sindex, self.best_match, **kwargs)
                    if s > 0:
                        similar_ignored_regions.append(PageRegion(p, e))
                        start = e or start  # continue after matched region
                extracted_data = first_extractor.extract(
                    page, pindex, sindex, similar_ignored_regions, **kwargs)
            if following_extractors:
                # Continue extraction after the matched region.
                _, _, following_data = self._doextract(page,
                                                       following_extractors,
                                                       sindex or start_index,
                                                       end_index, **kwargs)
                extracted_data += following_data

        elif following_extractors:
            # First extractor did not match: extract the rest first, then
            # retry the first extractor in the region preceding them.
            end_index, _, following_data = self._doextract(
                page, following_extractors, start_index, end_index, **kwargs)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(
                    page, [first_extractor], start_index, end_index - 1,
                    nested_regions, ignored_regions, **kwargs)
            extracted_data += following_data
        if (not extracted_data and hasattr(first_extractor, 'annotation')
                and first_extractor.annotation):
            annotation = first_extractor.annotation or []
            content = annotation.surrounds_attribute or []
            attributes = annotation.tag_attributes
            attrs = chain(content, *(a for _, a in attributes))
            # Fail the extraction if any attribute is marked required.
            if (any(isinstance(k, dict) and k.get('required') for k in attrs)):
                raise MissingRequiredError()
        return pindex, sindex, extracted_data
示例#2
0
    def _doextract(self, page, extractors, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
        """Carry out extraction of records using the given annotations
        in the page tokens bounded by start_index and end_index

        Returns a (pindex, sindex, extracted_data) tuple where pindex and
        sindex bound the page region matched for the current extractor and
        extracted_data is the list of extracted values.
        """
        # reorder extractors leaving nested ones for the end and separating
        # ignore regions
        nested_regions = nested_regions or []
        ignored_regions = ignored_regions or []
        current_extractor, following_extractors = extractors[0], extractors[1:]
        # Peel off extractors whose labelled region starts before the current
        # extractor's region ends: annotation tags (or regions contained in
        # the last collected nested region) are nested, the rest are ignored.
        while (following_extractors and
               _int_cmp(labelled_element(following_extractors[0]).start_index, 'lt',
                        labelled_element(current_extractor).end_index)):
            ex = following_extractors.pop(0)
            labelled = labelled_element(ex)
            if (isinstance(labelled, AnnotationTag) or
                (nested_regions and
                 _int_cmp(labelled_element(nested_regions[-1]).start_index, 'lt', labelled.start_index) and
                 _int_cmp(labelled.start_index, 'lt', labelled_element(nested_regions[-1]).end_index))):
                nested_regions.append(ex)
            else:
                ignored_regions.append(ex)
        extracted_data = []
        # end_index is inclusive, but similar_region treats it as exclusive
        end_index_exclusive = None if end_index is None else end_index + 1
        labelled = labelled_element(current_extractor)
        # Locate the page region most similar to the labelled template region.
        score, pindex, sindex = \
            similar_region(page.page_tokens, self.template_tokens,
                labelled, start_index, end_index_exclusive, self.best_match, **kwargs)
        if score > 0:
            if isinstance(labelled, AnnotationTag):
                # Project each template ignored region onto the matched page
                # region so the extractor can skip over them.
                similar_ignored_regions = []
                start = pindex
                for i in ignored_regions:
                    s, p, e = similar_region(page.page_tokens, self.template_tokens,
                              i, start, sindex, self.best_match, **kwargs)
                    if s > 0:
                        similar_ignored_regions.append(PageRegion(p, e))
                        start = e or start  # continue after matched region
                extracted_data = current_extractor.extract(page, pindex, sindex, similar_ignored_regions, **kwargs)
                if extracted_data:
                    # Wrap variant annotation output with its variant id.
                    if current_extractor.annotation.variant_id:
                        extracted_data = [(current_extractor.annotation.variant_id, extracted_data)]

            if nested_regions:
                _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs)
                extracted_data += nested_data
            if following_extractors:
                # Continue extraction after the matched region.
                _, _, following_data = self._doextract(page, following_extractors, sindex or start_index, end_index, **kwargs)
                extracted_data += following_data

        elif following_extractors:
            # Current extractor did not match: extract the rest first, then
            # retry the current extractor in the region preceding them.
            end_index, _, following_data = self._doextract(page, following_extractors, start_index, end_index, **kwargs)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(page, [current_extractor], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
            extracted_data += following_data
        elif nested_regions:
            _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
            extracted_data += nested_data
        return pindex, sindex, extracted_data
示例#3
0
    def _doextract(self, page, extractors, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
        """Carry out extraction of records using the given annotations
        in the page tokens bounded by start_index and end_index

        Returns a (pindex, sindex, extracted_data) tuple where pindex and
        sindex bound the page region matched for the current extractor and
        extracted_data is the list of extracted values.
        """
        # reorder extractors leaving nested ones for the end and separating
        # ignore regions
        nested_regions = nested_regions or []
        ignored_regions = ignored_regions or []
        current_extractor, following_extractors = extractors[0], extractors[1:]
        # Peel off extractors whose labelled region starts before the current
        # extractor's region ends: annotation tags (or regions contained in
        # the last collected nested region) are nested, the rest are ignored.
        while (following_extractors and
               _int_cmp(labelled_element(following_extractors[0]).start_index, 'lt',
                        labelled_element(current_extractor).end_index)):
            ex = following_extractors.pop(0)
            labelled = labelled_element(ex)
            if (isinstance(labelled, AnnotationTag) or
                (nested_regions and
                 _int_cmp(labelled_element(nested_regions[-1]).start_index, 'lt', labelled.start_index) and
                 _int_cmp(labelled.start_index, 'lt', labelled_element(nested_regions[-1]).end_index))):
                nested_regions.append(ex)
            else:
                ignored_regions.append(ex)
        extracted_data = []
        # end_index is inclusive, but similar_region treats it as exclusive
        end_index_exclusive = None if end_index is None else end_index + 1
        labelled = labelled_element(current_extractor)
        # Locate the page region most similar to the labelled template region.
        score, pindex, sindex = \
            similar_region(page.page_tokens, self.template_tokens,
                labelled, start_index, end_index_exclusive, self.best_match, **kwargs)
        if score > 0:
            if isinstance(labelled, AnnotationTag):
                # Project each template ignored region onto the matched page
                # region so the extractor can skip over them.
                similar_ignored_regions = []
                start = pindex
                for i in ignored_regions:
                    s, p, e = similar_region(page.page_tokens, self.template_tokens,
                              i, start, sindex, self.best_match, **kwargs)
                    if s > 0:
                        similar_ignored_regions.append(PageRegion(p, e))
                        start = e or start  # continue after matched region
                extracted_data = current_extractor.extract(page, pindex, sindex, similar_ignored_regions, **kwargs)
                if extracted_data:
                    # Wrap variant annotation output with its variant id.
                    if current_extractor.annotation.variant_id:
                        extracted_data = [(current_extractor.annotation.variant_id, extracted_data)]

            if nested_regions:
                _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs)
                extracted_data += nested_data
            if following_extractors:
                # Continue extraction after the matched region.
                _, _, following_data = self._doextract(page, following_extractors, sindex or start_index, end_index, **kwargs)
                extracted_data += following_data

        elif following_extractors:
            # Current extractor did not match: extract the rest first, then
            # retry the current extractor in the region preceding them.
            end_index, _, following_data = self._doextract(page, following_extractors, start_index, end_index, **kwargs)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(page, [current_extractor], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
            extracted_data += following_data
        elif nested_regions:
            _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
            extracted_data += nested_data
        return pindex, sindex, extracted_data
示例#4
0
文件: extraction.py 项目: 01-/portia
    def _doextract(self, page, extractors, start_index, end_index,
                   nested_regions=None, ignored_regions=None, **kwargs):
        """Recursively extract records from ``page`` using ``extractors``
        within the page token range [start_index, end_index] (inclusive).

        Returns a (pindex, sindex, extracted_data) tuple; pindex and sindex
        bound the page region matched for the first extractor.

        Raises MissingRequiredError when a required annotation attribute
        did not produce any extracted data.
        """
        # reorder extractors leaving nested ones for the end and separating
        # ignore regions
        nested_regions = nested_regions or []
        ignored_regions = ignored_regions or []
        first_extractor, following_extractors = extractors[0], extractors[1:]
        lelem = labelled_element
        extracted_data = []
        # end_index is inclusive, but similar_region treats it as exclusive
        end_region = None if end_index is None else end_index + 1
        labelled = lelem(first_extractor)
        # Locate the page region most similar to the labelled template region.
        score, pindex, sindex = \
            similar_region(
                page.page_tokens, self.template_tokens, labelled, start_index,
                end_region, self.best_match, **kwargs)
        if score > 0:
            if isinstance(labelled, AnnotationTag):
                # Project each template ignored region onto the matched page
                # region so the extractor can skip over them.
                similar_ignored_regions = []
                start = pindex
                for i in ignored_regions:
                    s, p, e = similar_region(
                        page.page_tokens, self.template_tokens, i, start,
                        sindex, self.best_match, **kwargs)
                    if s > 0:
                        similar_ignored_regions.append(PageRegion(p, e))
                        start = e or start  # continue after matched region
                extracted_data = first_extractor.extract(
                    page, pindex, sindex, similar_ignored_regions, **kwargs)
            if following_extractors:
                # Continue extraction after the matched region.
                _, _, following_data = self._doextract(
                    page, following_extractors, sindex or start_index,
                    end_index, **kwargs)
                extracted_data += following_data

        elif following_extractors:
            # First extractor did not match: extract the rest first, then
            # retry the first extractor in the region preceding them.
            end_index, _, following_data = self._doextract(
                page, following_extractors, start_index, end_index, **kwargs)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(
                    page, [first_extractor], start_index, end_index - 1,
                    nested_regions, ignored_regions, **kwargs
                )
            extracted_data += following_data
        if (hasattr(first_extractor, 'annotation') and
                first_extractor.annotation):
            annotation = first_extractor.annotation or []
            content = annotation.surrounds_attribute or []
            attributes = annotation.tag_attributes
            attrs = chain(content, *(a for _, a in attributes))
            # ids of the attribute annotations that produced data this pass
            extracted_ids = {a['id'] for annos, _ in extracted_data
                             for a in annos
                             if isinstance(a, dict) and 'id' in a}
            # Fail extraction if any required attribute is still missing.
            if (any(isinstance(k, dict) and k.get('required') and
                    k.get('id') not in extracted_ids for k in attrs)):
                raise MissingRequiredError()
        return pindex, sindex, extracted_data
示例#5
0
 def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None):
     """Carry out extraction of records using the given annotations
     in the page tokens bounded by start_index and end_index

     Returns a (pindex, sindex, extracted_data) tuple where pindex and
     sindex bound the page region matched for the first region element
     and extracted_data is the list of extracted values.
     """
     # reorder extractors leaving nested ones for the end and separating
     # ignore regions
     nested_regions = nested_regions or []
     ignored_regions = ignored_regions or []
     first_region, following_regions = region_elements[0], region_elements[1:]
     # Peel off regions that start before the first region ends: annotation
     # tags (or regions contained in the last collected nested region) are
     # nested, the rest are ignored.
     while following_regions and _labelled(following_regions[0]).start_index \
             < _labelled(first_region).end_index:
         region = following_regions.pop(0)
         labelled = _labelled(region)
         if isinstance(labelled, AnnotationTag) or (nested_regions and \
                 _labelled(nested_regions[-1]).start_index < labelled.start_index \
                 < _labelled(nested_regions[-1]).end_index):
             nested_regions.append(region)
         else:
             ignored_regions.append(region)
     extracted_data = []
     # end_index is inclusive, but similar_region treats it as exclusive
     end_region = None if end_index is None else end_index + 1
     labelled = _labelled(first_region)
     # Locate the page region most similar to the labelled template region.
     score, pindex, sindex = \
         similar_region(page.page_tokens, self.template_tokens,
             labelled, start_index, end_region)
     if score > 0:
         if isinstance(labelled, AnnotationTag):
             # Project each template ignored region onto the matched page
             # region so the extractor can skip over them.
             similar_ignored_regions = []
             start = pindex
             for i in ignored_regions:
                 s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                           i, start, sindex)
                 if s > 0:
                     similar_ignored_regions.append(PageRegion(p, e))
                     start = e or start  # continue after matched region
             extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
             if extracted_data:
                 # Wrap variant annotation output with its variant id.
                 if first_region.annotation.variant_id:
                     extracted_data = [(first_region.annotation.variant_id, extracted_data)]

         if nested_regions:
             _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
             extracted_data += nested_data
         if following_regions:
             _, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
             extracted_data += following_data

     elif following_regions:
         # First region did not match: extract the rest first, then retry
         # the first region in the narrowed window preceding them.
         end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
         if end_index is not None:
             pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
         # FIX: always keep the data extracted by the following regions.
         # Previously this was nested inside the `if end_index is not None`
         # branch, silently discarding following_data when the retry window
         # collapsed (end_index is None).
         extracted_data += following_data
     elif nested_regions:
         _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
         extracted_data += nested_data
     return pindex, sindex, extracted_data
示例#6
0
 def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
     """Carry out extraction of records using the given annotations
     in the page tokens bounded by start_index and end_index

     Returns a (pindex, sindex, extracted_data) tuple where pindex and
     sindex bound the page region matched for the first region element
     and extracted_data is the list of extracted values.
     """
     # reorder extractors leaving nested ones for the end and separating
     # ignore regions
     nested_regions = nested_regions or []
     ignored_regions = ignored_regions or []
     first_region, following_regions = region_elements[0], region_elements[1:]
     # Peel off regions that start before the first region ends: annotation
     # tags (or regions contained in the last collected nested region) are
     # nested, the rest are ignored.
     while following_regions and _labelled(following_regions[0]).start_index \
             < _labelled(first_region).end_index:
         region = following_regions.pop(0)
         labelled = _labelled(region)
         if isinstance(labelled, AnnotationTag) or (nested_regions and \
                 _labelled(nested_regions[-1]).start_index < labelled.start_index \
                 < _labelled(nested_regions[-1]).end_index):
             nested_regions.append(region)
         else:
             ignored_regions.append(region)
     extracted_data = []
     # end_index is inclusive, but similar_region treats it as exclusive
     end_region = None if end_index is None else end_index + 1
     labelled = _labelled(first_region)
     # Locate the page region most similar to the labelled template region.
     score, pindex, sindex = \
         similar_region(page.page_tokens, self.template_tokens,
             labelled, start_index, end_region, **kwargs)
     if score > 0:
         if isinstance(labelled, AnnotationTag):
             # Project each template ignored region onto the matched page
             # region so the extractor can skip over them.
             similar_ignored_regions = []
             start = pindex
             for i in ignored_regions:
                 s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                           i, start, sindex, **kwargs)
                 if s > 0:
                     similar_ignored_regions.append(PageRegion(p, e))
                     start = e or start  # continue after matched region
             extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions, **kwargs)
             if extracted_data:
                 # Wrap variant annotation output with its variant id.
                 if first_region.annotation.variant_id:
                     extracted_data = [(first_region.annotation.variant_id, extracted_data)]

         if nested_regions:
             _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs)
             extracted_data += nested_data
         if following_regions:
             _, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index, **kwargs)
             extracted_data += following_data

     elif following_regions:
         # First region did not match: extract the rest first, then retry
         # the first region in the narrowed window preceding them.
         end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
         if end_index is not None:
             pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
         # FIX: always keep the data extracted by the following regions.
         # Previously this was nested inside the `if end_index is not None`
         # branch, silently discarding following_data when the retry window
         # collapsed (end_index is None).
         extracted_data += following_data
     elif nested_regions:
         _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
         extracted_data += nested_data
     return pindex, sindex, extracted_data
示例#7
0
 def extract(self, page, start_index=0, end_index=None,
             ignored_regions=None, **kwargs):
     """Locate the region surrounding the repeated data and extract the
     items found inside it.

     Returns the list of items when this extractor handles many items,
     otherwise the merged result.
     """
     # Widen the window one token to the left, clamp on the right.
     start_index = max(0, start_index - 1)
     token_count = len(page.token_page_indexes)
     end_index = (token_count if end_index is None
                  else min(token_count, end_index + 1))
     match = similar_region(page.page_tokens, self.template_tokens,
                            self.annotation, start_index, end_index,
                            self.best_match, **kwargs)
     region = Region(*match)
     if region.score < 1:
         return []
     surrounding = element_from_page_index(page, start_index)
     extracted = self._extract_items_from_region(region, page,
                                                 ignored_regions,
                                                 surrounding, **kwargs)
     tag = element_from_page_index(page, region.start_index)
     adapted = [self._validate_and_adapt_item(item, page, tag)
                for item in extracted]
     return adapted if self.many else self._merge_items(adapted)
 def extract(self, page, start_index=0, end_index=None,
             ignored_regions=None, **kwargs):
     """Locate the region surrounding the repeated data and run the
     extractors on the data inside it.

     Returns the list of items when this extractor handles many items,
     otherwise the merged result.
     """
     # Shift the window one token left; clamp the right edge to the page.
     start_index = max(0, start_index - 1)
     limit = len(page.token_page_indexes)
     if end_index is None:
         end_index = limit
     else:
         end_index = min(limit, end_index + 1)
     region = Region(*similar_region(page.page_tokens,
                                     self.template_tokens,
                                     self.annotation, start_index,
                                     end_index, self.best_match, **kwargs))
     if region.score < 1:
         return []
     surrounding = element_from_page_index(page, start_index)
     found = self._extract_items_from_region(region, page, ignored_regions,
                                             surrounding, **kwargs)
     tag = element_from_page_index(page, region.start_index)
     validated = [self._validate_and_adapt_item(it, page, tag)
                  for it in found]
     if not self.many:
         return self._merge_items(validated)
     return validated
示例#9
0
 def extract(self, page, start_index=0, end_index=None,
             ignored_regions=None, **kwargs):
     """Locate the region surrounding the repeated data and run each
     child extractor over it, collecting the validated items.
     """
     collected = []
     # Shift the window one token left; clamp the right edge to the page.
     start_index = max(0, start_index - 1)
     max_end_index = len(page.token_page_indexes)
     end_index = (max_end_index - 1 if end_index is None
                  else min(max_end_index, end_index + 1))
     region = Region(*similar_region(page.page_tokens,
                                     self.template_tokens,
                                     self.annotation, start_index,
                                     end_index, self.best_match, **kwargs))
     if region.score < 1:
         return []
     for extractor in self.extractors:
         try:
             item = extractor.extract(page, region.start_index,
                                      region.end_index, ignored_regions,
                                      **kwargs)
         except TypeError:
             # Extractor did not accept this call shape; wrap and retry.
             fallback = SlybotRecordExtractor(extractor.extractors,
                                              extractor.template_tokens)
             item = fallback.extract(page, region.start_index,
                                     region.end_index, ignored_regions,
                                     **kwargs)
         if (isinstance(extractor, RepeatedContainerExtractor) or
                 isinstance(item, list)):
             collected.extend(item)
         else:
             collected.append(item)
     return [self._validate_and_adapt_item(it, page) for it in collected]
示例#10
0
 def extract(self, page, start_index=0, end_index=None,
             ignored_regions=None, **kwargs):
     """Locate the region surrounding the repeated data and run each
     child extractor over it, collecting the validated items.

     Returns [] when no similar region is found or a required attribute
     is missing.
     """
     # Shift the window one token left; clamp the right edge to the page.
     start_index = max(0, start_index - 1)
     limit = len(page.token_page_indexes)
     end_index = (limit - 1 if end_index is None
                  else min(limit, end_index + 1))
     region = Region(*similar_region(page.page_tokens,
                                     self.template_tokens,
                                     self.annotation, start_index,
                                     end_index, self.best_match, **kwargs))
     if region.score < 1:
         return []
     results = []
     for extractor in self.extractors:
         try:
             try:
                 item = extractor.extract(page, region.start_index,
                                          region.end_index,
                                          ignored_regions, **kwargs)
             except TypeError:
                 # Extractor did not accept this call shape; wrap and retry.
                 wrapped = SlybotRecordExtractor(extractor.extractors,
                                                 extractor.template_tokens)
                 item = wrapped.extract(page, region.start_index,
                                        region.end_index, ignored_regions,
                                        **kwargs)
         except MissingRequiredError:
             # A required attribute is absent: discard everything.
             return []
         is_multi = (isinstance(extractor, RepeatedContainerExtractor)
                     or isinstance(item, list))
         if is_multi and item and isinstance(item[0], dict):
             results.extend(item)
         else:
             results.append(item)
     return [self._validate_and_adapt_item(it, page) for it in results]
示例#11
0
    def _doextract(self,
                   page,
                   extractors,
                   start_index,
                   end_index,
                   nested_regions=None,
                   ignored_regions=None,
                   **kwargs):
        """Recursively extract records from ``page`` using ``extractors``
        within the page token range bounded by start_index and end_index.

        Returns a (pindex, sindex, extracted_data) tuple; pindex and sindex
        bound the page region matched for the first extractor.

        Raises MissingRequiredError when a required annotation attribute
        did not produce any extracted data.
        """
        # reorder extractors leaving nested ones for the end and separating
        # ignore regions
        nested_regions = nested_regions or []
        ignored_regions = ignored_regions or []
        first_extractor, following_extractors = extractors[0], extractors[1:]
        # Peel off extractors whose labelled region starts before the first
        # extractor's region ends: annotation tags (or regions contained in
        # the last collected nested region) are nested, the rest are ignored.
        while (following_extractors and _int_cmp(
                labelled_element(following_extractors[0]).start_index, 'lt',
                labelled_element(first_extractor).end_index)):
            ex = following_extractors.pop(0)
            labelled = labelled_element(ex)
            if (isinstance(labelled, AnnotationTag)
                    or (nested_regions and _int_cmp(
                        labelled_element(nested_regions[-1]).start_index, 'lt',
                        labelled.start_index) and _int_cmp(
                            labelled.start_index, 'lt',
                            labelled_element(nested_regions[-1]).end_index))):
                nested_regions.append(ex)
            else:
                ignored_regions.append(ex)
        lelem = labelled_element
        extracted_data = []
        # end_index is inclusive, but similar_region treats it as exclusive
        end_region = None if end_index is None else end_index + 1
        # widen the search window one token to the left
        start_region = None if start_index is None else start_index - 1
        labelled = lelem(first_extractor)
        try:
            score, pindex, sindex = similar_region(page.page_tokens,
                                                   self.template_tokens,
                                                   labelled, start_region,
                                                   end_region, self.best_match,
                                                   **kwargs)
        except IndexError:
            # Widened window fell outside the page; retry with the
            # original bounds.
            start_region, end_region = start_index, end_index
            score, pindex, sindex = similar_region(page.page_tokens,
                                                   self.template_tokens,
                                                   labelled, start_region,
                                                   end_region, self.best_match,
                                                   **kwargs)

        if score > 0:
            if isinstance(labelled, AnnotationTag):
                # Project each template ignored region onto the matched page
                # region so the extractor can skip over them.
                similar_ignored_regions = []
                start = pindex
                for i in ignored_regions:
                    s, p, e = similar_region(page.page_tokens,
                                             self.template_tokens, i, start,
                                             sindex, self.best_match, **kwargs)
                    if s > 0:
                        similar_ignored_regions.append(PageRegion(p, e))
                        start = e or start  # continue after matched region
                extracted_data = first_extractor.extract(
                    page, pindex, sindex, similar_ignored_regions, **kwargs)
            if following_extractors:
                # NOTE(review): 'previous' presumably tells similar_region
                # where the last match ended — confirm its semantics there.
                previous_extraction = start_region or sindex
                if previous_extraction:
                    kwargs['previous'] = previous_extraction + 1
                _, _, following_data = self._doextract(page,
                                                       following_extractors,
                                                       sindex or start_region,
                                                       end_index, **kwargs)
                extracted_data += following_data
            if nested_regions:
                _, _, nested_data = self._doextract(page, nested_regions,
                                                    pindex, sindex, **kwargs)
                extracted_data += nested_data
        elif following_extractors:
            # First extractor did not match: extract the rest first, then
            # retry the first extractor in the region preceding them.
            end_index, _, following_data = self._doextract(
                page, following_extractors, start_index, end_index, **kwargs)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(
                    page, [first_extractor], start_region, end_index,
                    nested_regions, ignored_regions, **kwargs)
                if extracted_data and sindex:
                    kwargs['previous'] = sindex + 1
            extracted_data += following_data
        elif nested_regions:
            _, _, nested_data = self._doextract(page, nested_regions,
                                                start_index, end_index,
                                                **kwargs)
            extracted_data += nested_data

        if (hasattr(first_extractor, 'annotation')
                and first_extractor.annotation):
            annotation = first_extractor.annotation or []
            content = annotation.surrounds_attribute or []
            attributes = annotation.tag_attributes
            attrs = chain(content, *(a for _, a in attributes))
            # ids of the attribute annotations that produced data this pass
            extracted_ids = {
                a['id']
                for annos, _ in extracted_data for a in annos
                if isinstance(a, dict) and 'id' in a
            }
            # Fail extraction if any required attribute is still missing.
            if (any(
                    isinstance(k, dict) and k.get('required')
                    and k.get('id') not in extracted_ids for k in attrs)):
                raise MissingRequiredError()
        return pindex, sindex, extracted_data