示例#1
0
    def __init__(self, doc: fitz.Document, page: fitz.Page) -> None:
        """Store the document/page pair and collect the page's usable images.

        Args:
            doc: Kept on the instance as ``self.doc``.
            page: Kept as ``self._page``; its bounds and image list are read
                here.
        """
        self.doc = doc
        self._page = page

        # Cropbox of the page, already rotated; used to work out which part
        # of an image is actually displayed.
        self.bbox = page.bound()

        # Build a record for every image referenced by the page.
        candidates = [
            self._build_xref_image(entry)
            for entry in page.get_images(full=True)
        ]

        # Collect the xrefs that serve as soft masks for other images
        # (falsy 'smask' values mean "no mask" and are skipped).
        mask_xrefs = set()
        for candidate in candidates:
            smask = candidate['smask']
            if smask:
                mask_xrefs.add(smask)

        # Masks are left out because they can't be easily downsampled.
        self.xref_images = [
            candidate
            for candidate in candidates
            if candidate['xref'] not in mask_xrefs
        ]

        # Lazy, memoized attributes

        # Mapping of block numbers to image hashes; filled in on first use.
        self._block_hashes = None
示例#2
0
 def _analyze_page(self, page: fitz.Page) -> None:
     """Analyze `page` and append one entry to each per-page record.

     The text already embedded in the page is used directly when both
     hold:

     * the ratio of image area to page area is below
       `self.image_area_thresh`, and
     * the text contains at most `self.max_unreadable` '�' characters.

     Otherwise the page is passed to `self._run_ocr`. If the mean OCR
     confidence is below `self.coarse_thresh` a warning is issued, but
     the OCR result is still recorded like any other.

     In both cases exactly one item is appended to `texts`,
     `mean_confidences`, `metadata`, `orientations`, `page_languages`,
     `used_original_texts`, `times` and `scales`, and the page language
     is fed to `self.languages.add_weight`.
     """
     original_text = page.get_text()  # type: ignore
     # NOTE(review): `getArea` is the legacy camelCase spelling while the
     # rest of this code uses snake_case PyMuPDF names; confirm it exists
     # in the targeted PyMuPDF version (newer releases spell it `get_area`).
     image_ratio = total_image_area(page) / page.bound().getArea()
     # str.count avoids materialising a list just to take its length.
     unreadable_count = original_text.count('�')
     if (
         image_ratio < self.image_area_thresh
         and unreadable_count <= self.max_unreadable
     ):
         # Embedded text is good enough: no OCR metadata is produced, so
         # the OCR-only fields are recorded as None.
         metadata, orientation_used, scale = None, None, None
         language = detected_language(original_text)
         self.texts.append(original_text)
         self.mean_confidences.append(None)
         used_original_text = True
     else:
         # Only trust language detection when there is enough embedded
         # text; otherwise fall back to the first entry of
         # `self.languages.items`.
         if len(original_text) >= self.text_len_thresh:
             ocr_language = detected_language(original_text)
         else:
             ocr_language = self.languages.items[0]
         metadata, orientation_used, language, scale = self._run_ocr(
             page,
             ocr_language,
         )
         # Computed once and reused for both the check and the record.
         confidence = mean_conf(metadata)
         if confidence < self.coarse_thresh:
             warnings.warn('Failed to analyze image.')
         # Prefer the 'corrected' column when the OCR output has one.
         self.texts.append(data_to_string(
             metadata.corrected if 'corrected' in metadata.columns
             else metadata.text
         ))
         self.mean_confidences.append(confidence)
         used_original_text = False
     self.languages.add_weight(language)
     self.metadata.append(metadata)
     self.orientations.append(orientation_used)
     self.page_languages.append(language)
     self.used_original_texts.append(used_original_text)
     self.times.append(time.time())
     self.scales.append(scale)