def __init__(self, doc: fitz.Document, page: fitz.Page) -> None:
    """Gather the cross-referenced images of ``page`` that are not soft masks.

    Stores ``doc`` and ``page``, the page's bounding box, and the list of
    image records built by ``self._build_xref_image`` whose xref is not
    listed as the ``'smask'`` of another image on the same page.
    """
    self.doc = doc
    self._page = page

    # Page's cropbox, already rotated; helps identify which part of an
    # image is actually being displayed.
    self.bbox = page.bound()

    # One record per image referenced by the page, in the order reported.
    every_image = [
        self._build_xref_image(entry)
        for entry in page.get_images(full=True)
    ]

    # Xrefs used as soft masks by some image (falsy 'smask' values are
    # dropped). Masks are ignored since they can't be easily downsampled.
    mask_xrefs = {
        smask
        for smask in (image['smask'] for image in every_image)
        if smask
    }
    self.xref_images = [
        image for image in every_image if image['xref'] not in mask_xrefs
    ]

    # Lazy, memoized attribute: matches block numbers to image hashes.
    self._block_hashes = None
def _analyze_page(self, page: fitz.Page):
    """Analyze ``page`` and append the extracted data to the per-page records.

    The page's embedded text is used directly when both hold:

    * the images on the page cover less than ``self.image_area_thresh`` of
      the page area, and
    * the text contains at most ``self.max_unreadable`` '�' characters.

    Otherwise the page is passed to ``self._run_ocr``. The language given
    to OCR is detected from the embedded text when that text has at least
    ``self.text_len_thresh`` characters, else ``self.languages.items[0]``.

    A warning is issued when the mean OCR confidence is below
    ``self.coarse_thresh``; the OCR result is still recorded in that case.
    NOTE(review): low confidence only warns and does not skip the page;
    confirm this is the intended behaviour.

    Exactly one entry is appended to each of ``texts``, ``mean_confidences``,
    ``metadata``, ``orientations``, ``page_languages``,
    ``used_original_texts``, ``times`` and ``scales``.
    """
    original_text = page.get_text()  # type: ignore

    # NOTE(review): `getArea` looks like PyMuPDF's legacy camelCase alias of
    # `Rect.get_area`; confirm the installed version still provides it.
    image_coverage = total_image_area(page) / page.bound().getArea()
    text_is_usable = (
        image_coverage < self.image_area_thresh
        and original_text.count('�') <= self.max_unreadable
    )

    if text_is_usable:
        # No OCR was run, so there is no OCR metadata, orientation or scale.
        metadata = orientation_used = scale = None
        language = detected_language(original_text)
        self.texts.append(original_text)
        self.mean_confidences.append(None)
        used_original_text = True
    else:
        # Only trust language detection when there is enough embedded text.
        if len(original_text) >= self.text_len_thresh:
            language_hint = detected_language(original_text)
        else:
            language_hint = self.languages.items[0]
        metadata, orientation_used, language, scale = self._run_ocr(
            page, language_hint
        )

        confidence = mean_conf(metadata)
        if confidence < self.coarse_thresh:
            warnings.warn('Failed to analyze image.')

        # Prefer the 'corrected' column when the OCR output has one.
        self.texts.append(data_to_string(
            metadata.corrected if 'corrected' in metadata.columns
            else metadata.text
        ))
        self.mean_confidences.append(confidence)
        used_original_text = False

    self.languages.add_weight(language)
    self.metadata.append(metadata)
    self.orientations.append(orientation_used)
    self.page_languages.append(language)
    self.used_original_texts.append(used_original_text)
    self.times.append(time.time())
    self.scales.append(scale)