def filter_redactions_by_pixmap(
    redactions: List[RedactionType],
    page: Page,
) -> List[RedactionType]:
    """Keep only the candidate redactions that render as a single-color box

    Each candidate's bounding box is rendered from the page as a pixmap. A
    candidate is kept when that pixmap reports itself as unicolor, and
    dropped when the rendered pixels vary in color.

    :param redactions: Candidate bad redactions; each item must provide a
    "bbox" entry that can be passed to fitz.Rect
    :param page: The PyMuPDF page the candidates were found on
    :return: The candidates whose rendered bounding box is unicolor, in
    their original order
    """

    def renders_as_solid_box(candidate) -> bool:
        # The clip is rendered in the RGB colorspace, so the unicolor check
        # is made against the color pixels of that region.
        # NOTE(review): a grayscale render is presumably cheaper but could
        # change which regions count as unicolor -- confirm before switching.
        rendered = page.get_pixmap(
            colorspace=fitz.csRGB,
            clip=fitz.Rect(candidate["bbox"]),
        )
        # Variation in pixel color means the region is not a uniform box,
        # so the candidate is not treated as a bad redaction.
        return rendered.is_unicolor

    return [
        candidate
        for candidate in redactions
        if renders_as_solid_box(candidate)
    ]
def image_from_page(page: fitz.Page, scale: float = 1) -> Image:
    """Render a page to a pixmap and wrap its raw samples as an image.

    :param page: the page to be represented as an image
    :param scale: the factor applied to both axes when rendering the page
    :return: the image built by Image.frombytes from the rendered pixmap
    """
    # The same factor is used for both matrix arguments, so the page is
    # scaled uniformly.
    zoom = fitz.Matrix(scale, scale)
    rendered = page.get_pixmap(matrix=zoom)  # type: ignore

    # The pixmap's alpha flag decides whether the samples are read as
    # four-channel or three-channel data.
    mode = "RGBA" if rendered.alpha else "RGB"
    dimensions = (rendered.width, rendered.height)
    return Image.frombytes(mode, dimensions, rendered.samples)  # type: ignore