def strip_invisible_text(pdf, page):
    """Rewrite ``page``'s content stream without invisible text objects.

    The page's content streams are coalesced and parsed into instructions.
    Every ``BT`` ... ``ET`` text object whose text rendering mode is 3 when
    its ``ET`` is reached is dropped; all other instructions are kept in
    their original order. The result is unparsed and assigned to
    ``page.Contents`` as a new ``Stream`` owned by ``pdf``.

    The text rendering mode (``Tr``) is tracked as graphics state: it
    persists from one text object to the next, may be set outside a text
    object, and is saved/restored by ``q``/``Q``. It is therefore *not*
    reset when a new text object begins.

    Args:
        pdf: The PDF that will own the replacement content stream.
        page: The page object whose contents are rewritten in place.

    Returns:
        None. ``page.Contents`` is replaced as a side effect.

    Note:
        The keep/drop decision is made per text object, using the mode in
        effect at ``ET``; a text object that switches modes part-way is
        kept or dropped as a whole. A text object that is never closed by
        ``ET`` is not written to the output.
    """
    # Build the operators once instead of on every loop iteration.
    op_begin_text = Operator('BT')
    op_end_text = Operator('ET')
    op_render_mode = Operator('Tr')
    op_save_state = Operator('q')
    op_restore_state = Operator('Q')

    stream = []  # instructions that survive into the new content stream
    text_objects = []  # instructions of the text object being read
    in_text_obj = False
    render_mode = 0  # initial text rendering mode at the start of a page
    render_mode_stack = []  # modes saved by 'q', restored by 'Q'

    # Merge multiple content streams so one parse sees the whole page.
    rich_page = Page(page)
    rich_page.contents_coalesce()

    for operands, operator in parse_content_stream(page, ''):
        # Track the rendering mode wherever it changes, not only inside a
        # text object: it is part of the graphics state.
        if operator == op_render_mode:
            render_mode = operands[0]
        elif operator == op_save_state:
            render_mode_stack.append(render_mode)
        elif operator == op_restore_state:
            # An unbalanced 'Q' has nothing to restore; keep the current
            # mode and carry on rather than failing on a sloppy stream.
            if render_mode_stack:
                render_mode = render_mode_stack.pop()

        if not in_text_obj:
            if operator == op_begin_text:
                in_text_obj = True
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            text_objects.append((operands, operator))
            if operator == op_end_text:
                in_text_obj = False
                # Mode 3 draws nothing: discard the whole text object.
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    content_stream = unparse_content_stream(stream)
    page.Contents = Stream(pdf, content_stream)
def test_externalize(resources):
    """Check that an inline image on the first page becomes an image XObject.

    Opens ``image-mono-inline.pdf`` from ``resources``, confirms the first
    page starts with an inline image and no XObject resources, calls
    ``externalize_inline_images()``, then verifies an image XObject exists
    and that the page label is still ``'1'``.
    """
    pdf_path = resources / 'image-mono-inline.pdf'
    with Pdf.open(pdf_path) as pdf:
        first_page = Page(pdf.pages[0])
        first_page.contents_coalesce()

        # Preconditions: inline image present, no XObjects yet.
        raw_contents = first_page.obj.Contents.read_bytes()
        assert b'BI' in raw_contents, "no inline image"
        assert Name.XObject not in first_page.obj.Resources, "expected no xobjs"

        first_page.externalize_inline_images()

        # Postconditions: the image now lives in the page resources.
        assert Name.XObject in first_page.obj.Resources, "image not created"

        page_images = pdf.pages[0].images
        first_image = next(iter(page_images.values()))
        assert first_image.Subtype == Name.Image
        assert first_page.label == '1'