Пример #1
0
def strip_invisible_text(pdf, page):
    """Replace *page*'s content stream with one lacking invisible text objects.

    The page's content stream is parsed into instructions. Everything outside
    a ``BT`` ... ``ET`` text object is copied through unchanged. Each text
    object is buffered and then dropped if the last ``Tr`` (text rendering
    mode) operand seen inside it was 3, which the PDF specification defines
    as "neither fill nor stroke" (invisible); otherwise it is kept.

    Args:
        pdf: The owning PDF, passed to ``Stream`` to create the new content
            stream.
        page: The page object whose ``Contents`` is rewritten in place.

    Returns:
        None. ``page.Contents`` is replaced with a newly built stream.
    """
    # Build the operators once instead of on every loop iteration.
    begin_text = Operator('BT')
    end_text = Operator('ET')
    set_render_mode = Operator('Tr')

    stream = []  # instructions that survive into the rewritten page
    text_objects = []  # instructions of the text object currently buffered
    in_text_obj = False
    render_mode = 0

    # Merge the page's content streams into one before parsing.
    rich_page = Page(page)
    rich_page.contents_coalesce()

    for operands, operator in parse_content_stream(page, ''):
        instruction = (operands, operator)

        if not in_text_obj:
            if operator == begin_text:
                in_text_obj = True
                # NOTE(review): the render mode is reset for every text
                # object and a `Tr` outside BT/ET is not tracked. The PDF
                # spec retains text state across text objects, so inherited
                # invisibility is not detected here -- confirm this is the
                # intended heuristic.
                render_mode = 0
                text_objects.append(instruction)
            else:
                stream.append(instruction)
            continue

        if operator == set_render_mode:
            render_mode = operands[0]
        text_objects.append(instruction)

        if operator == end_text:
            in_text_obj = False
            if render_mode != 3:
                stream.extend(text_objects)
            text_objects.clear()

    # A malformed stream may end inside a text object (BT without ET).
    # Keep that trailing text unless it is invisible, rather than silently
    # discarding everything after the unmatched BT.
    if text_objects and render_mode != 3:
        stream.extend(text_objects)

    content_stream = unparse_content_stream(stream)
    page.Contents = Stream(pdf, content_stream)
Пример #2
0
def test_externalize(resources):
    """Check that externalizing inline images adds an image XObject.

    Opens ``image-mono-inline.pdf`` from the *resources* directory, verifies
    that the first page starts with an inline image (``BI``) and no XObject
    resources, calls ``externalize_inline_images()``, and then verifies that
    an XObject entry exists and the first page image has subtype ``Image``.
    Finally asserts that the page label is ``'1'``.
    """
    pdf_path = resources / 'image-mono-inline.pdf'
    with Pdf.open(pdf_path) as pdf:
        first_page = Page(pdf.pages[0])
        first_page.contents_coalesce()

        # Precondition: the image is inline in the content stream.
        raw_contents = first_page.obj.Contents.read_bytes()
        assert b'BI' in raw_contents, "no inline image"

        # Precondition: nothing has been externalized yet.
        resources_before = first_page.obj.Resources
        assert Name.XObject not in resources_before, "expected no xobjs"

        first_page.externalize_inline_images()

        resources_after = first_page.obj.Resources
        assert Name.XObject in resources_after, "image not created"

        # The first image reachable from the page must be an image XObject.
        page_images = pdf.pages[0].images
        first_image = next(iter(page_images.values()))
        assert first_image.Subtype == Name.Image

        assert first_page.label == '1'