예제 #1
0
def test_creation_date_preserved(
    spoof_tesseract_noop, output_type, resources, infile, outpdf
):
    """Check that OCR keeps /CreationDate and writes a fresh /ModDate.

    When the input has no /Info dictionary, the output must still carry a
    non-empty /CreationDate. Otherwise the creation dates before and after
    must lie within 1000 seconds of each other. In both cases /ModDate of
    the output must lie within 1000 seconds of the current UTC time.
    """
    input_file = resources / infile

    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop
    )

    # Context managers release both files even when an assertion fails.
    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
        before = pdf_before.trailer.get('/Info', {})
        after = pdf_after.trailer.get('/Info', {})

        if not before:
            assert after.get('/CreationDate', '') != ''
        else:
            # We expect that the creation date stayed the same
            date_before = decode_pdf_date(str(before['/CreationDate']))
            date_after = decode_pdf_date(str(after['/CreationDate']))
            assert seconds_between_dates(date_before, date_after) < 1000

        # We expect that the modified date is quite recent
        date_after = decode_pdf_date(str(after['/ModDate']))
        assert (
            seconds_between_dates(date_after, datetime.datetime.now(timezone.utc))
            < 1000
        )
예제 #2
0
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that --title/--author override DocumentInfo and dates survive.

    Runs OCR on 'c02-22.pdf' with a German title and a Chinese author, then
    asserts that the output's Title/Author match, that /Keywords is absent or
    empty, that the creation date is unchanged, and that the PDF/A claim of
    the output matches ``output_type``.
    """
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, _out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    assert p.returncode == ExitCode.ok, err

    # Context managers release both files even when an assertion fails.
    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
        assert after.docinfo.Title == german, after.docinfo
        assert after.docinfo.Author == chinese, after.docinfo
        assert after.docinfo.get('/Keywords', '') == ''

        before_date = decode_pdf_date(str(before.docinfo.CreationDate))
        after_date = decode_pdf_date(str(after.docinfo.CreationDate))
        assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
예제 #3
0
def test_jbig2_lossy(lossy, resources, outpdf, spoof_tesseract_noop):
    """Check JBIG2 encoding of 'ccitt.pdf' with and without --jbig2-lossy.

    The first image of the first output page must use /JBIG2Decode. With
    ``lossy`` set, its decode parameters must contain /JBIG2Globals; without
    it there must be no decode parameters at all.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        # Passed as a string like every other option value here.
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args, env=spoof_tesseract_noop)

    # Context manager releases the file even when an assertion fails.
    with pikepdf.open(outpdf) as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0
예제 #4
0
def convert_to_pdfa(input_files_groups, output_file, log, context):
    """Strip NULs from the layers file's DocumentInfo, then build a PDF/A.

    The inputs are flattened and searched for the first name ending in
    'layers.rendered.pdf' and the first ending in '.ps'; both are handed to
    ``ghostscript.generate_pdfa`` together with options from ``context``.
    """
    options = context.get_options()
    input_pdfinfo = context.get_pdfinfo()

    input_files = list(flatten_groups(input_files_groups))

    def first_with_suffix(suffix):
        # First input whose name ends with ``suffix``, or None.
        for candidate in input_files:
            if candidate.endswith(suffix):
                return candidate
        return None

    layers_file = first_with_suffix('layers.rendered.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    with pikepdf.open(layers_file) as pdf_layers_file:
        if pdf_layers_file.docinfo:
            found_nul = False
            for key, value in pdf_layers_file.docinfo.items():
                raw = bytes(value)
                if b'\x00' in raw:
                    pdf_layers_file.docinfo[key] = raw.replace(b'\x00', b'')
                    found_nul = True
            if found_nul:
                # Only rewrite the file when something actually changed.
                pdf_layers_file.save(layers_file)

    ps = first_with_suffix('.ps')
    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[layers_file, ps],
        output_file=output_file,
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
    )
예제 #5
0
def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Check that /Title and /Author of 'graph.pdf' survive OCR unchanged.

    Also asserts that the PDF/A claim of the output matches ``output_type``.
    """
    input_file = resources / 'graph.pdf'

    output = check_ocrmypdf(
        input_file,
        outpdf,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    # Context managers release both files even when an assertion fails.
    with pikepdf.open(input_file) as pdf_before, pikepdf.open(output) as pdf_after:
        for key in ('/Title', '/Author'):
            assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type
예제 #6
0
def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
    """Check the outline root of 'kcs.pdf' after OCR with plain PDF output.

    If the output's /Outlines has a /First entry, that entry must be a
    dictionary.
    """
    check_ocrmypdf(
        resources / 'kcs.pdf', outpdf, '--output-type', 'pdf', env=spoof_tesseract_noop
    )

    # Context manager releases the file even when the assertion fails.
    with pikepdf.open(outpdf) as p:
        if pikepdf.Name.First in p.root.Outlines:
            assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
예제 #7
0
def test_no_glyphless_weave(resources, outdir):
    """Run OCR on a three-document merge with page replacement capped at 2.

    Pages of 'aspect.pdf' and 'cmyk.pdf' are appended to 'francais.pdf', the
    result is saved as 'test.pdf', and OCR is run with --deskew and a zero
    Tesseract timeout while ``_OCRMYPDF_MAX_REPLACE_PAGES`` is set to '2'.
    """
    merged = outdir / 'test.pdf'

    # All three sources stay open until the merged file has been written,
    # and are closed afterwards even if saving fails.
    with pikepdf.open(resources / 'francais.pdf') as pdf, pikepdf.open(
        resources / 'aspect.pdf'
    ) as pdf_aspect, pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(merged)

    env = os.environ.copy()
    env['_OCRMYPDF_MAX_REPLACE_PAGES'] = '2'
    check_ocrmypdf(
        merged,
        outdir / 'out.pdf',
        '--deskew',
        '--tesseract-timeout',
        '0',
        env=env,
    )
예제 #8
0
def metadata_fixup(input_files_groups, output_file, log, context):
    """Copy DocumentInfo of the repaired original into the working PDF's XMP.

    The working PDF is the input ending in 'pdfa.pdf' if there is one,
    otherwise the input ending in 'layers.rendered.pdf'. The result is saved
    to ``output_file`` with compressed streams and generated object streams.
    When a PDF/A file is in use and the original has XMP keys missing from
    the result, a warning is logged and the missing keys are logged at debug
    level.
    """
    options = context.get_options()

    input_files = list(flatten_groups(input_files_groups))
    original_file = next(
        (ii for ii in input_files if ii.endswith('.repaired.pdf')), None
    )
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )
    pdfa_file = next((ii for ii in input_files if ii.endswith('pdfa.pdf')), None)

    working_file = pdfa_file if pdfa_file else layers_file

    # Context managers close both PDFs on every path, including exceptions.
    with pikepdf.open(original_file) as original, pikepdf.open(working_file) as pdf:
        docinfo = get_docinfo(original, options)

        with pdf.open_metadata() as meta:
            meta.load_from_docinfo(docinfo, delete_missing=False)
            # If xmp:CreateDate is missing, set it to the modify date to
            # match Ghostscript, for consistency
            if 'xmp:CreateDate' not in meta:
                meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
            if pdfa_file:
                meta_original = original.open_metadata()
                not_copied = set(meta_original.keys()) - set(meta.keys())
                if not_copied:
                    log.warning(
                        "Some input metadata could not be copied because it is not "
                        "permitted in PDF/A. You may wish to examine the output "
                        "PDF's XMP metadata."
                    )
                    log.debug(
                        "The following metadata fields were not copied: %r", not_copied
                    )

        pdf.save(
            output_file,
            compress_streams=True,
            object_stream_mode=pikepdf.ObjectStreamMode.generate,
        )
예제 #9
0
def test_links(resources, outpdf):
    """Check that cross-page link destinations in 'link.pdf' survive --redo-ocr.

    The first annotation on page 1 must point at page 2 and the first
    annotation on page 2 must point at page 1, compared by object/generation
    number.
    """
    check_ocrmypdf(
        resources / 'link.pdf',
        outpdf,
        '--redo-ocr',
        '--oversample',
        '200',
        '--output-type',
        'pdf',
    )
    # Context manager releases the file even when an assertion fails.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen
예제 #10
0
def unlock_PDf(source_folder, destination_folder, password):
    """Save password-free copies of the PDFs found in ``source_folder``.

    Every directory entry whose name contains ".pdf" is opened with
    ``password`` and saved under the same name in ``destination_folder``.

    Returns:
        The number of files that were actually opened and saved. Files the
        password does not open are reported and not counted.
    """
    count = 0
    with os.scandir(source_folder) as entries:
        for item in entries:
            if ".pdf" not in item.name:
                continue
            file_name = item.name
            try:
                # open the locked pdf in source folder and save the unlocked
                # pdf in destination folder
                with pikepdf.open(item.path, password=password) as mypdf:
                    mypdf.save(os.path.join(destination_folder, file_name))
            except pikepdf.PasswordError:
                print("The password failed to open the file!")
                # Do not report or count a file that was never unlocked.
                continue

            print("\t\"" + file_name + "\"" + " unlocked")
            count = count + 1
    return count
예제 #11
0
def export(input_files, pages, file_out, metadata):
    """Assemble the selected pages into a new PDF and save it to ``file_out``.

    Each row of ``pages`` selects a source file (``row[2]``, 1-based), a page
    in it (``row[3]``, 1-based) and an extra rotation (``row[6]``). A non-zero
    rotation is added to the page's existing /Rotate, and the page's MediaBox
    is replaced when ``_mediabox`` returns a cropped box. The output's XMP is
    seeded from the first input's DocumentInfo and then overridden with the
    entries of ``metadata``.
    """
    pdf_output = pikepdf.Pdf.new()
    pdf_input = []
    try:
        for p in input_files:
            pdf_input.append(pikepdf.open(p.copyname))

        for row in pages:
            current_page = pdf_input[row[2] - 1].pages[row[3] - 1]
            angle = row[6]
            angle0 = current_page.Rotate if '/Rotate' in current_page else 0
            if angle != 0:
                current_page.Rotate = angle + angle0
            cropped = _mediabox(row, angle, angle0, current_page.MediaBox)
            if cropped:
                current_page.MediaBox = cropped
            pdf_output.pages.append(current_page)

        with pdf_output.open_metadata() as outmeta:
            outmeta.load_from_docinfo(pdf_input[0].docinfo)
            for k, v in metadata.items():
                outmeta[k] = v
        pdf_output.save(file_out)
    finally:
        # The sources must outlive the save above; release them afterwards,
        # also when something went wrong on the way.
        for source in pdf_input:
            source.close()
예제 #12
0
def del_page(path):
    """Re-save every PDF below ``path`` whose file name contains 'Unlock'.

    The directory tree is walked; the path of each '.pdf' file is printed.
    Files with 'Unlock' in their name are opened and saved next to the
    original under ``<second dot-separated name part>.pdf``.

    NOTE(review): despite the name, no pages are deleted here; the page
    removal logic only existed as commented-out code, which was dropped.
    """
    for root, _dirs, files in os.walk(path):
        for file in files:
            if os.path.splitext(file)[1] != '.pdf':
                continue
            # Join with the directory currently being walked so files in
            # subdirectories resolve, and no trailing separator is needed.
            filepath = os.path.join(root, file)
            print(filepath)
            if 'Unlock' not in file:
                continue
            # presumably names look like 'Unlock.<name>.pdf' - TODO confirm
            new_name = file.split('.')[1] + '.pdf'
            # The second positional argument of pikepdf.open is the password,
            # not a file mode, so no 'wb' is passed here.
            with pikepdf.open(filepath, allow_overwriting_input=True) as pdf:
                pdf.save(os.path.join(root, new_name))
예제 #13
0
def split_pdf_to_page_blocks(
    src_pdf_fn: str,
    pages_per_block: int = 1,
    page_block_base_name: str = None,
) -> Generator[List[str], None, None]:
    """Yield, exactly once, the list of block files a PDF was split into.

    An empty document yields ``[]``. A document with fewer pages than
    ``pages_per_block`` yields the source path itself. Otherwise the pages
    are written to consecutive block PDFs in a fresh temporary directory,
    named via ``build_block_fn`` from ``page_block_base_name`` (defaulting to
    the source's base name). That directory is removed once the generator is
    resumed or closed, so the files must be consumed while it is suspended.
    """
    with pikepdf.open(src_pdf_fn) as pdf:
        if len(pdf.pages) < 1:
            yield []
            return

        if len(pdf.pages) < pages_per_block:
            yield [src_pdf_fn]
            return

        base_name = str(page_block_base_name or os.path.basename(src_pdf_fn))
        temp_dir = mkdtemp()

        def store_block(block, first_page: int, last_page: int) -> str:
            # Write one finished block to the temp dir and return its path.
            target = os.path.join(
                temp_dir, build_block_fn(base_name, first_page, last_page))
            block.save(target)
            block.close()
            return target

        try:
            block_paths: List[str] = []
            block_start: int = 0
            current: Optional[pikepdf.Pdf] = None
            for n, page in enumerate(pdf.pages):
                starts_new_block = n % pages_per_block == 0
                if starts_new_block:
                    if current is not None:
                        block_paths.append(
                            store_block(current, block_start, n - 1))
                    block_start = n
                    current = pikepdf.new()

                current.pages.append(page)

            # Flush the trailing (possibly shorter) block.
            if current is not None and len(current.pages) > 0:
                block_paths.append(store_block(current, block_start, n))
            yield block_paths
        finally:
            shutil.rmtree(temp_dir)
예제 #14
0
def extract_columns(file: str, start: int, end: int, out_dir: str):
    """Detect and save column segments for pages ``start``..``end`` of a PDF.

    For each page (both bounds inclusive) the columns found by
    ``ColumnSegmenter`` are saved to ``<out_dir>/page-NNNN`` when exactly 5
    or 10 were detected; any other count is logged as a warning and the page
    is skipped. The elapsed wall time is logged at the end.
    """
    start_time = time.time()
    with pikepdf.open(file) as pdf:
        for page_num in range(start, end + 1):
            page = get_page(pdf, page_num)
            segmenter = ColumnSegmenter(page)
            columns = segmenter.detect_columns()
            if len(columns) in [5, 10]:
                page_out_dir = os.path.join(out_dir, f"page-{page_num:04}")
                logging.info(
                    "saving %s columns for page %s to %s",
                    len(columns), page_num, page_out_dir,
                )
                save_segments(page, columns, page_out_dir)
            else:
                # Log once; a nested logging.warning() call would emit a
                # second record containing the text "None".
                logging.warning(
                    "page %s has %s columns, not saving", page_num, len(columns)
                )
    logging.info("%.2f seconds elapsed", time.time() - start_time)
def decrypt_pdf(file_pdf):
    """Brute-force the password of ``file_pdf`` and save a decrypted copy.

    Tries every combination of ASCII letters and digits of length 1 to 5.
    On the first password that opens the file, the document is saved as
    'decrypted.pdf' in the current directory and the password is printed.

    Returns:
        True when a password was found, False when the search space was
        exhausted.

    Raises:
        Anything other than a wrong-password error (missing file, damaged
        PDF, failure to write the output) propagates immediately instead of
        being retried for every remaining candidate.
    """
    chars = string.ascii_letters + string.digits
    attempts = 0
    print("Searching for password!\nThis may take long time...")
    for plen in range(1, 6):
        for candidate in itertools.product(chars, repeat=plen):
            attempts += 1
            guess = ''.join(candidate)
            try:
                with pikepdf.open(file_pdf, password=guess) as pdf:
                    pdf.save('decrypted.pdf')
            except pikepdf.PasswordError:
                # Wrong guess: move on. A bare except here would also swallow
                # Ctrl-C and real I/O errors.
                continue
            print("[PDF BRUTE-FORCE]: found password! "
                  "password: {} with {} attempts".format(guess, attempts))
            return True
    return False
예제 #16
0
def pdf_cracker(language, mode):
    """Interactively try a password list against a PDF and report a match.

    Prompts (in English when ``language`` is "English", otherwise in Italian)
    for the PDF file name and a file with one candidate password per line,
    then tries each candidate. The first password that opens the PDF is
    printed and the search stops. ``mode`` is accepted but not used.
    """
    if language == "English":
        pdf_file = input(
            "Insert the name of the PDF file you want to crack --> ")
        password_file = input("Insert the name of the password list file --> ")
    else:
        pdf_file = input("Inserisci il nome del file PDF da crackare --> ")
        password_file = input(
            "Inserisci il nome del file di elenco delle password --> ")

    with open(password_file) as candidates:
        passwords = [line.strip() for line in candidates]

    for password in tqdm(passwords, "Decrypting PDF"):
        try:
            # Do not bind the opened document to ``pdf_file``: that name must
            # keep holding the path for the following attempts.
            with pikepdf.open(pdf_file, password=password):
                pass
        except pikepdf.PasswordError:
            continue
        if language == "English":
            print(f"[+] Password found: {password}")
        else:
            print(f"[+] Password trovata: {password}")
        # Nothing left to do once the password is known.
        return
예제 #17
0
def test_image_scale0(resources, outpdf):
    """Check DPI reporting for a form XObject drawn with an all-zero matrix.

    The first page of 'cmyk.pdf' is placed as a form XObject on a blank
    72x72 page using a zero transformation matrix. PdfInfo must then report
    a non-finite DPI for the image and ``Resolution(0, 0)`` for the page.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        form = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        target = pikepdf.Pdf.new()
        target.add_blank_page(page_size=(72, 72))
        blank_page = pikepdf.Page(target.pages[0])
        objname = blank_page.add_resource(
            target.copy_foreign(form), pikepdf.Name.XObject, pikepdf.Name.Im0)
        print(objname)
        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        target.pages[0].Contents = pikepdf.Stream(target, content)
        target.save(outpdf)

    pi = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1)
    first_page_info = pi.pages[0]
    assert not first_page_info._images[0].dpi.is_finite
    assert first_page_info.dpi == Resolution(0, 0)
예제 #18
0
def _find_font(text, pdf_base):
    """Copy a font from the filename text into pdf_base.

    Looks on the first page of the PDF at ``text`` for a font resource named
    '/f-0-0' or '/F1' (in that order) and copies the first one found into
    ``pdf_base``.

    Returns:
        ``(font, font_key)``; ``(None, None)`` when the file cannot be
        opened, its first page's font resources cannot be read, or neither
        font name is present.
    """
    font, font_key = None, None
    possible_font_names = ('/f-0-0', '/F1')
    try:
        pdf_text = pikepdf.open(text)
    except Exception:
        return None, None

    # Close the source document on every path instead of leaking the handle.
    with pdf_text:
        try:
            pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
        except Exception:
            return None, None

        pdf_text_font = None
        for f in possible_font_names:
            pdf_text_font = pdf_text_fonts.get(f, None)
            if pdf_text_font is not None:
                font_key = f
                break
        if pdf_text_font:
            font = pdf_base.copy_foreign(pdf_text_font)
    return font, font_key
예제 #19
0
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two-up into a booklet PDF and return its file name.

    For each pair, one page from the front and one from the back of
    ``pages`` are copied into the temporary output, converted to form
    XObjects and drawn side by side on a new page that is twice the wider
    page's width and as tall as the taller page. Which of the two goes on the
    left alternates with every sheet.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {n: pikepdf.open(pdfqueue[n - 1].copyname) for n in file_indexes}

    # Compare version numbers numerically: as strings, '10.0.0' sorts before
    # '2.7.0'. Non-digit suffixes such as 'rc1' are ignored.
    version = []
    for piece in pikepdf.__version__.split('.')[:3]:
        digits = ''
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        version.append(int(digits) if digits else 0)
    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    needs_indirect = tuple(version) < (2, 7, 0)

    try:
        for i in range(len(pages)//2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [max(second_page_size[0], first_page_size[0]) * 2,
                         max(second_page_size[1], first_page_size[1])]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = file.copy_foreign(first_original)
            _update_angle(first, first_original, first_foreign)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = file.copy_foreign(second_original)
            _update_angle(second, second_original, second_foreign)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(second_foreign).as_form_xobject()

            # Left page at the origin, right page shifted by the left width.
            content_txt = (f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                           f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')

            newpage = pikepdf.Dictionary(
                    Type=pikepdf.Name.Page,
                    MediaBox=[0, 0, *page_size],
                    Resources=pikepdf.Dictionary(XObject=content_dict),
                    Contents=pikepdf.Stream(file, content_txt.encode())
                )

            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    finally:
        # The sources must stay open until the booklet is written; release
        # them afterwards, also on failure.
        for source in source_files.values():
            source.close()
    return filename
예제 #20
0
def check_pdf(input_file: Path) -> bool:
    """Check if a PDF complies with the PDF specification.

    Checks for proper formatting and proper linearization. Every problem is
    logged. Returns True only when the file opens and neither check produced
    a message.
    """
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False

    try:
        messages = pdf.check()
        for msg in messages:
            report = log.error if 'error' in msg.lower() else log.warning
            report(msg)

        linearize_msgs = ''
        sio = StringIO()
        try:
            # If linearization is missing entirely, we do not complain. We do
            # complain if linearization is present but incorrect.
            pdf.check_linearization(sio)
        except RuntimeError:
            pass
        except (pikepdf.ForeignObjectError
                if pikepdf.__version__ == '2.1.0'  # may throw wrong exception
                else NeverRaise):
            pass
        else:
            linearize_msgs = sio.getvalue()
            if linearize_msgs:
                log.warning(linearize_msgs)

        return not (messages or linearize_msgs)
    finally:
        pdf.close()
예제 #21
0
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    """Produce 'pdfa.pdf' from ``input_pdf`` via the ``generate_pdfa`` hook.

    First a 'fix_docinfo.pdf' is prepared: a rewritten copy when NUL bytes
    had to be stripped from DocumentInfo values, otherwise a symlink to the
    input. A DocumentInfo block that raises TypeError on ``len()`` is logged
    as malformed and left alone. Returns the path of the PDF/A output.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    nul_stripped = False
    with pikepdf.open(input_pdf) as pdf_file:
        try:
            len(pdf_file.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf_file.docinfo:
                for key, value in pdf_file.docinfo.items():
                    raw = bytes(value)
                    if b'\x00' in raw:
                        pdf_file.docinfo[key] = raw.replace(b'\x00', b'')
                        nul_stripped = True

        if not nul_stripped:
            # Nothing changed, so a link to the input is sufficient.
            safe_symlink(input_pdf, fix_docinfo_file)
        else:
            pdf_file.save(fix_docinfo_file)

    hook = context.plugin_manager.hook
    hook.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        compression=options.pdfa_image_compression,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(
            hook.get_progressbar_class() if options.progress_bar else None
        ),
    )

    return output_file
예제 #22
0
def check_pdf(input_file: Path) -> bool:
    """Check if a PDF complies with the PDF specification.

    Checks for proper formatting and proper linearization. Uses pikepdf (which in
    turn, uses QPDF) to perform the checks. Every finding is logged; the
    result is True only if the file opens and no finding was reported.
    """
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False

    with pdf:
        messages = pdf.check()
        for msg in messages:
            if 'error' not in msg.lower():
                log.warning(msg)
            else:
                log.error(msg)

        linearize_msgs = ''
        sio = StringIO()
        try:
            # If linearization is missing entirely, we do not complain. We do
            # complain if linearization is present but incorrect.
            pdf.check_linearization(sio)
        except RuntimeError:
            pass
        except (
                # Workaround for a problematic pikepdf version
                # pragma: no cover
                pikepdf.ForeignObjectError
                if pikepdf.__version__ == '2.1.0' else NeverRaise):
            pass
        else:
            linearize_msgs = sio.getvalue()
            if linearize_msgs:
                log.warning(linearize_msgs)

        found_problem = bool(messages) or bool(linearize_msgs)
        return not found_problem
예제 #23
0
def wait_for_file_ready(file_path):
    """Return True once ``file_path`` opens as a PDF, False after 5 failures.

    This loop waits to make sure that the file is completely loaded on
    disk before attempting to read. Docker sometimes will publish the
    watchdog event before the file is actually fully on disk, causing
    pikepdf to fail. After each failed attempt the function sleeps for
    ``POLL_NEW_FILE_SECONDS``.
    """
    for _attempt in range(5):
        try:
            pdf = pikepdf.open(file_path)
        except (FileNotFoundError, pikepdf.PdfError) as e:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
            time.sleep(POLL_NEW_FILE_SECONDS)
            continue
        # Opening succeeded: the handle was only a probe.
        pdf.close()
        return True

    return False
def extractContent(content="", page_index=100):
    """Print the text of one page from every PDF matching 'Resource/*.pdf'.

    Args:
        content: Accepted but not used.
        page_index: Zero-based index of the page to extract from each file.
            Defaults to 100, the value that used to be hard-coded.

    Encrypted files are first rewritten without encryption into memory with
    pikepdf, and the text is read from that copy. After each file the list
    of all texts collected so far is printed. Nothing is returned.
    """
    import io  # local import: only this function needs an in-memory buffer

    pdfFileText = []
    for fileName in glob.glob("Resource/*.pdf"):
        # The handle is closed as soon as this file has been processed.
        with open(fileName, 'rb') as pdfFile:
            pdfFileReader = pyPDF.PdfFileReader(pdfFile)
            if pdfFileReader.isEncrypted:
                # Re-reading the file from disk would still see the encrypted
                # data, so read back the copy pikepdf saved into the buffer.
                decrypted = io.BytesIO()
                with pikepdf.open(fileName) as unlocked:
                    unlocked.save(decrypted)
                decrypted.seek(0)
                print("%s decrypted!" % fileName)
                pdfFileReader = pyPDF.PdfFileReader(decrypted)
            pdfText = pdfFileReader.getPage(page_index).extractText()
        pdfFileText.append(pdfText)
        print(pdfFileText)
예제 #25
0
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Check object-stream usage and the XMP PDF/A claim for each level.

    Levels 2 and 3 must contain '/ObjStm' in the raw output, level 1 must
    not, and the XMP ``pdfa_status`` must equal '<level>B'.
    """
    cli_args = [
        resources / 'francais.pdf',
        outpdf,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        f'--output-type=pdfa-{pdfa_level}',
        f'--optimize={optimize}',
    ]
    check_ocrmypdf(*cli_args)

    if pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert b'/ObjStm' in outpdf.read_bytes()
    elif pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert b'/ObjStm' not in outpdf.read_bytes()

    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as m:
        assert m.pdfa_status == f'{pdfa_level}B'
예제 #26
0
def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
    """Build a PageInfo for every page of ``infile``.

    With ``detailed_page_analysis`` set, per-page text XML is extracted first
    and passed to each PageInfo; otherwise each page gets None. A falsy
    ``log`` is replaced by a Mock.

    Returns:
        ``(pages, pdf)`` - the list of PageInfo objects and the still-open
        pikepdf document, which this function does not close.
    """
    log = log or Mock()

    pdf = pikepdf.open(infile)
    pages_xml = (
        ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)
        if detailed_page_analysis
        else None
    )

    pages = [
        PageInfo(pdf, n, infile, pages_xml[n] if pages_xml else None)
        for n in range(len(pdf.pages))
    ]
    return pages, pdf
예제 #27
0
def brute_force_pdf(plock_file, password_length, asci):
    """
    function uses brute-force techniques to gain access to password protected pdf files
    :param plock_file: file with password protection
    :param password_length: possible length of password
    :param asci: string combination of ascii values
    :return: the password that opened the file, or None if none of the
        candidates up to ``password_length`` characters worked
    """
    # iterates through all lengths and generates a progress bar showing completion process
    for i in tqdm(range(1, (password_length + 1))):
        for letter in itertools.product(asci, repeat=i):
            password = ''.join(letter)
            # uses pikepdf to try and open password protected pdf file using brute force method
            try:
                with pikepdf.open(plock_file, password=password):
                    pass
            except pikepdf.PasswordError:
                # if password fail continue
                continue
            print(f"Password found: {password}")
            # Stop the whole search; a ``break`` would only leave the inner
            # loop and carry on with longer candidates.
            return password
    return None
예제 #28
0
 def make_rotate_test(prefix, image_angle, page_angle):
     """Create a one-page test PDF with a rotated image and page rotation.

     'typewriter.png' is transposed by ``-image_angle % 360`` degrees when
     ``image_angle`` is non-zero, converted to a 200 DPI PDF in memory, and
     given ``/Rotate = page_angle``. The file is written to ``outdir`` as
     '<prefix>_<image_angle>_<page_angle>.pdf' and its path is returned.
     """
     memimg = BytesIO()
     # Context manager closes the source image file once it has been encoded.
     with Image.open(fspath(resources / 'typewriter.png')) as source:
         im = source
         if image_angle != 0:
             ccw_angle = -image_angle % 360
             im = im.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))
         im.save(memimg, format='PNG')
     memimg.seek(0)
     mempdf = BytesIO()
     img2pdf.convert(
         memimg.read(),
         layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
         outputstream=mempdf,
     )
     mempdf.seek(0)
     target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
     # Context manager closes the in-memory PDF after it has been written.
     with pikepdf.open(mempdf) as pike:
         pike.pages[0].Rotate = page_angle
         pike.save(target)
     return target
예제 #29
0
 def make_rotate_test(prefix, image_angle, page_angle):
     """Create a one-page test PDF with a rotated image and page rotation.

     'typewriter.png' is transposed by ``-image_angle % 360`` degrees when
     ``image_angle`` is non-zero, converted to a 200 DPI PDF in memory, and
     given ``/Rotate = page_angle``. The file is written to ``outdir`` as
     '<prefix>_<image_angle>_<page_angle>.pdf' and its path is returned.
     """
     memimg = BytesIO()
     # Context manager closes the source image file once it has been encoded.
     with Image.open(fspath(resources / 'typewriter.png')) as source:
         im = source
         if image_angle != 0:
             ccw_angle = -image_angle % 360
             im = im.transpose(getattr(Image, 'ROTATE_{}'.format(ccw_angle)))
         im.save(memimg, format='PNG')
     memimg.seek(0)
     mempdf = BytesIO()
     img2pdf.convert(memimg.read(),
                     layout_fun=img2pdf.get_fixed_dpi_layout_fun(
                         (200, 200)),
                     outputstream=mempdf)
     mempdf.seek(0)
     target = outdir / '{}_{}_{}.pdf'.format(prefix, image_angle,
                                             page_angle)
     # Context manager closes the in-memory PDF after it has been written.
     with pikepdf.open(mempdf) as pike:
         pike.pages[0].Rotate = page_angle
         pike.save(target)
     return target
예제 #30
0
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, log=None):
    """Build a PageInfo for every page of ``infile``.

    With ``detailed_analysis`` set, per-page text XML is extracted first and
    passed to each PageInfo; otherwise each page gets None. A falsy ``log``
    is replaced by a Mock.

    Returns:
        ``(pages, pdf)`` - the list of PageInfo objects and the still-open
        pikepdf document; closing it is left to the caller.

    Raises:
        EncryptedPdfError: if the opened document reports being encrypted.
            The document is closed before raising.
    """
    if not log:
        log = Mock()

    pdf = pikepdf.open(infile)  # Do not close in this function
    if pdf.is_encrypted:
        pdf.close()
        raise EncryptedPdfError()  # Triggered by encryption with empty passwd
    # Text extraction belongs to the detailed analysis; the quick path
    # skips it.
    if detailed_analysis:
        pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)
    else:
        pages_xml = None

    pages = []
    for n in range(len(pdf.pages)):
        page_xml = pages_xml[n] if pages_xml else None
        page = PageInfo(pdf, n, infile, page_xml, detailed_analysis)
        pages.append(page)

    return pages, pdf
예제 #31
0
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a stream-valued /Info is reported as malformed DocumentInfo.

    'trivial.pdf' is re-saved with its trailer /Info replaced by a stream,
    then run through ``convert_to_pdfa``; one of the captured log records
    must mention 'malformed DocumentInfo block'.
    """
    ps_stub = outdir / 'pdfa.ps'
    layers_pdf = outdir / 'layers.rendered.pdf'

    generate_pdfa_ps(ps_stub)

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(layers_pdf, fix_metadata_version=False)

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = parser.parse_args(args=cli_args)
    pdfinfo = PdfInfo(layers_pdf)
    context = PDFContext(options, outdir, layers_pdf, pdfinfo)

    convert_to_pdfa(str(layers_pdf), str(ps_stub), context)

    print(caplog.records)
    messages = [record.message for record in caplog.records]
    assert any('malformed DocumentInfo block' in message
               for message in messages)
예제 #32
0
def test_simulated_scan(outdir):
    """Check page orientation after OCR of a four-page synthetic scan.

    Each page carries the text 'Page N' drawn at a different angle and
    position. After OCR with --force-ocr, --deskew and --rotate-pages, pages
    2 and 4 must be wider than tall and pages 1 and 3 taller than wide.
    """
    scan_path = outdir / 'fakescan.pdf'
    canvas = Canvas(
        fspath(scan_path),
        pagesize=(209.8, 297.6),
    )

    # (text angle in degrees, x, y) for each page
    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]

    for n, (angle, x, y) in enumerate(page_vars):
        text = canvas.beginText()
        text.setFont('Helvetica', 20)

        radians = angle / 180.0 * pi
        cos_a = cos(radians)
        sin_a = sin(radians)

        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)
        text.textOut(f'Page {n + 1}')
        canvas.drawText(text)
        canvas.showPage()
    canvas.save()

    check_ocrmypdf(
        scan_path,
        outdir / 'out.pdf',
        '--force-ocr',
        '--deskew',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_debug_rotate.py',
    )

    def wider_than_tall(page):
        return page.MediaBox[2] > page.MediaBox[3]

    def taller_than_wide(page):
        return page.MediaBox[2] < page.MediaBox[3]

    with pikepdf.open(outdir / 'out.pdf') as pdf:
        assert wider_than_tall(pdf.pages[1]), "Wrong orientation: not landscape"
        assert wider_than_tall(pdf.pages[3]), "Wrong orientation: Not landscape"

        assert taller_than_wide(pdf.pages[0]), "Wrong orientation: Not portrait"
        assert taller_than_wide(pdf.pages[2]), "Wrong orientation: Not portrait"
예제 #33
0
    def _page_extract_cb(self, filename):
        """Decode every page of ``filename`` into the pages directory.

        The output directory is ``self.pages_dir`` next to ``filename`` and
        is created if missing. Each page is handed to ``decodeImage`` with
        the target '<file stem>_NNN', numbered from 001. Any failure while
        reading or decoding is logged as an error and not re-raised.
        """
        destination = os.path.join(os.path.split(filename)[0], self.pages_dir)
        if not os.path.exists(destination):
            os.mkdir(destination)
        prefix = os.path.splitext(os.path.basename(filename))[0]
        try:
            # Context manager closes the PDF even when decoding a page fails.
            with pikepdf.open(filename) as pdfobject:
                for count, pageObj in enumerate(pdfobject.pages, start=1):
                    decodeImage(
                        pageObj, os.path.join(destination,
                                              prefix + '_%03d' % count))
        except Exception as ex:
            log(ERROR, 'Cannot extract %s due to %s' % (filename, str(ex)))
예제 #34
0
def test_flate_to_jbig2(resources, outdir, spoof_tesseract_noop):
    """An 8-bit grayscale PNG optimized at level 3 ends up JBIG2-encoded."""
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        im.convert('L').save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(outdir / 'type8.png',
                   outdir / 'out.pdf',
                   '--image-dpi',
                   '100',
                   '--png-quality',
                   '10',
                   '--optimize',
                   '3',
                   env=spoof_tesseract_noop)

    # Close the output PDF once the assertion has run instead of leaking it.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'
예제 #35
0
def _find_font(text, pdf_base):
    """Look up a known font in the PDF file ``text`` and copy it into ``pdf_base``.

    Only the first page's ``/Font`` resources are inspected, and only the
    resource names ``/f-0-0`` and ``/F1`` are tried, in that order.

    Returns:
        A ``(font, font_key)`` tuple. ``font`` is the object returned by
        ``pdf_base.copy_foreign`` and ``font_key`` the resource name it was
        found under. ``(None, None)`` is returned when the file is missing,
        cannot be parsed, has no usable first-page resources, or contains
        neither font name.
    """
    candidate_keys = ('/f-0-0', '/F1')
    font = None
    font_key = None
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                return None, None
            for key in candidate_keys:
                candidate = fonts.get(key, None)
                if candidate is None:
                    continue
                # First name that is present wins, even if the object
                # itself turns out to be falsy and is therefore not copied.
                font_key = key
                if candidate:
                    font = pdf_base.copy_foreign(candidate)
                break
            return font, font_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
예제 #36
0
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image object ``xref`` in ``pike`` with the PNG at ``filename``.

    The PNG is first wrapped into a PDF by img2pdf (written beside
    ``filename`` with the suffix replaced by ``.png.pdf``). The first image
    of that PDF's first page is copied into ``pike`` and its raw stream
    data, filter and decode parameters are written over the existing
    object, so the object keeps its identity.

    Returns:
        Always ``True``.
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # iter() makes this work whether values() yields an iterator or a
        # plain view/collection; next() alone requires a true iterator.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Keys that exist only on the old image are stale and get removed...
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        # ...except for the keep_fields, which are essential to displaying
        # the image correctly and preserving its metadata. (/Decode arrays
        # and /SMaskInData are implicitly discarded prior to this point.)
        keep_fields = {
            '/ID',
            '/Intent',
            '/Interpolate',
            '/Mask',
            '/Metadata',
            '/OC',
            '/OPI',
            '/SMask',
            '/StructParent',
        }
        del_keys -= keep_fields
        # Copy the new image's dictionary entries. /Length is skipped, and
        # keep_fields are never overwritten with the new image's values.
        for key in local_image.keys():
            if key != Name.Length and str(key) not in keep_fields:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    return True
예제 #37
0
def test_metadata_fixup_warning(resources, outdir, caplog):
    """metadata_fixup logs a WARNING only when the origin has extra metadata."""
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf'])

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(options, outdir, outdir / 'graph.pdf', None,
                         get_plugin_manager([]))
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable. The context manager
    # closes the source PDF once the modified copy has been written.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    context = PdfContext(options, outdir, outdir / 'graph_mod.pdf', None,
                         get_plugin_manager([]))
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
예제 #38
0
    def _find_font(self, text):
        """Look up a known font in the PDF file ``text`` and copy it into ``self.pdf_base``.

        Only the first page's ``/Font`` resources are inspected, and only
        the resource names ``/f-0-0`` and ``/F1`` are tried, in that order.

        Returns:
            A ``(font, font_key)`` tuple. ``font`` is the object returned by
            ``self.pdf_base.copy_foreign`` and ``font_key`` the resource
            name it was found under. ``(None, None)`` is returned when the
            file is missing, cannot be parsed, has no usable first-page
            resources, or contains neither font name.
        """
        candidate_keys = ('/f-0-0', '/F1')
        font = None
        font_key = None
        try:
            with pikepdf.open(text) as pdf_text:
                try:
                    fonts = pdf_text.pages[0].Resources.get('/Font', {})
                except (AttributeError, IndexError, KeyError):
                    return None, None
                for key in candidate_keys:
                    candidate = fonts.get(key, None)
                    if candidate is None:
                        continue
                    # First name that is present wins, even if the object
                    # itself turns out to be falsy and is not copied.
                    font_key = key
                    if candidate:
                        font = self.pdf_base.copy_foreign(candidate)
                    break
                return font, font_key
        except (FileNotFoundError, pikepdf.PdfError):
            # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
            return None, None
def set_all_bookmark_zooms(in_file, out_file=None, zoom_factor=None, only_bookmarks=False):
    """Apply ``zoom_factor`` to the destinations of a PDF and save the result.

    Args:
        in_file: Path of the PDF to read.
        out_file: Path to write. When falsy, the output is written next to
            ``in_file`` using its name with the extension replaced by
            ``-zoomed.pdf``.
        zoom_factor: Value forwarded to ``update_dest`` and
            ``set_zoom_factor``.
        only_bookmarks: When true, the named destinations returned by
            ``get_names`` are left untouched; the flag is also forwarded to
            ``set_zoom_factor`` for every outline item.
    """
    import os  # local import: only needed to derive the default output name

    def _apply_set_all_bookmark_zoom(names, depth, children):
        # Depth-first walk over the outline tree.
        for child in children:
            set_zoom_factor(names, depth, child, zoom_factor, only_bookmarks)
            _apply_set_all_bookmark_zoom(names, depth + 1, child.children)

    # The context manager closes the input PDF even if updating or saving fails.
    with pikepdf.open(in_file) as pdf:
        if not only_bookmarks:
            names = get_names(pdf.root)
            for _key, (index, array) in names.items():
                array[index] = update_dest(zoom_factor, array[index])

        with pdf.open_outline() as outline:
            _apply_set_all_bookmark_zoom(pdf.root, 0, outline.root)

        if out_file:
            save_name = out_file
        else:
            # splitext strips only the final extension, so dots elsewhere in
            # the path ("./a.pdf", "v1.2/report.pdf") no longer truncate it.
            save_name = os.path.splitext(os.fspath(in_file))[0] + "-zoomed.pdf"
        print("saving %s ..." % save_name)
        pdf.save(save_name)
예제 #40
0
def strip(file, password):
    """Write an unencrypted copy of the PDF ``file`` using pikepdf.

    The copy is saved beside the input as ``<name>_decrypted<ext>``
    (``.pdf`` is used when the input has no extension). Progress and
    failures are reported with ``print``; a wrong password or a missing
    file does not raise.

    Args:
        file: Path of the password-protected PDF.
        password: Password used to open ``file``.
    """
    import os  # local import: only needed to build the output name

    # Insert the marker before the real extension only; str.replace would
    # also rewrite ".pdf" elsewhere in the path and would leave names such
    # as "x.PDF" unchanged, making the output collide with the input.
    root, ext = os.path.splitext(file)
    name_pdf = root + '_decrypted' + (ext or '.pdf')

    print('%s Working on %s file..' % (WORKING, file))

    try:
        # The with-block closes the PDF; no explicit close() is required.
        with pikepdf.open(file, password=password) as pdf_clean:
            pdf_clean.save(name_pdf)

    # Use the public exception name rather than the private _qpdf module.
    except pikepdf.PasswordError:
        print('%s Unable to remove protection. Invalid password.' % (FAILED))

    except FileNotFoundError:
        print('%s %s not found.' % (FAILED, file))

    else:
        print('%s Successfully removed password. Saved as %s' %
              (SUCCESS, name_pdf))
예제 #41
0
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image object ``xref`` in ``pike`` with the PNG at ``filename``.

    The PNG is first wrapped into a PDF by img2pdf (written beside
    ``filename`` with the suffix replaced by ``.png.pdf``). The first image
    of that PDF's first page is copied into ``pike`` and its raw stream
    data, filter and decode parameters are written over the existing
    object. Afterwards the object's dictionary is made to match the new
    image: every key of the new image except /Length is copied, and keys
    that only the old image had are deleted.
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # iter() makes this work whether values() yields an iterator or a
        # plain view/collection; next() alone requires a true iterator.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Keys present only on the old image are stale once the stream has
        # been replaced; collect them before copying the new entries.
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        for key in local_image.keys():
            if key != Name.Length:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    # NOTE(review): annotated ``-> bool`` but falls through and returns
    # None; left unchanged because callers are not visible here - confirm.
예제 #42
0
def test_metadata_fixup_warning(resources, outdir, caplog):
    """metadata_fixup logs a WARNING only once uncopyable metadata is added."""
    from ocrmypdf._pipeline import metadata_fixup

    input_files = [
        str(outdir / 'graph.repaired.pdf'),
        str(outdir / 'layers.rendered.pdf'),
        str(outdir / 'pdfa.pdf'),  # It is okay that this is not a PDF/A
    ]
    for f in input_files:
        copyfile(resources / 'graph.pdf', f)

    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
        output_file=outdir / 'out.pdf',
        log=log,
        context=context,
    )
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable. The context manager
    # closes the PDF handle once the modified file has been written.
    # NOTE(review): this saves over the file the Pdf was opened from;
    # confirm the pikepdf version in use permits that.
    with pikepdf.open(outdir / 'graph.repaired.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph.repaired.pdf')

    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
        output_file=outdir / 'out.pdf',
        log=log,
        context=context,
    )
    assert any(record.levelname == 'WARNING' for record in caplog.records)
예제 #43
0
def test_flate_to_jbig2(resources, outdir, spoof_tesseract_noop):
    """An 8-bit grayscale PNG optimized at level 3 ends up JBIG2-encoded."""
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        im.convert('L').save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        env=spoof_tesseract_noop,
    )

    # Close the output PDF once the assertion has run instead of leaking it.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'
예제 #44
0
def _weave_layers_graft(
    *, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text, log
):
    """Insert the text layer from text page 0 on to pdf_base at page_num

    Args:
        pdf_base: Open Pdf that receives the text layer.
        page_num: Page of ``pdf_base`` to modify, as accepted by
            ``pdf_base.pages.p``.
        text: Path of the PDF holding the text layer on its first page.
            A zero-length file is silently skipped.
        font, font_key, procset: Forwarded to ``_update_page_resources``
            for the modified page.
        rotation: Clockwise angle by which the text layer is misaligned
            relative to the page content.
        strip_old_text: When true, ``strip_invisible_text`` is run on the
            page before the new layer is added.
        log: Logger used for debug output.
    """

    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # Pull everything we need out of the text PDF up front so that the file
    # is closed again even if a later step raises.
    with pikepdf.open(text) as pdf_text:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()
        text_mediabox = [float(text_page.MediaBox[v]) for v in range(4)]

    if not tesseract.has_textonly_pdf():
        # If we don't have textonly_pdf, edit the stream to delete the
        # instruction to draw the image Tesseract generated, which we do not
        # use. Only the first occurrence is blanked, and a stream without
        # the instruction is left untouched (find() == -1 previously caused
        # spaces to be spliced in near the end of the stream).
        pattern = b'/Im1 Do'
        pdf_text_contents = pdf_text_contents.replace(
            pattern, b' ' * len(pattern), 1
        )

    # This is a pointer indicating a specific page in the base file
    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions OCRmyPDF rotated the content layer to match the text.)
    wt = text_mediabox[2] - text_mediabox[0]
    ht = text_mediabox[3] - text_mediabox[1]

    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    log.debug('%r', (scale_x, scale_y))
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size different between initial and text PDF, then untranslate
    ctm = translate @ rotate @ scale @ untranslate

    # q ... Q saves and restores the graphics state around the text layer so
    # the transform does not leak into the rest of the page content.
    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page, log)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(
        page=base_page, font=font, font_key=font_key, procset=procset
    )
예제 #45
0
def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to baseline file

    This is where the magic happens. infiles will be the main PDF to modify,
    and optional .text.pdf and .image-layer.pdf files, organized however ruffus
    organizes them.

    From .text.pdf, we copy the content stream (which contains the Tesseract
    OCR results), and rotate it into place. The first time we do this, we also
    copy the GlyphlessFont, and then reference that font again.

    For .image-layer.pdf, we check if this is a "pointer" to the original file,
    or a new file. If a new file, we replace the page and remember that we
    replaced this page.

    Whenever the emplacement counter is a multiple of MAX_REPLACE_PAGES, we
    save intermediate results and reopen them, to avoid any resource limits,
    since pikepdf/qpdf need to keep a lot of open file handles in the
    background. When objects are copied from one file to another, qpdf
    doesn't actually copy the data until asked to write, so all the resources
    it may need have to remain available.

    For completeness, we set up a /ProcSet on every page, although it's
    unlikely any PDF viewer cares about this anymore.

    Args:
        infiles: Input file names; flattened with ``flatten_groups`` and
            grouped by the value ``page_number`` returns for each name.
        output_file: Destination file name. It is concatenated with string
            suffixes to name interim files, so it must be a ``str``.
        log: Logger used for debug output.
        context: Provides ``get_pdfinfo()``, ``get_rotation(index)`` and
            ``get_options()``.
    """

    def input_sorter(key):
        # Names without a page number sort first (key -1); the base PDF is
        # expected to be in that group.
        try:
            return page_number(key)
        except ValueError:
            return -1

    flat_inputs = sorted(flatten_groups(infiles), key=input_sorter)
    groups = groupby(flat_inputs, key=input_sorter)

    # Extract first item
    _, basegroup = next(groups)
    base = list(basegroup)[0]
    path_base = Path(base).resolve()
    pdf_base = pikepdf.open(path_base)
    font, font_key, procset = None, None, None
    pdfinfo = context.get_pdfinfo()

    # One shared, indirect /ProcSet array that is attached to page resources.
    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )

    # emplacements starts at 1, so the periodic save below cannot trigger
    # before at least one page has been emplaced (assuming
    # MAX_REPLACE_PAGES > 1 - TODO confirm).
    emplacements = 1
    interim_count = 0

    # Iterate rest
    for page_num, layers in groups:
        layers = list(layers)
        log.debug(page_num)
        log.debug(layers)

        text = next((ii for ii in layers if ii.endswith('.text.pdf')), None)
        image = next((ii for ii in layers if ii.endswith('.image-layer.pdf')), None)

        # Acquire the font lazily from the first text layer seen (and again
        # after each interim save, which resets font to None).
        if text and not font:
            font, font_key = _find_font(text, pdf_base)

        emplaced_page = False
        # page_num is 1-based; pdfinfo and pdf_base.pages are 0-based.
        content_rotation = pdfinfo[page_num - 1].rotation

        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                # Append to import the foreign page, emplace it over the
                # existing page, then drop the temporary appended copy.
                pdf_base.pages.append(foreign_image_page)
                local_image_page = pdf_base.pages[-1]
                pdf_base.pages[page_num - 1].emplace(local_image_page)
                del pdf_base.pages[-1]
            emplaced_page = True

        autorotate_correction = context.get_rotation(page_num - 1)
        if emplaced_page:
            # For an emplaced page the content rotation is taken to be the
            # autorotate correction, so the text misalignment below is 0.
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            '%r',
            [text_rotation, autorotate_correction, text_misaligned, content_rotation],
        )

        if text and font:
            # Graft the text layer onto this page, whether new or old
            strip_old = context.get_options().redo_ocr
            _weave_layers_graft(
                pdf_base=pdf_base,
                page_num=page_num,
                text=text,
                font=font,
                font_key=font_key,
                rotation=text_misaligned,
                procset=procset,
                strip_old_text=strip_old,
                log=log,
            )

        # Correct the rotation if applicable
        pdf_base.pages[page_num - 1].Rotate = (
            content_rotation - autorotate_correction
        ) % 360

        # NOTE(review): this is evaluated on every iteration, so while the
        # counter rests on a multiple of MAX_REPLACE_PAGES each following
        # page without an emplacement triggers another save/reopen.
        if emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
            # back.
            # TODO refactor this to outside the loop
            page0 = pdf_base.pages[0]
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )

            # We cannot read and write the same file, as that will corrupt it,
            # but we don't want to keep more copies than we need to. Delete
            # intermediates.
            # {interim_count} is the opened file we were updating
            # {interim_count - 1} can be deleted
            # {interim_count + 1} is the new file we will produce and open
            old_file = output_file + f'_working{interim_count - 1}.pdf'
            if not context.get_options().keep_temporary_files:
                # On the first pass no such file exists yet; ignore that.
                with suppress(FileNotFoundError):
                    os.unlink(old_file)

            next_file = output_file + f'_working{interim_count + 1}.pdf'
            pdf_base.save(next_file)
            pdf_base.close()

            pdf_base = pikepdf.open(next_file)
            # procset belonged to the closed Pdf; fetch it again from page 1
            # of the reopened file, where it was attached just above.
            procset = pdf_base.pages[0].Resources.ProcSet
            font, font_key = None, None  # Ensure we reacquire this information
            interim_count += 1

    pdf_base.save(output_file)
    pdf_base.close()
예제 #46
0
def linn(resources):
    """Return ``(path, pdf)`` for ``linn.pdf`` located under ``resources``.

    ``pdf`` is the object returned by ``pikepdf.open`` for that path; it is
    handed back still open.
    """
    pdf_path = resources / 'linn.pdf'
    opened_pdf = pikepdf.open(pdf_path)
    return pdf_path, opened_pdf