예제 #1
0
def reorder_pages(src, dst, new_order):
    """
    Write a copy of ``src`` to ``dst`` with its pages rearranged.

    ``new_order`` is a list of dicts, one per page, for example:

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    In the example, a 4-page document gets its first two pages swapped.
    Each entry reads "<page_num> now should be <page_order>", where
    ``page_num`` is the page's previous position and ``page_order`` is
    the position it should have in ``dst``.

    The page count of ``src`` and ``new_order`` are handed to
    ``cat_ranges_for_reorder``; every item it yields is stringified and
    placed between the ``cat`` and ``output`` keywords of the command
    executed through ``run``.
    """
    total_pages = get_pagecount(src)
    ranges = cat_ranges_for_reorder(
        page_count=total_pages,
        new_order=new_order,
    )

    # <binary> <src> cat <range> [<range> ...] output <dst>
    run([
        settings.BINARY_PDFTK,
        src,
        "cat",
        *map(str, ranges),
        "output",
        dst,
    ])
예제 #2
0
def resize_img(page_path, media_root):
    """
    Produce a resized JPEG for the document behind ``page_path``.

    The source file is ``page_path.document_path.url()`` joined onto
    ``media_root``. The result is written to ``<ppmroot>-1.jpg``, where
    ``ppmroot`` is ``page_path.ppmroot`` joined onto ``media_root``; the
    directory holding it is created when it does not exist yet.

    The target width comes from ``page_path.step.width`` and is passed
    as ``"<width>x"`` (width only, no height) to the ``-resize`` option
    of the binary configured as ``settings.BINARY_CONVERT``.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Resizing image {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    target_dir = os.path.dirname(ppmroot)

    width = page_path.step.width

    if os.path.exists(target_dir):
        logger.debug(f"PPMROOT {target_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {target_dir} does not exists. Creating.")
        os.makedirs(target_dir, exist_ok=True)

    # The last argument is the output file; it shares the ppmroot prefix.
    run((
        settings.BINARY_CONVERT,
        "-resize",
        f"{width}x",
        source_file,
        f"{ppmroot}-1.jpg",
    ))
예제 #3
0
def delete_pages(src, dst, page_numbers):
    """
    Run the ``del`` sub-command of ``settings.BINARY_STAPLER`` on ``src``.

    Every item of ``page_numbers`` is stringified and passed, in order,
    between the source path and the destination path ``dst``.
    """
    stapler_args = [settings.BINARY_STAPLER, "del", src]
    stapler_args += [str(number) for number in page_numbers]
    stapler_args.append(dst)

    run(stapler_args)
예제 #4
0
def extract_hocr(page_url, lang, media_root):
    """
    Run the configured OCR binary on a page image, requesting "hocr".

    The input image is ``page_url.img_url()`` under ``media_root``. The
    output argument is ``page_url.hocr_url()`` under ``media_root`` with
    its file extension removed.
    NOTE(review): presumably the OCR tool appends the extension itself;
    confirm against the tool's documentation.

    ``lang`` is forwarded as the value of the ``-l`` option.
    """
    image_path = os.path.join(media_root, page_url.img_url())
    hocr_path = os.path.join(media_root, page_url.hocr_url())
    # Only the extension-less part of the output path is handed over.
    output_base = os.path.splitext(hocr_path)[0]

    run((settings.BINARY_OCR, "-l", lang, image_path, output_base, "hocr"))

    logger.debug(f"OCR for {page_url.img_url()} - Complete.")
    logger.debug(f"OCR Result {page_url.hocr_url()}.")
예제 #5
0
def paste_pages_into_existing_doc(src,
                                  dst,
                                  data_list,
                                  after_page_number=False,
                                  before_page_number=False):
    """
    Insert pages taken from other documents into ``src``, writing ``dst``.

    ``data_list`` is a sequence of dicts; each one must provide:

        'src'       - path of a document to take pages from
        'page_nums' - iterable of page numbers to take from it

    The pages of ``src`` are divided into two groups by ``split_ranges``
    (called with the page count of ``src`` plus ``after_page_number`` and
    ``before_page_number``). The resulting command lists the first
    group, then all pasted pages in ``data_list`` order, then the second
    group. Either group may be empty.

    Handle "A" always denotes ``src``; the donor documents receive the
    handles "B".."Z" in order, so at most 25 donor documents are
    supported (a 26th raises ``IndexError``).
    """
    total_pages = get_pagecount(src)
    head_pages, tail_pages = split_ranges(total=total_pages,
                                          after=after_page_number,
                                          before=before_page_number)

    # "A" is deliberately absent: it is reserved for the existing document.
    donor_handles = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    handle_assignments = [f"A={src}"]
    pasted_pages = []
    for position, entry in enumerate(data_list):
        handle = donor_handles[position]
        donor_path = entry['src']
        donor_page_nums = entry['page_nums']

        handle_assignments.append(f"{handle}={donor_path}")
        pasted_pages.extend(f"{handle}{num}" for num in donor_page_nums)

    pages_before = [f"A{num}" for num in head_pages]
    pages_after = [f"A{num}" for num in tail_pages]

    # <binary> A=<src> B=<doc> ... cat <before> <pasted> <after> output <dst>
    run([
        settings.BINARY_PDFTK,
        *handle_assignments,
        "cat",
        *pages_before,
        *pasted_pages,
        *pages_after,
        "output",
        dst,
    ])
예제 #6
0
def delete_pages(src, dst, page_numbers):
    """
    Write ``src`` to ``dst`` using the page ranges from
    ``cat_ranges_for_delete``.

    The page count of ``src`` and ``page_numbers`` are passed to
    ``cat_ranges_for_delete``; each item it yields is stringified and
    placed between the ``cat`` and ``output`` keywords of the command
    run with ``settings.BINARY_PDFTK``.
    NOTE(review): presumably those ranges are the pages to keep;
    confirm against ``cat_ranges_for_delete``.
    """
    total_pages = get_pagecount(src)
    kept_ranges = cat_ranges_for_delete(total_pages, page_numbers)

    run([
        settings.BINARY_PDFTK,
        src,
        "cat",
        *(str(item) for item in kept_ranges),
        "output",
        dst,
    ])
예제 #7
0
def convert_tiff2pdf(doc_url):
    """
    Convert the file at ``doc_url`` with ``settings.BINARY_CONVERT``.

    The destination path and the new file name are both obtained from
    ``pdfname_from_tiffname(doc_url)``.

    Returns the new file name (the second value of that pair).
    """
    logger.debug(f"convert_tiff2pdf for {doc_url}")

    pdf_url, pdf_filename = pdfname_from_tiffname(doc_url)

    logger.debug(f"tiff2pdf source={doc_url} dest={pdf_url}")

    run((settings.BINARY_CONVERT, doc_url, pdf_url))

    return pdf_filename
예제 #8
0
def extract_txt(page_url, lang, media_root):
    """
    Run the configured OCR binary on a page image.

    The input image is ``page_url.img_url()`` under ``media_root``. The
    output argument is ``page_url.txt_url()`` under ``media_root`` with
    its file extension removed.
    NOTE(review): presumably the OCR tool appends the extension itself;
    confirm against the tool's documentation.

    ``lang`` is forwarded as the value of the ``-l`` option.
    """
    image_path = os.path.join(media_root, page_url.img_url())
    txt_path = os.path.join(media_root, page_url.txt_url())
    # Only the extension-less part of the output path is handed over.
    output_base = os.path.splitext(txt_path)[0]

    run((settings.BINARY_OCR, "-l", lang, image_path, output_base))
예제 #9
0
def extract_txt(page_url, lang, media_root):
    """
    Run the hard-coded ``tesseract`` command on a page image.

    The input image is ``page_url.img_url()`` under ``media_root``. The
    output argument is ``page_url.txt_url()`` under ``media_root`` with
    its file extension removed.
    NOTE(review): presumably tesseract appends the extension itself;
    confirm against its documentation.

    ``lang`` is forwarded as the value of the ``-l`` option.
    """
    image_path = os.path.join(media_root, page_url.img_url())
    txt_path = os.path.join(media_root, page_url.txt_url())
    # Only the extension-less part of the output path is handed over.
    output_base = os.path.splitext(txt_path)[0]

    run(("tesseract", "-l", lang, image_path, output_base))
예제 #10
0
def extract_hocr(page_url, lang, media_root):
    """
    Run the hard-coded ``tesseract`` command on a page image with "hocr".

    The input image is ``page_url.img_url()`` under ``media_root``. The
    output argument is ``page_url.hocr_url()`` under ``media_root`` with
    its file extension removed.
    NOTE(review): presumably tesseract appends the extension itself and
    the trailing "hocr" argument selects its hOCR output; confirm
    against its documentation.

    ``lang`` is forwarded as the value of the ``-l`` option.
    """
    image_path = os.path.join(media_root, page_url.img_url())
    hocr_path = os.path.join(media_root, page_url.hocr_url())
    # Only the extension-less part of the output path is handed over.
    output_base = os.path.splitext(hocr_path)[0]

    run(("tesseract", "-l", lang, image_path, output_base, "hocr"))
예제 #11
0
파일: tiff.py 프로젝트: mtonnie/mglib
def convert_tiff2pdf(doc_url):
    """
    Convert the file at ``doc_url`` with ``settings.BINARY_CONVERT``.

    The destination is ``doc_url`` with its extension replaced by
    ``.pdf``, so the result is written next to the source file.

    Returns the new file name only (no directory part): the base name
    of ``doc_url`` with its extension replaced by ``.pdf``.
    """
    logger.debug(f"convert_tiff2pdf for {doc_url}")

    # File name without directory and without extension.
    name_stem = os.path.splitext(os.path.basename(doc_url))[0]
    # Full path without extension, to keep the output beside the input.
    path_stem = os.path.splitext(doc_url)[0]
    pdf_url = f"{path_stem}.pdf"

    logger.debug(f"tiff2pdf source={doc_url} dest={pdf_url}")

    run((settings.BINARY_CONVERT, doc_url, pdf_url))

    return f"{name_stem}.pdf"
예제 #12
0
def extract_img(page_path, media_root):
    """
    Render one page of a document to a JPEG image.

    The source document is ``page_path.document_path.url()`` under
    ``media_root``. The output prefix is ``page_path.ppmroot`` under
    ``media_root``; the directory holding it is created when missing.

    The binary configured as ``settings.BINARY_PDFTOPPM`` is invoked
    with ``-f`` and ``-l`` both set to ``page_path.page_num``, so only
    that page is rendered. The width is ``page_path.step.width`` and the
    height is given as ``-1``.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Extracing image for {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    target_dir = os.path.dirname(ppmroot)

    page_number = str(page_path.page_num)
    width = str(page_path.step.width)

    if os.path.exists(target_dir):
        logger.debug(f"PPMROOT {target_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {target_dir} does not exists. Creating.")
        os.makedirs(target_dir, exist_ok=True)

    run((
        settings.BINARY_PDFTOPPM,
        "-jpeg",
        # first and last page are identical: render a single page
        "-f", page_number,
        "-l", page_number,
        "-scale-to-x", width,
        # -1 lets the height follow the image ratio
        "-scale-to-y", "-1",
        source_file,
        # output path prefix
        ppmroot,
    ))
예제 #13
0
def paste_pages(src,
                dst,
                data_list,
                dst_doc_is_new=True,
                after_page_number=False,
                before_page_number=False):
    """
    Assemble ``dst`` from pages picked out of other documents.

    ``data_list`` is a sequence of dicts; each one must provide:

        [
            {
                'src': path_of_document_1,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            {
                'src': path_of_document_2,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]

    dst_doc_is_new = True: the destination document is created from the
    listed pages only, in ``data_list`` order. ``src``,
    ``after_page_number`` and ``before_page_number`` are not used.
    Documents receive the handles "A".."Z" in order, so at most 26
    documents are supported (a 27th raises ``IndexError``).

    dst_doc_is_new = False: the call is delegated to
    ``paste_pages_into_existing_doc`` with ``src``, ``dst``,
    ``data_list``, ``after_page_number`` and ``before_page_number``
    forwarded unchanged, and its result is returned.
    NOTE(review): earlier notes stated that ``before_page_number`` takes
    priority over ``after_page_number`` and that pages are appended at
    the end when neither is positive; confirm against ``split_ranges``.
    """
    if dst_doc_is_new:
        handles = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        handle_assignments = []
        picked_pages = []

        for position, entry in enumerate(data_list):
            handle = handles[position]
            donor_path = entry['src']
            donor_page_nums = entry['page_nums']

            handle_assignments.append(f"{handle}={donor_path}")
            picked_pages.extend(f"{handle}{num}" for num in donor_page_nums)

        # <binary> A=<doc1> B=<doc2> ... cat <pages> output <dst>
        run([
            settings.BINARY_PDFTK,
            *handle_assignments,
            "cat",
            *picked_pages,
            "output",
            dst,
        ])
        return None

    return paste_pages_into_existing_doc(
        src=src,
        dst=dst,
        data_list=data_list,
        after_page_number=after_page_number,
        before_page_number=before_page_number,
    )