Пример #1
0
def split_pdf_to_png_files (pdf_file_spec,output_dir):
    """
    Split the PDF file specified by PDF_FILE_SPEC into a series of
    files, each representing a single page as a PNG image. Write files
    to directory specified by OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; otherwise an error is
    logged and the process exits via sys.exit.

    Returns the value produced by pdf.pdf_to_pngs (expected to be a
    sequence of (<file_name>,<page_number>) tuples -- TODO confirm
    against pdf.pdf_to_pngs).

    Any exception raised during conversion is logged, reported on
    stdout, and converted into a sys.exit with the JSON failure
    message.
    """
    try:
        # sanity check
        if not os.path.isabs(pdf_file_spec):
            msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108,[msg],False,files=[pdf_file_spec]))
            # SystemExit does not derive from Exception, so it is not
            # intercepted by the handler below.
            sys.exit(msg)
        else:
            # array of (<file_name>,<page_number>) tuples
            png_specs = pdf.pdf_to_pngs(pdf_file_spec,output_dir)
    except Exception as e:
        msg = json1.json_failed_to_convert_pdf(e,pdf_file_spec)
        lg.error(msg)
        # Single-argument parenthesized form prints identically under
        # Python 2 (print statement) and Python 3 (print function).
        print("failed to convert PDF file(s): %s" % pdf_file_spec)
        print("e: %s" % e)
        lg.info(json1.json_last_log_msg())
        sys.exit(msg)
    else:
        lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec,png_specs))
        return png_specs
Пример #2
0
def invoke_pdfimages_on (pdf_file_spec,output_dir):
    """
    Pull the images embedded in the PDF named by PDF_FILE_SPEC out
    into individual PNG files, written under OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; if it is not, an error is
    logged and the process exits.

    The return value is whatever pdf.pdfimages yields: a list of
    (png_file,png_file_page_number) tuples, with the page number an
    integer, ordered by ascending page number.

    Any exception raised along the way is logged and turned into a
    sys.exit carrying the JSON failure message.
    """
    try:
        # guard: only absolute paths are accepted
        if not os.path.isabs(pdf_file_spec):
            complaint = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108,[complaint],False,files=[pdf_file_spec]))
            sys.exit(complaint)
        extracted = pdf.pdfimages(pdf_file_spec,output_dir)
    except Exception as err:
        lg.debug(str(err))
        failure = json1.json_failed_to_convert_pdf(err,pdf_file_spec)
        lg.error(failure)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure)
    # Reached only on success: the handler above always exits.
    # The PNG file names are not logged here (they would have to be
    # dug out of the tuples), hence None for the png_files argument.
    success_record = json1.json_pdf_to_pngs_success(pdf_file_spec,None)
    lg.info(success_record)
    return extracted
Пример #3
0
def pdf_to_pngs(pdf_file, output_dir):
    """
    Generate PNG files, one corresponding to each page of the PDF file
    PDF_FILE. Write files to directory specified by OUTPUT_DIR. Return
    the result of pdf_to_pngs__pdftoppm (documented by the original
    author as a list of the PNG file names).

    The output file root is the base name of PDF_FILE without its
    directory or extension.

    Raises whatever exception the PDF reader raises while counting
    pages (after logging error 109), and whatever open() raises if
    PDF_FILE cannot be opened.
    """
    input_file_sans_suffix = os.path.splitext(pdf_file)[0]
    outfile_root = os.path.split(input_file_sans_suffix)[1]
    # determine number of pages; the context manager guarantees the
    # file handle is released even when parsing fails
    with open(pdf_file, "rb") as pdf_stream:
        reader = PyPDF2.PdfFileReader(pdf_stream)
        # getNumPages can fail if the PDF, or an object therein, is
        # corrupt
        try:
            number_of_pages = reader.getNumPages()
        except Exception:
            lg.error(
                json1.json_msg(
                    109,
                    "Failure to open or parse a PDF file -- possible indication of a corrupt PDF",
                    None,
                    file=pdf_file))
            # bare raise keeps the original traceback intact
            raise
    lg.info(json1.json_pdf_info(number_of_pages))
    # Qs:
    # 1. advantages/disadvantages of gs and pdftoppm = ?
    # 2. is there really no way to just scan directly from PDF, specifying page number as we go?
    return pdf_to_pngs__pdftoppm(pdf_file, number_of_pages, outfile_root,
                                 output_dir)
Пример #4
0
def barcodeScan(imagePNGPath, scan_region):
    """
    Return None if a barcode was not found. If a barcode was found,
    return a string corresponding to the barcode-encoded data.

    Search within the region defined by SCAN_REGION when SCAN_REGION
    is a list. When SCAN_REGION is a list, it specifies two points as
    [x1,y1,x2,y2]. These two points (x1,y1) and (x2,y2) are pairs
    (x,y) of percentages (each expressed as a value between 0 and 1.0)
    relative to the dimensions of the image; they define the box
    within which the barcode scan occurs. x values are scaled by the
    image width and y values by the image height.

    If SCAN_REGION is not a list, the full image is analyzed. If
    analysis of the full image is desirable, do not set SCAN_REGION to
    [0,0,1,1] but instead set it to None or some other non-list value.

    Exits the process (sys.exit) if any SCAN_REGION value lies outside
    the range [0,1]. Logs a warning when no barcode is found.
    """
    # sanity check(s)
    if not isinstance(scan_region,list):
        scan_region = None
    else:
        for value in scan_region:
            if (value < 0 or value > 1):
                msg = json1.json_msg(999,"insane scan region value",False,None)
                lg.error(msg)
                lg.info(json1.json_last_log_msg())
                sys.exit(msg)
    # Obtain image data via PIL. PIL origin (0,0) is the top left
    # corner. Mode 'L' converts to 8-bit single-channel pixels.
    # (A cv2/numpy based grayscale conversion was an alternative that
    # is not used here.)
    pil = Image.open(imagePNGPath).convert('L')
    pilCropped = pil
    width, height = pil.size
    lg.debug("width: %s height: %s",width,height)
    if scan_region:
        # relative (percentage) values between 0 and 1; min/max make
        # the result independent of the order the two points are given
        x_crop_min = min(scan_region[0],scan_region[2])
        x_crop_max = max(scan_region[0],scan_region[2])
        y_crop_min = min(scan_region[1],scan_region[3])
        y_crop_max = max(scan_region[1],scan_region[3])
        # vertical bounds scale with the image height ...
        cropTop = int(height*y_crop_min)
        cropBottom = int(height*y_crop_max)
        # ... and horizontal bounds scale with the image width
        cropLeft = int(width*x_crop_min)
        cropRight = int(width*x_crop_max)
        # crop box is 4-tuple: left,upper,right,lower
        pilCropBox = (cropLeft,cropTop,cropRight,cropBottom)
        pilCropped = pil.crop(pilCropBox)
    # zbar sometimes catches a barcode at a lower resolution but
    # misses it at a higher resolution, so scan several variants of
    # the (possibly cropped) image.
    barcodeString = barcode_scan_at_resolutions(pilCropped,None)
    if not barcodeString:
        lg.warn(json1.json_barcode_not_found_msg([imagePNGPath],""))
    return barcodeString
Пример #5
0
def pdf_number_of_pages(pdf_file):
    """
    Determine the number of pages in a PDF document. Return an integer.

    PDF_FILE is the path of the PDF to inspect. If the page count
    cannot be obtained, error 109 is logged and the reader's exception
    is re-raised unchanged. Whatever open() raises for an unreadable
    path propagates without being logged.
    """
    # The context manager closes the file handle on both the success
    # and the failure path.
    with open(pdf_file, "rb") as pdf_stream:
        reader = PyPDF2.PdfFileReader(pdf_stream)
        # getNumPages can fail if the PDF, or an object therein, is
        # corrupt
        try:
            return reader.getNumPages()
        except Exception:
            lg.error(
                json1.json_msg(
                    109,
                    "Failure to open or parse a PDF file -- possible indication of a corrupt PDF",
                    None,
                    file=pdf_file))
            # bare raise keeps the original traceback intact
            raise
Пример #6
0
def pdfxcb (pdf_file_spec,output_dir,match_re,rasterize_p):
    """
    Burst the PDF named by PDF_FILE_SPEC at its cover sheets.

    Cover sheets are located by scanning page images for barcodes; the
    PDF is split at each cover sheet and the resulting files, named
    from the cover sheet content, are written to OUTPUT_DIR. When
    MATCH_RE is defined, barcodes whose string does not match the
    regex MATCH_RE are ignored. Pass RASTERIZE_P = False when the PDF
    holds only bitmap data (e.g., it was generated from a scanned
    document) and contains no vector graphics. Returns True.
    """
    sanity_checks([output_dir],[pdf_file_spec])
    # When the PDF is known to come from a scan (bitmap data only),
    # its embedded images can be analyzed directly. When cover sheet
    # pages may carry vector data, rasterization is indicated.
    # See doc/optimization.md for notes on time implications.
    #
    # page_images holds (<PNG file name>, <PDF page number>) members.
    # Not every page of the original PDF is guaranteed to appear, and
    # one page may contribute several PNG images -- e.g. both
    # ("flurpies.png",1) and ("glurpies.png",1).
    #
    # FIXME: consider having a single call here -- FOO -- that specializes on rasterize_p
    if rasterize_p:
        # PNG files are rasterized pages; restrict the barcode scan to
        # a sub-region of each page.
        # Possible future step once pages are rasterized: scan for cue
        # marks (urh_corner_mean with a reasonable "black" threshold)
        # to get the indices of page images that carry a cue mark.
        page_images = split_pdf_to_png_files(pdf_file_spec,output_dir)
        region = [0,0,0.7,0.5]
    else:
        # PNG files are images pulled straight from the PDF (via
        # pdfimages). None is not equivalent to [0,0,1,1]: the latter
        # triggers cropping by barcodeScan.
        page_images = invoke_pdfimages_on(pdf_file_spec,output_dir)
        region = None
    # Downstream code expects ascending page-number order (the sorted
    # default).
    page_images = sorted(page_images, key=lambda entry: entry[1])
    #
    # locate cover sheets
    #
    barcodes, sheet_indices = locate_cover_sheets(page_images,output_dir,match_re,region)
    print(barcodes)
    lg.debug(barcodes)
    lg.debug(sheet_indices)
    # False supports debugging/development; production should use True.
    remove_pngs = False
    if remove_pngs:
        for entry in page_images:
            png_path = os.path.join(output_dir,entry[0])
            os.remove(png_path)
    # write PDFs
    # (len of the PNG list would only be right for rasterized pages,
    # so the page count is read from the PDF itself)
    total_pages = pdf.pdf_number_of_pages(pdf_file_spec)
    ranges = generate_page_ranges(sheet_indices,page_images,total_pages)
    out_names = generate_output_file_names(barcodes,sheet_indices,output_dir)
    lg.debug(out_names)
    pdf.pdf_split(pdf_file_spec,out_names,ranges)
    completion_record = json1.json_msg(
        40,
        ['Analysis and burst completed'],
        False,
        files=out_names,
        data={
            'barcodes': barcodes,
            'indices': sheet_indices
        }
    )
    lg.info(completion_record)
    return True