示例#1
0
def merge_metadata(src, dst):
    '''
    Sets the metadata of dst to the metadata of src pdf files.

    The metadata of ``src`` is dumped by one pdftk process and piped into a
    second pdftk process that writes an updated copy of ``dst`` to a
    temporary file in the current directory.  The temporary file replaces
    ``dst`` only when the updating pdftk process exits successfully;
    otherwise ``dst`` is left untouched.  The temporary file never outlives
    the call.

    .. note:: pdftk is used for extracting and updating the metadata.
    .. note:: assumes ``_`` is ``subprocess.Popen`` (or compatible), i.e.
              the returned objects provide ``wait()`` and ``returncode``.

    :param src: path to the pdf file the metadata is read from
    :param dst: path to the pdf file whose metadata is replaced
    '''
    # Only the unique name is needed: pdftk writes the file itself, so our
    # own handle is released right away instead of being leaked.
    tmp = NamedTemporaryFile(dir=".", delete=False)
    tmp.close()
    try:
        srcm = _(["pdftk", src, "dump_data"], stdout=PIPE)
        dstm = _(["pdftk", dst, "update_info", "-", "output", tmp.name],
                 stdin=srcm.stdout)
        srcm.stdout.close()  # for pipe to work correctly (SIGPIPE)
        dstm.communicate()
        srcm.wait()  # reap the producer so it does not linger as a zombie

        # move temporary file to the actual file, but never overwrite dst
        # with the (possibly empty) output of a failed pdftk run
        if dstm.returncode == 0:
            os.rename(tmp.name, dst)
    finally:
        # After a successful rename the temporary name no longer exists;
        # in every other case remove the leftover file.
        if os.path.exists(tmp.name):
            os.remove(tmp.name)
示例#2
0
def merge_metadata(src, dst):
    '''
    Sets the metadata of dst to the metadata of src pdf files.

    One pdftk process dumps the metadata of ``src``; its output is piped
    into a second pdftk process which writes an updated copy of ``dst`` to a
    temporary file in the current directory.  ``dst`` is replaced by that
    copy only if the second process exits with status 0, so a failed pdftk
    run cannot clobber ``dst``.  The temporary file is always cleaned up.

    .. note:: pdftk is used for extracting and updating the metadata.
    .. note:: assumes ``_`` is ``subprocess.Popen`` (or compatible), i.e.
              the returned objects provide ``wait()`` and ``returncode``.

    :param src: path to the pdf file the metadata is read from
    :param dst: path to the pdf file whose metadata is replaced
    '''
    # pdftk creates the output itself; we only need a unique name, so close
    # our handle immediately rather than leaking the descriptor.
    tmp = NamedTemporaryFile(dir=".", delete=False)
    tmp.close()
    try:
        srcm = _(["pdftk", src, "dump_data"], stdout=PIPE)
        dstm = _(["pdftk", dst, "update_info", "-", "output", tmp.name],
                 stdin=srcm.stdout)
        srcm.stdout.close()  # for pipe to work correctly (SIGPIPE)
        dstm.communicate()
        srcm.wait()  # collect the producer's exit status (no zombie)

        if dstm.returncode == 0:
            # move temporary file to the actual file
            os.rename(tmp.name, dst)
    finally:
        # Nothing to do after a successful rename; otherwise drop the
        # leftover temporary file.
        if os.path.exists(tmp.name):
            os.remove(tmp.name)
示例#3
0
def call_tesseract(file):
    '''
    Runs tesseract with its "hocr" configuration on a single page image
    and waits for it to finish.

    The image path is passed twice: once as the input image and once as
    the output base name, so the result is written alongside the image
    (tesseract appends the extension itself).

    :param file: the path to the page image
    '''
    # tesseract <input image> <output base> <config>
    command = ["tesseract", file, file, "hocr"]
    _(command).communicate()
示例#4
0
def call_tesseract(file):
    '''
    Generates the hocr of one page by invoking tesseract and blocking
    until the process has ended.

    The output base name handed to tesseract is the image path itself;
    tesseract adds its own extension to it.

    :param file: the path to the page image
    '''
    image = output_base = file
    process = _(["tesseract", image, output_base, "hocr"])
    process.communicate()
示例#5
0
def call_hocr(pdffile, imagefile, hocrfile):
    '''
    Invokes hocr2pdf to build a pdf for one page from its image and its
    hocr data.

    The hocr file is read in full and fed to hocr2pdf on standard input.

    :param pdffile: the output pdf filename
    :param imagefile: the input image filename
    :param hocrfile: the input hocr filename
    '''
    command = ["hocr2pdf", "-i", imagefile, "-o", pdffile]
    converter = _(command, stdin=PIPE)
    # NOTE(review): the file is read in text mode; whether the process
    # accepts str on stdin depends on how `_` is defined -- confirm.
    with open(hocrfile, "r") as handle:
        markup = handle.read()
        converter.communicate(input=markup)
示例#6
0
def call_hocr(pdffile, imagefile, hocrfile):
    '''
    Produces a single page pdf by running hocr2pdf on a page image,
    supplying the page's hocr data through the process's standard input.

    :param pdffile: the output pdf filename
    :param imagefile: the input image filename
    :param hocrfile: the input hocr filename
    '''
    converter = _(["hocr2pdf", "-i", imagefile, "-o", pdffile], stdin=PIPE)
    # NOTE(review): text-mode read; whether str is acceptable as stdin input
    # depends on the definition of `_` -- confirm.
    with open(hocrfile, "r") as hocr_stream:
        hocr_text = hocr_stream.read()
        converter.communicate(input=hocr_text)
示例#7
0
def merge_pdfs(pdflist, output):
    '''
    Concatenates several pdf files into one. Pages appear in the output
    in the order the files are listed.

    .. note:: ghostscript with the pdfwrite device is used for this process.

    :param pdflist: list of pdf filenames
    :param output: the name of the output pdf
    '''
    base_options = ["gs", "-dBATCH", "-dNOPAUSE", "-q", "-sDEVICE=pdfwrite"]
    threads_option = "-dNumRenderingThreads=%d" % THREADS
    output_option = "-sOutputFile=" + output
    # PostScript snippet run before the input files; "-f" ends the snippet.
    postscript = ["-c", "30000000 setvmthreshold", "-f"]

    command = base_options + [threads_option, output_option] + postscript
    command = command + pdflist
    _(command).communicate()
示例#8
0
def merge_pdfs(pdflist, output):
    '''
    Joins the given pdf files into a single pdf, keeping the page order
    identical to the order of the list.

    .. note:: ghostscript with the pdfwrite device is used for this process.

    :param pdflist: list of pdf filenames
    :param output: the name of the output pdf
    '''
    command = ["gs", "-dBATCH", "-dNOPAUSE", "-q", "-sDEVICE=pdfwrite"]
    command.append("-dNumRenderingThreads=%d" % THREADS)
    command.append("-sOutputFile=" + output)
    # run a PostScript snippet first, then switch back to file arguments
    command.extend(["-c", "30000000 setvmthreshold", "-f"])
    command = command + pdflist

    process = _(command)
    process.communicate()
示例#9
0
def create_tiffs(pdf):
    '''
    Renders every page of a pdf to its own tiff image, named
    "image<page-number>.tiff".

    .. note:: The images are generated in the current directory.
    .. note:: ghostscript with tiffg4 device is used.

    :param pdf: the path to the pdf file.
    '''
    device_options = ["gs", "-dNOPAUSE", "-sDEVICE=tiffg4"]
    threads_option = "-dNumRenderingThreads=%d" % THREADS
    # "%d" in the output name is left for ghostscript to fill in per page
    page_options = ["-dFirstPage=1", "-sOutputFile=image%d.tiff"]
    resolution_option = "-r%d" % RESOLUTION
    # PostScript snippet before the input, and an explicit "quit" after it
    tail = ["-q", "-c", "30000000 setvmthreshold", "-f", pdf, "-c", "quit"]

    command = (device_options + [threads_option] + page_options
               + [resolution_option] + tail)
    _(command).communicate()
示例#10
0
def create_tiffs(pdf):
    '''
    Splits a pdf into one tiff image per page. The images are called
    "image<page-number>.tiff".

    .. note:: The images are generated in the current directory.
    .. note:: ghostscript with tiffg4 device is used.

    :param pdf: the path to the pdf file.
    '''
    command = ["gs", "-dNOPAUSE", "-sDEVICE=tiffg4"]
    command.append("-dNumRenderingThreads=%d" % THREADS)
    command.append("-dFirstPage=1")
    # ghostscript substitutes the page number for "%d" in the file name
    command.append("-sOutputFile=image%d.tiff")
    command.append("-r%d" % RESOLUTION)
    command.extend(["-q", "-c", "30000000 setvmthreshold"])
    # the input file, followed by an explicit PostScript "quit"
    command.extend(["-f", pdf, "-c", "quit"])

    process = _(command)
    process.communicate()
示例#11
0
def which(exe):
    '''
    Looks up an executable by running ``where.exe`` and returns its path.

    The captured standard output of ``where.exe`` is decoded, stripped and
    turned into a resolved path object; when that path is an existing file
    its string form is returned.

    NOTE(review): ``_`` and ``_p`` are defined outside this snippet; the
    call shapes suggest ``subprocess.run`` and ``pathlib.Path`` -- confirm.
    NOTE(review): the whole stripped output is treated as ONE path; if
    ``where.exe`` prints several matches the ``is_file()`` check presumably
    fails -- confirm.

    :param exe: the name of the executable to look for
    '''
    t = _p(_(["where.exe", exe],
             capture_output=True).stdout.decode().strip()).resolve()
    # No other return is visible in this snippet, so a path that is not a
    # file presumably yields None -- confirm against the full function.
    if t.is_file():
        return str(t)