def convert(file_name):
    """Make a document look scanned, then run it through Ghostscript.

    The input is rendered at 150 dpi, converted to greyscale, contrast
    stretched, slightly blurred, given gaussian noise and rotated by 0.5
    degrees, and saved as ``<stem>_.pdf`` next to the input.  Ghostscript's
    ``pdfwrite`` device then rewrites that file into ``<stem>__.pdf``, which
    is removed again afterwards.

    :param file_name: path of the document to process.
    :returns: ``None``.  Any exception is caught and printed.
    """
    try:
        source = Path(file_name)
        orig_file = source.resolve()
        # Build the sibling names from the last path component only.  Cutting
        # the raw string at its first "." mangled inputs such as "./scan.pdf"
        # (empty prefix) or "report.v2.pdf" (truncated name).
        output_path = source.with_name(f"{source.stem}_.pdf").resolve()
        output_path_temp = source.with_name(f"{source.stem}__.pdf").resolve()

        with Image(filename=str(orig_file), resolution=150) as img:
            img.transform_colorspace('gray')
            img.linear_stretch(black_point=0.035, white_point=0.1)
            img.blur(radius=0, sigma=0.5)
            img.noise(noise_type='gaussian', attenuate=0.25)
            img.rotate(0.5)
            img.save(filename=str(output_path))

        cmd_gs = [
            'gs',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            '-dNOCACHE',
            '-sDEVICE=pdfwrite',
            '-sColorConversionStrategy=LeaveColorUnchanged',
            '-dAutoFilterColorImages=true',
            '-dAutoFilterGrayImages=true',
            '-dDownsampleMonoImages=true',
            '-dDownsampleGrayImages=true',
            '-dDownsampleColorImages=true',
            f'-sOutputFile={output_path_temp}',
            str(output_path),
        ]
        encoding = locale.getpreferredencoding()
        cmd_gs = [a.encode(encoding) for a in cmd_gs]
        ghostscript.Ghostscript(*cmd_gs)

        # NOTE(review): the Ghostscript result is deleted right after it is
        # written, leaving only the Image output -- confirm this is intended
        # rather than replacing output_path with the rewritten file.
        os.remove(str(output_path_temp))
        click.secho("File processed and saved", fg="green")
    except Exception as e:
        print(e)
def convert_pdf2png(bucket, pdf_blob):
    """Render every page of a stored PDF to PNG and upload the images.

    The blob is downloaded to a temporary file, rasterised at 100 dpi with
    Ghostscript's ``pngalpha`` device into a temporary directory, and each
    resulting PNG is uploaded to ``bucket`` under
    ``<first 4 chars of the blob name>-images/`` and made public.

    :param bucket: object providing ``blob(name)`` for the upload targets.
    :param pdf_blob: object providing ``name`` and ``download_to_file``.
    """
    # download the PDF file to a temp file
    print("Downloading PDF: {}".format(pdf_blob.name))
    pdf_fd, pdf_file_name = tempfile.mkstemp()
    # mkstemp() hands back an already-open descriptor; wrap it so it is closed
    # together with the file object instead of leaking.
    with os.fdopen(pdf_fd, "w+b") as pdf_file:
        pdf_blob.download_to_file(pdf_file)

    # convert the PDF to PNG
    print("Converting PDF to PNGs for {}".format(pdf_blob.name))
    pdf_prefix = pdf_blob.name.replace(".pdf", "")[:4]
    # The call parentheses matter: without them the function object itself was
    # formatted into the output path.
    png_tempdir = tempfile.mkdtemp()
    args = [
        "pdf2png",
        "-dSAFER",
        "-sDEVICE=pngalpha",
        "-r100",
        "-sOutputFile={}/%03d.png".format(png_tempdir),
        pdf_file_name,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    ghostscript.Ghostscript(*args)

    # save the PNGs on GCP
    print("Saving PNGs for {}".format(pdf_blob.name))
    for f in glob.glob(png_tempdir + "/*"):
        png_blob = bucket.blob(pdf_prefix + "-images/" + os.path.split(f)[1])
        png_blob.upload_from_filename(f, content_type="image/png")
        png_blob.make_public()
        os.remove(f)
    print("Ended converting PDF to PNGs for {}".format(pdf_blob.name))
    os.remove(pdf_file_name)


# merging both main and test_tutorial modules; the trigger function
# would be called in a function
# https://cloud.google.com/functions/docs/tutorials/ocr
def generate_thumbnail_from_pdf(self, document):
    """Generate a PNG thumbnail from the first page of the document's file.

    The document's file is copied to the temporary path returned by
    ``self.get_thumbnail_path``, Ghostscript renders page 1 of it to
    ``<temporary path>.png`` at 10 dpi, and the temporary copy is deleted.

    :param document: object providing ``get_file_obj()``.
    """
    thumbnail_temporary, thumbnail_directory = self.get_thumbnail_path(
        document)
    # Creating directory for thumbnail if not exists
    if not os.path.exists(thumbnail_directory):
        os.makedirs(thumbnail_directory)
    # Storing temporary PDF file for converting.  Binary mode keeps the PDF
    # bytes intact, and the context manager closes the handle even when the
    # read or write fails.
    with open(thumbnail_temporary, 'wb') as tmp_pdf:
        tmp_pdf.write(document.get_file_obj().read())
    args = [
        'gs',
        '-q',  # Quiet
        '-dSAFER',
        '-sDEVICE=png16m',  # Type. PNG used
        '-r10',  # resolution of the thumbnail
        '-dBATCH',  # Quit GS after converting
        '-dNOPAUSE',  # Do not stop on pages
        '-dFirstPage=1',
        '-dLastPage=1',
        '-sOutputFile=%s.png' % thumbnail_temporary,  # Destination
        '%s' % thumbnail_temporary,  # Source
    ]
    ghostscript.Ghostscript(*args)
    # Deleting the temp PDF
    os.unlink(thumbnail_temporary)
def convert_pdf2png(bucket, pdf_blob):
    """Render every page of a stored PDF to PNG and upload the images.

    The blob is downloaded to a temporary file, rasterised at 100 dpi with
    Ghostscript's ``pngalpha`` device into a temporary directory, and each
    resulting PNG is uploaded to ``bucket`` under
    ``<first 4 chars of the blob name>-images/`` and made public.

    :param bucket: object providing ``blob(name)`` for the upload targets.
    :param pdf_blob: object providing ``name`` and ``download_to_file``.
    """
    # download the PDF file to a temp file
    print("Downloading PDF: {}".format(pdf_blob.name))
    pdf_fd, pdf_file_name = tempfile.mkstemp()
    # mkstemp() hands back an already-open descriptor; wrap it so it is closed
    # together with the file object instead of leaking.
    with os.fdopen(pdf_fd, "w+b") as pdf_file:
        pdf_blob.download_to_file(pdf_file)

    # convert the PDF to PNGs
    print("Converting PDF to PNGs for {}".format(pdf_blob.name))
    pdf_prefix = pdf_blob.name.replace(".pdf", "")[:4]
    png_tempdir = tempfile.mkdtemp()
    args = [
        "pdf2png",
        "-dSAFER",
        "-sDEVICE=pngalpha",
        "-r100",
        "-sOutputFile={}/%03d.png".format(png_tempdir),
        pdf_file_name,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    ghostscript.Ghostscript(*args)

    # save the PNGs on GCS
    print("Saving PNGs for {}".format(pdf_blob.name))
    for f in glob.glob(png_tempdir + "/*"):
        png_blob = bucket.blob(pdf_prefix + "-images/" + os.path.split(f)[1])
        png_blob.upload_from_filename(f, content_type="image/png")
        png_blob.make_public()
        os.remove(f)
    print("Ended converting PDF to PNGs for {}".format(pdf_blob.name))
    os.remove(pdf_file_name)
def pdf_to_images(filepath, output_folder):
    """Split a PDF file and make images of the individual pages

    Pages are rendered as ``page_NNN.jpg`` (200 dpi, JPEG quality 60) into
    ``output_folder``.

    :filepath: Path to PDF file
    :output_folder: Directory that receives the page images
    :returns: tuple with output_folder and resulting file count, or ``None``
        when Ghostscript fails (the error is logged and printed)
    """
    first_page = "1"
    page_pattern = os.path.join(output_folder, "page_%03d.jpg")
    args = [
        # Program-name slot: Ghostscript skips the first argument, so the
        # real switches must start in the second position.
        "pdf_to_images",
        "-dNOPAUSE",
        "-dBATCH",
        "-dJPEGQ=60",
        "-r200",
        "-dFirstPage=" + first_page,
        "-sDEVICE=jpeg",
        "-sOutputFile=" + page_pattern,
        filepath,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    try:
        logging.info(args)
        ghostscript.Ghostscript(*args)
        logging.info("PDF file split")
        # Count the rendered pages; globbing the bare folder name only ever
        # matched the folder itself.
        pages = glob.glob(os.path.join(output_folder, "page_*.jpg"))
        return (output_folder, len(pages))
    except Exception as ex:
        logging.exception("Splitting %s into page images failed", filepath)
        print(ex)
def print_pdf_put_view(request):
    """Print the PDF sent in the request body.

    The body is stored as ``file_to_print.pdf`` in a temporary directory and
    sent to the printer named by the ``printer`` request parameter, falling
    back to ``get_default_printer()``.  On Windows the job goes through
    Ghostscript's ``mswinpr2`` device; elsewhere through CUPS.

    :param request: object providing ``params``, ``body`` and ``response``.
    :returns: a 400 ``Response`` when the body is empty, otherwise
        ``request.response`` with status 202 and a permissive CORS header.
    """
    printer = request.params.get("printer") or get_default_printer()
    if not request.body:
        return Response("Bad Request", status=400)

    with tempfile.TemporaryDirectory(suffix="lmu.localprintservice") as tmp_dir:
        spool_dir = os.path.abspath(tmp_dir)
        with open(os.path.join(spool_dir, "file_to_print.pdf"), "w+b") as pdf:
            pdf.write(request.body)

        if sys.platform == "win32":
            files_to_print = glob.glob(
                os.path.join(spool_dir, "*.pdf"))[0].replace('\\\\', '\\')
            import ghostscript
            args = [
                # Program-name slot: Ghostscript skips the first argument, so
                # "-dPrinted" must not be first.
                "print_pdf",
                "-dPrinted",
                "-dBATCH",
                "-dNOSAFER",
                "-dNOPAUSE",
                # The comma after this entry was missing, which fused it with
                # "-q" into the single bogus switch "-dNOPROMPT-q".
                "-dNOPROMPT",
                "-q",
                "-dNumCopies#1",
                "-sDEVICE#mswinpr2",
                f'-sOutputFile#"%printer%{printer}"',
                f'"{files_to_print}"',
            ]
            encoding = locale.getpreferredencoding()
            args = [a.encode(encoding) for a in args]
            ghostscript.Ghostscript(*args)
        else:
            files_to_print = glob.glob(os.path.join(spool_dir, "*.pdf"))
            import cups
            conn = cups.Connection()
            # NOTE(review): `options` is not defined in this function; it is
            # presumably a module-level dict of CUPS job options -- confirm.
            conn.printFiles(printer, files_to_print, "Test", options)

    request.response.status = 202
    request.response.headers.update({
        'Access-Control-Allow-Origin': '*',
    })
    return request.response
def print_pdf(data=None):
    """Render the slip template to PDF and send it to the default printer.

    ``data['investigation']`` and ``data['advice']`` are JSON strings; they
    are decoded into tuples in place before the ``print/slip.html`` template
    is rendered.  The PDF is written to a temporary file and printed through
    Ghostscript's ``mswinpr2`` device.

    :param data: mapping with at least ``investigation`` and ``advice`` keys.
    :returns: ``True`` once Ghostscript has been invoked.
    """
    data['investigation'] = tuple(json.loads(data['investigation']))
    data['advice'] = tuple(json.loads(data['advice']))
    context = {
        'data': data,
    }
    pdf = render_to_pdf('print/slip.html', context)

    # Create the file atomically instead of using the race-prone mktemp(),
    # and close it before Ghostscript opens it by name.
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f1:
        f1.write(pdf)
    temp1 = f1.name

    args = [
        # Program-name slot: Ghostscript skips the first argument, so
        # "-dPrinted" must not be first.
        "print_pdf",
        "-dPrinted",
        "-dBATCH",
        "-dNOSAFER",
        "-dNOPAUSE",
        # The comma after this entry was missing, which fused it with "-q"
        # into the single bogus switch "-dNOPROMPT-q".
        "-dNOPROMPT",
        "-q",
        "-dNumCopies#1",
        "-sDEVICE#mswinpr2",
        f'-sOutputFile#"%printer%{win32print.GetDefaultPrinter()}"',
        f'"{temp1}"',
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    ghostscript.Ghostscript(*args)
    return True
def pdf2jpeg(pdf_input_path, jpeg_output_path):
    """Render a PDF to JPEG at 144 dpi with Ghostscript's ``jpeg`` device.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_output_path: value handed to ``-sOutputFile``.
    """
    codec = locale.getpreferredencoding()
    switches = (
        "pef2jpeg",
        "-dNOPAUSE",
        "-sDEVICE=jpeg",
        "-r144",
        "-sOutputFile=" + jpeg_output_path,
        pdf_input_path,
    )
    # The arguments are handed over as bytes in the locale's encoding.
    ghostscript.Ghostscript(*(switch.encode(codec) for switch in switches))
def pdf2jpeg(self, PDFDirectory, JPEGDirectory):
    """Render a PDF to JPEG at 144 dpi with Ghostscript's ``jpeg`` device.

    :param PDFDirectory: input passed to Ghostscript as the file to read.
    :param JPEGDirectory: value handed to ``-sOutputFile``.
    """
    output_switch = "-sOutputFile=" + JPEGDirectory
    codec = locale.getpreferredencoding()
    encoded = []
    for switch in ("pef2jpeg", "-dNOPAUSE", "-sDEVICE=jpeg", "-r144",
                   output_switch, PDFDirectory):
        # The arguments are handed over as bytes in the locale's encoding.
        encoded.append(switch.encode(codec))
    ghostscript.Ghostscript(*encoded)
def Slide_Extractor():
    """Convert uploaded presentations to one PNG per slide.

    Steps, all relative to the current working directory:

    1. delete every file in ``./media``;
    2. convert each ``.ppt``/``.pptx`` in ``./uploads`` to PDF with
       ``unoconv``;
    3. merge every PDF found in ``./uploads`` into ``./media/combine.pdf``;
    4. render that PDF to ``./media/image<N>.png`` with Ghostscript.

    The working directory is changed while this runs and is restored
    afterwards, also when a step fails.

    :returns: number of ``.png`` files in ``./media`` after rendering.
    :raises OSError: if ``unoconv`` cannot be started.
    """
    import subprocess

    for stale in glob.glob('./media/*'):
        os.remove(stale)

    ppt_list = [
        name for name in os.listdir('./uploads')
        if name.endswith(('.pptx', '.ppt'))
    ]

    start_dir = os.getcwd()
    uploads_dir = start_dir + '/uploads'
    try:
        os.chdir(uploads_dir)
        for element in ppt_list:
            # An argument list bypasses the shell, so names containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.run(['unoconv', '-f', 'pdf', element])

        merger = PdfFileMerger()
        try:
            for pdf in os.listdir():
                if pdf.endswith('.pdf'):
                    merger.append(pdf)
            os.chdir(uploads_dir + '/../media')
            merger.write("combine.pdf")
        finally:
            # Release the input files the merger keeps open.
            merger.close()

        args = ["gs", "-q", "-o", "image%d.png", "-sDEVICE=pngalpha",
                "combine.pdf"]
        encoding = locale.getpreferredencoding()
        args = [a.encode(encoding) for a in args]
        ghostscript.Ghostscript(*args)

        image_list = [
            image for image in os.listdir() if image.endswith('.png')
        ]
    finally:
        os.chdir(start_dir)
    return len(image_list)
def pdf2jpeg(pdf_input_path, jpeg_output_path):
    """Render a PDF to JPEG at 144 dpi with Ghostscript's ``jpeg`` device.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_output_path: value handed to ``-sOutputFile``.
    """
    output_switch = "-sOutputFile=" + jpeg_output_path
    ghostscript.Ghostscript(
        "pdf2jpeg",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=jpeg",
        "-r144",
        output_switch,
        pdf_input_path,
    )
def pdf_printer(self, pdf_input_path):
    """Send a PDF to Ghostscript's ``mswinpr2`` device, then clean up.

    :param pdf_input_path: path of the PDF to print.
    """
    codec = locale.getpreferredencoding()
    switches = ('pdf_printer', '-dNOPAUSE', '-sDEVICE=mswinpr2',
                pdf_input_path)
    # The arguments are handed over as bytes in the locale's encoding.
    ghostscript.Ghostscript(*map(lambda switch: switch.encode(codec), switches))
    ghostscript.cleanup()
def load_paper(target_path, target_pdf):
    """Rasterise a paper to PNG pages and strip its corner markers.

    Ghostscript renders ``target_path`` to ``<n>.png`` files in
    ``PROCESSED_IMG_PATH/<name>``, where ``<name>`` is the last path
    component of ``target_pdf`` up to its first dot.  For every PNG found
    there the four corner templates are located, removed from the greyscale
    image and the border positions computed; the greyscale image is then
    written back over the PNG.  A page on which any of these steps raises is
    left untouched.

    :param target_path: path of the PDF handed to Ghostscript.
    :param target_pdf: "/"-separated name used to derive the output folder.
    :returns: list with the path of every PNG in the output folder,
        including pages that were skipped.
    """
    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(target_pdf)
    file_name = target_pdf.split("/")[-1].split(".")[0]
    target_directory = PROCESSED_IMG_PATH + "/" + file_name
    if not os.path.exists(target_directory):
        os.mkdir(target_directory)

    gs_args = [
        # Program-name slot: Ghostscript skips the first argument, so "-q"
        # was silently dropped while it was first.
        "load_paper",
        "-q",
        "-dNOPAUSE",
        "-dBATCH",
        "-dNOPROMPT",
        "-dNOSAFER",
        "-sDEVICE=png16m",
        "-sOutputFile=" + target_directory + "/%d.png",
        target_path,
    ]
    ghostscript.Ghostscript(*gs_args)

    images = [
        image for image in os.listdir(target_directory)
        if image.endswith(".png")
    ]
    result_image_path = []
    for image in images:
        image_path = target_directory + '/' + image
        result_image_path.append(image_path)
        img = cv2.imread(image_path, 0)
        # Colour copy used only as the drawing surface for the rectangles
        # below; it is never written to disk.
        img_rgb = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        try:
            top_right = templateMatcher.detectSymbols(
                img, img_rgb, TEMPLATE_TOP_RIGHT)
            top_left = templateMatcher.detectSymbols(
                img, img_rgb, TEMPLATE_TOP_LEFT)
            bottom_right = templateMatcher.detectSymbols(
                img, img_rgb, TEMPLATE_BOTTOM_RIGHT)
            bottom_left = templateMatcher.detectSymbols(
                img, img_rgb, TEMPLATE_BOTTOM_LEFT)
            corner_symbols = __extract_border_marker(
                top_right, top_left, bottom_right, bottom_left)
            for symbol, coord in corner_symbols.items():
                w, h = TEMPLATE[symbol].shape[::-1]
                cv2.rectangle(img_rgb, tuple(coord),
                              (coord[0] + w, coord[1] + h), (255, 0, 0), 1)
                templateMatcher.removeDetected(img, coord[1], coord[0], w, h)
            border_pos = __find_border(img,
                                       corner_symbols["top_right"],
                                       corner_symbols["top_left"],
                                       corner_symbols["bottom_right"],
                                       corner_symbols["bottom_left"])
            for pos in border_pos:
                coord = border_pos[pos]
                cv2.rectangle(img_rgb, tuple(coord),
                              (coord[0] + 1, coord[1] + 1), (0, 255, 0), 1)
        except Exception:
            # Best effort: skip pages where marker detection fails.
            continue
        cv2.imwrite(image_path, img)
    return result_image_path
def ai2jpegGs(pdf_input_path, jpeg_output_path):
    """Render the input file to JPEG at 144 dpi via Ghostscript.

    :param pdf_input_path: path of the file Ghostscript reads.
    :param jpeg_output_path: value handed to ``-sOutputFile``.
    """
    codec = locale.getpreferredencoding()
    command = (
        "pdf2jpeg",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=jpeg",
        "-r144",
        "-sOutputFile=" + jpeg_output_path,
        pdf_input_path,
    )
    encoded = tuple(part.encode(codec) for part in command)
    ghostscript.Ghostscript(*encoded)
def pdf_to_jpg(pdf_input_path, jpeg_name):
    """Render each page of a PDF to ``<jpeg_name>-NNN.jpg`` at 300 dpi.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_name: output path prefix; ``-%03d.jpg`` is appended.
    """
    args = [
        "pdf2jpeg",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=jpeg",
        "-r300",
        f'-sOutputFile={jpeg_name}-%03d.jpg',
        pdf_input_path,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    try:
        # Leaving the `with` block exits the interpreter ...
        with ghostscript.Ghostscript(*args):
            pass
    finally:
        # ... and only then may the instance be deleted.  Calling cleanup()
        # inside the block deleted the instance before it had been exited.
        ghostscript.cleanup()
def pdf2jpeg(pdf_input_path, jpeg_output_path):
    """Render a PDF at 300 dpi with Ghostscript's ``png16m`` device.

    Uses eight rendering threads and enlarged buffer spaces.  The switches
    are printed before Ghostscript runs.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_output_path: output file handed to ``-o``.
    """
    # Build the argument vector directly.  The previous string form fused
    # "-dBandBufferSpace=500000000" with "-q" (no separating space) and split
    # any path containing whitespace into several arguments.
    args = [
        # Program-name slot: Ghostscript skips the first argument.
        "pdf2jpeg",
        "-dNumRenderingThreads=8",
        "-dBufferSpace=2000000000",
        "-dBandBufferSpace=500000000",
        "-q",
        "-sDEVICE=png16m",
        "-o",
        str(jpeg_output_path),
        "-r300",
        str(pdf_input_path),
    ]
    print(" ".join(args[1:]))
    ghostscript.Ghostscript(*[arg.encode() for arg in args])
def change_format_and_ocr(pdf_input_path, filename):
    """Render a PDF to ``<filename>.jpeg`` and run ``image_to_text`` on it.

    :param pdf_input_path: path of the PDF to read.
    :param filename: output path without extension; ``.jpeg`` is appended.
    :returns: whatever ``image_to_text`` returns for the rendered image.
    """
    jpeg_output_path = filename + ".jpeg"
    codec = locale.getpreferredencoding()
    command = (
        "pdf2jpeg",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=jpeg",
        "-r144",
        "-sOutputFile=" + jpeg_output_path,
        pdf_input_path,
    )
    ghostscript.Ghostscript(*(part.encode(codec) for part in command))
    return image_to_text(jpeg_output_path)
def pdf2jpeg(pdf_input_path, output_path):
    """Render a PDF with Ghostscript's ``png16m`` device at 200 dpi.

    :param pdf_input_path: path of the PDF to read.
    :param output_path: value handed to ``-sOutputFile``.
    """
    command = (
        "pdf2jpeg",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=png16m",  # png 24 bit rgb color
        "-r200",  # input rendering 200 dpi
        "-dDownScaleFactor=1",  # make .png file 200 dpi as well
        "-sOutputFile=" + output_path,
        pdf_input_path,
    )
    # arguments have to be bytes, encode them
    codec = locale.getpreferredencoding()
    encoded = list(map(lambda part: part.encode(codec), command))
    ghostscript.Ghostscript(*encoded)
def pdf2jpeg(pdf_input_path, jpeg_output_path):
    """Render a PDF with Ghostscript's ``png16m`` device at 144 dpi.

    ``-dPDFFitPage`` is passed in addition to the usual batch switches.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_output_path: value handed to ``-sOutputFile``.
    """
    codec = locale.getpreferredencoding()
    command = (
        "gs",  # actual value doesn't matter
        "-dNOPAUSE",
        "-dBATCH",
        "-sDEVICE=png16m",
        "-r144",
        "-dPDFFitPage",
        # "-dFirstPage=" + page,
        # "-dLastPage=" + page,
        "-sOutputFile=" + jpeg_output_path,
        pdf_input_path,
    )
    ghostscript.Ghostscript(*(part.encode(codec) for part in command))
def convert_to_image(self):
    """
    Render ``self.pdf_file_spec`` to image files with Ghostscript.

    Prints a message and does nothing else when the PDF does not exist.
    Otherwise the conversion is run, its wall-clock duration is stored in
    ``self.conversion_duration`` and printed.  Ghostscript errors are
    printed, not raised.

    :return: self (allows chaining of methods, since the methods do not
     return any additional info).
    """
    # Specified PDF was not found.
    if not os.path.exists(self.pdf_file_spec):
        print(f"{self.name}: Unable to find '{self.pdf_file_spec}'")
        return self

    started_at = perf_counter()
    output_pattern = os.path.abspath(
        f"{self.output_file}-%00d.{self.extension}")
    codec = locale.getpreferredencoding()
    switches = (
        "pdf2tiff",
        "-dNOPAUSE",
        "-dSAFER",
        "-dBATCH",
        f"-dNumRenderingThreads={self.threads}",
        "-q",
        f"-sDEVICE={self.IMAGE_FORMAT}",
        f"-r{self.dpi}",
        f"-sOutputFile={output_pattern}",
        f"{self.pdf_file_spec}",
    )
    encoded = [switch.encode(codec) for switch in switches]

    # Convert the PDF to the TIFF (Need to clean up instance after execution,
    # to allow conversion of additional documents)
    try:
        interpreter = ghostscript.Ghostscript(*encoded)
        interpreter.exit()
        ghostscript.cleanup()
    except Exception as exc:
        print(f"\tERROR ({self.name}): Exception: {exc}")

    # Measure time to convert the PDF to image files.
    self.conversion_duration = perf_counter() - started_at
    print(
        f"{self.name}: Conversion took: {self.conversion_duration:0.4f} seconds."
    )
    return self
def pdf2png(pdf_input_path, png_output_path):
    """
    Convert a PDF into PNG with Ghostscript at 144 dpi.

    :param pdf_input_path: path of the PDF file to read
    :param png_output_path: path of the PNG file to write
    :type pdf_input_path: str
    :type png_output_path: str
    :returns: nothing; the PNG is written to ``png_output_path``
    :rtype: None
    """
    args = [
        "pdf2png",  # actual value doesn't matter
        "-dNOPAUSE",
        # Ghostscript has no device called "png"; png16m is its 24-bit RGB
        # PNG device.
        "-sDEVICE=png16m",
        "-r144",
        "-sOutputFile=" + png_output_path,
        pdf_input_path,
    ]
    ghostscript.Ghostscript(*args)
def pdf2jpeg(self, pdf_list):
    """Render every PDF in ``pdf_list`` to numbered JPEG pages.

    Each file is read from ``self.dir_path`` and rendered at 144 dpi to
    ``<self.temp_dir>/<name without extension>NNN.jpeg``.

    :param pdf_list: file names relative to ``self.dir_path``.
    """
    import ghostscript
    import os

    for pdf_name in pdf_list:
        # Strip only the extension; cutting at the first dot truncated names
        # such as "report.v2.pdf".
        basename = os.path.splitext(pdf_name)[0]
        # %03d will increment the file name
        output_pattern = os.path.join(self.temp_dir, basename + "%03d.jpeg")
        # Call form prints the same text on Python 2 and also parses on
        # Python 3.
        print(basename)
        print(output_pattern)
        args = [
            "pdf2jpeg",  # actual value doesn't matter
            "-dNOPAUSE",
            "-sDEVICE=jpeg",
            "-r144",
            "-sOutputFile=" + output_pattern,
            os.path.join(self.dir_path, pdf_name),
        ]
        ghostscript.Ghostscript(*args)
def main():
    '''Render every file in ``files_in_directory`` to ``<file>.jpg``.

    Input and output both live in ``directory_in_str``; each input path is
    printed before Ghostscript renders it at 144 dpi.
    '''
    for entry in files_in_directory:
        source = directory_in_str + '/' + entry  # input path
        print(source)
        target = source + '.jpg'  # output path
        ghostscript.Ghostscript(
            "gs",  # actual value doesn't matter
            "-dNOPAUSE",
            "-sDEVICE=jpeg",
            "-r144",
            "-sOutputFile=" + target,
            source,
        )
def pdf2png(self, pdf_input_path, png_output_path):
    """Render a PDF to monochrome PNG at 300 dpi with Ghostscript.

    Failures are reported on stdout and not raised.

    :param pdf_input_path: path of the PDF to read.
    :param png_output_path: value handed to ``-sOutputFile``.
    """
    args = [
        "pdf2png",  # actual value doesn't matter
        "-dNOPAUSE",
        "-sDEVICE=pngmono",
        "-r300",
        "-sOutputFile=" + png_output_path,
        pdf_input_path,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    try:
        ghostscript.Ghostscript(*args)
        ghostscript.cleanup()
    except Exception as exc:
        # Report the error that was actually raised; the previous handler
        # printed the GhostscriptError class itself and also swallowed
        # KeyboardInterrupt/SystemExit.
        print("Erro", exc)
def convert(self, pdf_path, png_path, resolution=300):
    """Render ``pdf_path`` to ``png_path`` with Ghostscript's ``png16m`` device.

    :param pdf_path: path of the PDF to read.
    :param png_path: output file handed to ``-o``.
    :param resolution: rendering resolution passed as ``-r``.
    :raises OSError: if ``self.installed()`` reports Ghostscript as missing.
    """
    if self.installed():
        # Imported lazily, only after the availability check has passed.
        import ghostscript

        ghostscript.Ghostscript(
            "gs",
            "-q",
            "-sDEVICE=png16m",
            "-o",
            png_path,
            f"-r{resolution}",
            pdf_path,
        )
        return

    raise OSError(
        "Ghostscript is not installed. You can install it using the instructions"
        " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
    )
def start(self, source_path, output_path):
    """Render a PDF's pages into a freshly created ``output_path`` folder.

    Does nothing when ``source_path`` does not contain ``".pdf"``.  Otherwise
    any existing ``output_path`` is removed and recreated, and Ghostscript
    writes ``page-NNN.jpg`` files into it using the ``png16m`` device at
    50x50 dpi.

    :param source_path: path of the PDF to read.
    :param output_path: directory that receives the page images.
    """
    if ".pdf" not in source_path:
        return

    # Start from an empty output directory.
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    codec = locale.getpreferredencoding()
    switches = (
        'pdf2jpeg',
        '-dNOPAUSE',
        '-dBATCH',
        '-dSAFER',
        '-sDEVICE=png16m',
        '-r50x50',
        '-sOutputFile=' + output_path + '/page-%03d.jpg',
        source_path,
    )
    ghostscript.Ghostscript(*(switch.encode(codec) for switch in switches))
def _run_ghostscript(obj, device, outfile, pages=(1, 1)):
    """Render ``obj.src.path`` to ``outfile`` with the given Ghostscript device.

    :param obj: object providing ``src.size``, ``src.path``, ``width`` and
        ``height``.
    :param device: Ghostscript device name.
    :param outfile: output file handed to ``-o``.
    :param pages: ``(first, last)`` page range; ``(0, 0)`` disables the
        page-range switches.
    :raises Exception: if the source file is empty.
    :raises InvalidPageError: if a page range was requested and
        Ghostscript's output mentions ``FirstPage`` or ``LastPage``.
    """
    # An empty file is apparently a valid file as far as ghostscript is
    # concerned. However, it produces an empty image file, which causes
    # errors downline. Detect an empty file and raise here.
    if not obj.src.size:
        raise Exception('Invalid file size 0')

    page_range_requested = pages != (0, 0)

    args = [
        b'-dNOPAUSE',
        b'-dBATCH',
        b'-dSAFER',
        b'-sDEVICE=%s' % bytes(device, 'utf8'),
    ]
    if page_range_requested:
        first_page, last_page = pages[0], pages[1]
        args.append(b'-dFirstPage=%i' % first_page)
        args.append(b'-dLastPage=%i' % last_page)

    # Calculate suitable DPI...
    dpi = _calc_dpi(obj.width, obj.height)
    LOGGER.debug('Converting PDF to image with DPI of %ix%i', *dpi)
    args += [
        b'-r%ix%i' % dpi,
        b'-o',
        bytes(outfile, 'utf8'),
        bytes(obj.src.path, 'utf8'),
    ]
    LOGGER.debug('Ghostscript args: %s', args)

    # TODO: fix this lib. You cannot clean up the object with try / except if
    # __init__() raises.
    captured = BytesIO()
    with ghostscript.Ghostscript(stdout=captured, stderr=captured, *args):
        pass

    # Checkout output for errors that require special handling.
    text = captured.getvalue()
    if page_range_requested and (b'FirstPage' in text or b'LastPage' in text):
        raise InvalidPageError(pages)
def pdf2jpeg(self, pdf_input_path, jpeg_output_path):
    """Render a PDF to JPEG at 300 dpi with 4-bit text anti-aliasing.

    Read access is granted for the directory of ``pdf_input_path`` and write
    access for ``jpeg_output_path``.

    :param pdf_input_path: path of the PDF to read.
    :param jpeg_output_path: path passed to ``--permit-file-write``.
    """
    args = [
        "gs",  # actual value doesn't matter
        "--permit-file-read=" + os.path.dirname(pdf_input_path),
        "--permit-file-write=" + jpeg_output_path,
        "-dNOPAUSE",
        #"-dBATCH",
        "-sDEVICE=jpeg",
        "-dTextAlphaBits=4",
        "-r300",
        #"-sOutputFile=" + jpeg_output_path,
        # NOTE(review): the output pattern is hard-coded and shares one
        # argument with "-o" (embedded space), while write permission is
        # granted for jpeg_output_path -- confirm the intended output target.
        "-o a%03d.jpg",
        pdf_input_path,
    ]
    encoding = locale.getpreferredencoding()
    args = [a.encode(encoding) for a in args]
    try:
        # Leaving the `with` block exits the interpreter ...
        with ghostscript.Ghostscript(*args):
            pass
    finally:
        # ... and only then may the instance be deleted.  Calling cleanup()
        # inside the block deleted the instance before it had been exited.
        ghostscript.cleanup()
def resave_pdf(self):
    """Move the temporary PDF to its final name and report the result.

    The Ghostscript re-save path below is switched off, so the temporary
    file is simply renamed to ``self.abs_output_filename``.  The size of the
    result (in MB) is then shown on the header widget, followed by a hint on
    the detail widget.
    """
    recompress_with_ghostscript = False  # disabled, kept for reference

    if recompress_with_ghostscript:
        quality = {
            0: '/default',
            1: '/prepress',
            2: '/printer',
            3: '/ebook',
            4: '/screen',
        }
        args = [
            'gs',
            '-sDEVICE=pdfwrite',
            '-dCompatibilityLevel=1.4',
            '-dPDFSETTINGS={}'.format(quality[0]),
            '-dNOPAUSE',
            '-dQUIET',
            '-dBATCH',
            '-dColorAccuracy=2',
            '-dProcessColorModel=/DeviceRGB',
            '-sOutputFile={}'.format(self.abs_output_filename),
            self.abs_tmp_output_filename,
        ]
        # '-sDefaultRGBProfile=sRGB_v4_ICC_preference.icc',
        # '-sOutputICCProfile=sRGB_v4_ICC_preference.icc',
        # '-sImageICCProfile=sRGB_v4_ICC_preference.icc',

        # Using python ghostscript module
        codec = locale.getpreferredencoding()
        ghostscript.Ghostscript(*[arg.encode(codec) for arg in args])
        # Calling ghoscript directly
        # subprocess.call(args)

        # Remove original file, called tmp.pdf
        if os.path.exists(self.abs_tmp_output_filename):
            os.remove(self.abs_tmp_output_filename)
    else:
        os.rename(self.abs_tmp_output_filename, self.abs_output_filename)

    size_mb = getsize(self.abs_output_filename) / 1000000.
    self.message_on_header_widget("Created ({:.1f}MB)!".format(size_mb))
    self.message_on_detail_widget("Drag another folder to create a new one.")
def catalogue_to_txt_files(txt_filename_template, pdf_filename) -> None:
    """
    Extract the text of a catalogue PDF with GhostScript's ``txtwrite`` device.

    Note the %d in the template, which means each page becomes a different
    txt file.

    :param txt_filename_template: output filename template handed to ``-o``
    :param pdf_filename: name of the PDF inside ``Paths.PDF_PATH``
    :return: None
    """
    source_pdf = os.path.join(Paths.PDF_PATH, pdf_filename)
    command = (
        "gs",  # name of the command
        "-sDEVICE=txtwrite",  # job type - writing to txt files
        "-o" + txt_filename_template,  # output filename template
        source_pdf,  # input filename
    )
    # args need to be encoded into bytes
    encoded = [part.encode() for part in command]
    # with suppress_stdout():
    ghostscript.Ghostscript(*encoded)