示例#1
0
def get_language(doc_id, path):
    """Guess the dominant language of a document by OCR-ing its first pages.

    The document at ``path`` is rasterised at 300 dpi. Up to six pages are
    written, one at a time, to ``/tmp/temp.png``, OCR-ed with
    ``image_to_string`` and classified with ``langid``. A page that yields
    no text is recorded as ``"*"``.

    :param doc_id: identifier used only in the diagnostic messages.
    :param path: path of the document to analyse.
    :return: the most common per-page label, ``"difficult"`` when no page
        was processed, or ``None`` when any error occurs.
    """
    max_pages = 6
    time_budget = 30  # seconds allowed for the per-page OCR loop
    imagename = '/tmp/temp.png'
    language_list = []
    try:
        with Img(filename=path, resolution=300) as source:
            source.compression_quality = 99
            deadline = time.time() + time_budget
            for page in source.sequence:
                # Check the budget before every page so a slow document is
                # cut short, and visit each page at most once so a short
                # document does not have its pages counted repeatedly.
                if len(language_list) >= max_pages or time.time() > deadline:
                    break
                # Release the per-page image as soon as it is on disk.
                with Img(page) as page_image:
                    page_image.save(filename=imagename)
                text = image_to_string(Image.open(imagename))
                if text.strip():
                    language, _ = langid.classify(text)
                    language_list.append(language)
                else:
                    language_list.append("*")
        if not language_list:
            language_list.append("difficult")
            print("Difficult document: " + doc_id + " - " + path)
        return statistics.mode(language_list)
    except Exception:
        # Best effort: any failure (unreadable file, OCR error, ambiguous
        # mode) is reported and signalled to the caller with None.
        print("Document Error: " + doc_id + " - " + path)
        return None
示例#2
0
def save_pdf_2_png(filename, output_path, resolution=300):
    """Render every page of ``filename`` to its own PNG in ``output_path``.

    Each page is flattened onto a white background (alpha channel removed)
    and written as ``<basename>_<NNN>.png``, where ``NNN`` is the zero-based
    page index left-padded with zeros to three characters.

    :param filename: path of the document to render.
    :param output_path: directory that receives the PNG files.
    :param resolution: rendering resolution passed to Wand.
    :return: None
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    # Open the document in a ``with`` block so it is released even when a
    # page fails to render or save.
    with Img(filename=filename, resolution=resolution) as all_pages:
        for i, page in enumerate(all_pages.sequence):
            with Img(page) as img:
                img.format = 'png'
                img.background_color = Color('white')
                img.alpha_channel = 'remove'

                image_filename = '{}_{:0>3}.png'.format(stem, i)
                img.save(filename=os.path.join(output_path, image_filename))
    return None
示例#3
0
def pdf_to_image(path, resolution=None, **kwargs):
    """Render the document at ``path`` to PNG and return it as an opened image.

    :param path: path of the document to render.
    :param resolution: rendering resolution passed to Wand.
    :param kwargs: extra keyword arguments forwarded to the Wand image
        constructor.
    :return: the object returned by ``Image.open`` on the in-memory PNG data.
    """
    # Only the encoded bytes are needed afterwards, so release the Wand
    # image as soon as the blob has been produced.
    with Img(filename=path, resolution=resolution, **kwargs) as img:
        png_bytes = img.make_blob(format='png')
    # BytesIO accepts the blob directly; no intermediate array is required.
    return Image.open(io.BytesIO(png_bytes))
def convert_pdf_to_png(file_path, file_name):
    """Render ``full_path`` at 300 dpi and save it as a JPEG under ``file_path``.

    The output name is ``file_name`` without its last four characters plus
    ``.jpg``. Despite the function name, the file written is a JPEG.

    NOTE(review): ``full_path`` is not defined inside this function; it is
    presumably a module-level name -- confirm against the rest of the file.

    :param file_path: directory that receives the JPEG.
    :param file_name: name of the source file, used to derive the output name.
    """
    print("Converting pdf to png image for OCR")
    # Drop the trailing four characters (e.g. an extension such as ".pdf").
    stem = file_name[:len(file_name) - 4]
    destination = file_path + '/' + stem + ".jpg"
    with Img(filename=full_path, resolution=300) as rendered:
        rendered.compression_quality = 99
        rendered.save(filename=destination)
示例#5
0
    def save_img_with_wand(self, pdf_name, output):
        """
        Save a PDF with one or more pages as JPG. Called by self.pdf_to_jpg.

        The pages are flattened onto a white background (alpha channel
        removed) before being saved.

        :param pdf_name: path to pdf
        :param output: Filename of temporary jpeg after pdf conversion
        :return: True when the conversion succeeded, False when Wand raised
            a runtime, cache or policy error (the error is logged)
        """
        try:
            with Img(filename=pdf_name, resolution=self.resolution) as pic:
                pic.compression_quality = self.compressionQuality
                pic.background_color = Color("white")
                pic.alpha_channel = 'remove'
                pic.save(filename=output)

        except (wand_except.WandRuntimeError, wand_except.CacheError) as e:
            self.Log.error(e)
            self.Log.error('Exiting program...Fix the issue and restart the service')
            return False
        except wand_except.PolicyError as e:
            self.Log.error(e)
            self.Log.error('Maybe you have to check the PDF rights in ImageMagick policy.xml file')
            self.Log.error('Exiting programm...Fix the issue and restart the service')
            return False
        # Report success explicitly so a truthiness check on the result
        # distinguishes it from the failure paths above.
        return True
示例#6
0
 def pdf2jpg(self, file):
     """Render ``file`` to a JPEG next to it and return the OCR result.

     The JPEG path is ``file`` with its extension replaced by ``.jpg``; the
     text comes from ``self.jpg2text`` called on that path.

     :param file: path of the document to convert.
     :return: whatever ``self.jpg2text`` returns for the JPEG.
     """
     jpg_path = os.path.splitext(file)[0] + '.jpg'
     with Img(filename=file, resolution=300) as rendered:
         rendered.compression_quality = 99
         rendered.save(filename=jpg_path)
         # OCR is performed before the Wand image is released.
         return self.jpg2text(jpg_path)
示例#7
0
	def convert(self, filename, temp_folder):
		"""Render ``filename`` at 300 dpi to ``<temp_folder>/sample_scan.jpg``.

		:param filename: path of the document to convert; also logged.
		:param temp_folder: directory that receives ``sample_scan.jpg``.
		"""
		logging.info('{0} Created'.format(filename))
		# Wand is imported lazily, only when a conversion is requested.
		from wand.image import Image as Img
		target = temp_folder+'/sample_scan.jpg'
		with Img(filename=filename, resolution=300) as scan:
			scan.compression_quality = 99
			scan.save(filename=target)
		del Img
示例#8
0
def testwand():
    """Open ``data/poetry.png`` with Wand and print its attributes and width."""
    # Opening the image in a ``with`` block means it does not have to be
    # closed explicitly afterwards.
    with Img(filename='data/poetry.png') as picture:
        attributes = picture.__dict__
        print(attributes)
        print()

        print('width =', picture.width)
示例#9
0
    def crop_image_footer(self,
                          img,
                          isTiff=False,
                          lastImage=False,
                          outputName=None):
        """
        Crop an image to its lower part and save the result.

        The number of rows removed, ``int(height / 3 + height * 0.1)``, is
        stored in ``self.heightRatio``; the remaining rows are kept with
        ``gravity='south'``. Policy and cache errors raised by Wand are
        logged and swallowed.

        NOTE(review): in the non-TIFF flow the footer is written to
        ``outputName`` (when given) AND to the instance's JPG footer path,
        whereas the TIFF flow writes to exactly one target -- confirm that
        this asymmetry is intended.

        :param img: path of the source image (used only when ``isTiff`` is false)
        :param isTiff: use the instance's TIFF paths instead of ``img``
        :param lastImage: select the "last page" variants of the instance paths
        :param outputName: explicit output path; in the TIFF flow it is also
            the file that is read
        """
        try:
            if isTiff:
                # The TIFF flow ignores ``img`` and reads one of these paths.
                if outputName:
                    source = outputName
                elif lastImage:
                    source = self.tiffName_last
                else:
                    source = self.tiffName
            else:
                source = img

            with Img(filename=source, resolution=self.resolution) as pic:
                pic.compression_quality = self.compressionQuality
                pic.background_color = Color("white")
                pic.alpha_channel = 'remove'
                self.heightRatio = int(pic.height / 3 + pic.height * 0.1)
                pic.crop(width=pic.width,
                         height=int(pic.height - self.heightRatio),
                         gravity='south')

                if isTiff:
                    if outputName:
                        destinations = [outputName]
                    elif lastImage:
                        destinations = [self.tiffName_last_footer]
                    else:
                        destinations = [self.tiffName_footer]
                else:
                    destinations = [outputName] if outputName else []
                    destinations.append(self.jpgName_last_footer if lastImage
                                        else self.jpgName_footer)

                for destination in destinations:
                    pic.save(filename=destination)
        except (PolicyError, CacheError) as e:
            self.Log.error('Error during WAND conversion : ' + str(e))
示例#10
0
def _show_message(text, size):
    """Open a small themed window that displays ``text`` at font size ``size``."""
    message_open = tk.Tk()
    message_open.geometry("500x100+650+450")
    message_open.title('Message')
    style = ThemedStyle(message_open)
    style.set_theme("arc")
    msg = tk.Frame(message_open)
    msg.pack()
    content = ttk.Label(msg, text=text, font=("Helvetica", size))
    content.pack()


def method_three():
    """Ask for a PDF, OCR one of its pages and save the text as a CSV file.

    The chosen PDF is split into one-page PDFs inside
    ``<home>/<user_folder>/page_<user_page>``. The requested page is rendered
    to JPEG at 400 dpi and OCR-ed in English, and the text (spaces replaced
    by commas) is written to ``page_<user_page>.csv``. The intermediate PDF
    and JPEG files are then removed. A message window reports success or
    failure. Nothing happens when the file dialog is cancelled.
    """
    import glob

    file = askopenfile(mode='r')
    if file is None:
        return
    try:
        # Only the path of the selected file is needed; close the handle.
        with file:
            document = file.name

        # Create one PDF file per page.
        page_number = user_page.get()
        folder = home + '/' + user_folder.get() + '/page_' + page_number
        os.makedirs(folder, exist_ok=True)  # fine if the folder already exists
        with open(document, "rb") as source:
            inputpdf = PdfFileReader(source)
            for i in range(1, inputpdf.numPages + 1):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i - 1))
                pdf_file = folder + '/' + "document-page%s.pdf" % i
                with open(pdf_file, "wb") as outputStream:
                    output.write(outputStream)

        # Turn the requested page into an image and read it as a string.
        page_file = folder + '/' + "document-page%s.pdf" % page_number
        # Swap only the extension; a plain text replace would also rewrite
        # any "pdf" that appears in the folder part of the path.
        img_file = os.path.splitext(page_file)[0] + '.jpg'
        with Img(filename=page_file, resolution=400) as img:
            img.compression_quality = 99
            img.save(filename=img_file)
        with Image.open(img_file) as im:
            text = pytesseract.image_to_string(im, lang='eng')

        # Save the CSV file.
        text = text.replace(' ', ',')
        csv_file = folder + '/' + 'page_{}.csv'.format(page_number)
        with open(csv_file, 'w') as h:
            h.write(text)

        # Drop the intermediate files without going through a shell, since
        # the folder name comes from user input.
        for pattern in ('*pdf', '*jpg'):
            for leftover in glob.glob(
                    os.path.join(glob.escape(folder), pattern)):
                if os.path.isfile(leftover):
                    os.remove(leftover)

        _show_message("Done!", "50")
    except Exception:
        _show_message("The tabulation fails. \nPlease, check the log file",
                      "16")
示例#11
0
def pdf_to_jpg(path, pdf_name, image_name):
    """Render ``path + pdf_name`` to JPEG and return the PDF's page count.

    The page count comes from ``get_num_of_pages``; the image is rendered
    at 300 dpi and saved as ``path + image_name``.

    :param path: directory prefix, concatenated as-is with both file names.
    :param pdf_name: name of the PDF inside ``path``.
    :param image_name: name of the JPEG to write inside ``path``.
    :return: the value returned by ``get_num_of_pages``.
    :raises SystemExit: when counting the pages or converting the file fails.
    """
    try:
        source = path + pdf_name
        noPages = get_num_of_pages(source)
        with Img(filename=source, resolution=300) as img:
            img.compression_quality = 99
            img.save(filename=path + image_name)
        return noPages
    except Exception:
        # Only real errors are converted; KeyboardInterrupt and SystemExit
        # pass through untouched.
        sys.exit("Failure in processing your file")
示例#12
0
def pdfToImage(input_path, output):
    """Render ``input_path`` at 300 dpi, convert it to PNG and save it.

    The image is flattened onto a white background (alpha channel removed)
    before the conversion.

    :param input_path: path of the document to render.
    :param output: path the PNG is written to.
    :return: ``output``.
    """
    with Img(filename=input_path, resolution=300) as rendered:
        # Flatten onto white before converting.
        rendered.background_color = Color('white')
        rendered.alpha_channel = 'remove'

        with rendered.convert('png') as png_image:
            png_image.save(filename=output)

    return output
示例#13
0
 def __init__(self, infile, outfile, threshold):
     """Render ``infile`` to ``temp.png`` and load it as an RGBA image.

     :param infile: path of the document to render (100 dpi).
     :param outfile: stored unchanged in ``self.outfile``.
     :param threshold: stored unchanged in ``self.threshold``.
     """
     self.outfile = outfile
     self.threshold = threshold
     self.black = (0, 0, 0)

     # Go through an intermediate PNG in the current working directory.
     with Img(filename=infile, resolution=100) as rendered:
         rendered.compression_quality = 100
         rendered.format = 'png'
         rendered.save(filename="temp.png")

     loaded = Image.open("temp.png")
     self.img = loaded.convert("RGBA")
     self.data = self.img.getdata()
示例#14
0
def pdf_to_img(filename, index):
    """Render ``filename`` at 300 dpi and save it as ``./new_data/5/<index>.jpg``.

    :param filename: path of the document to render.
    :param index: value used as the output file's base name.
    """
    # The output folder is fixed here.
    target = './new_data/5/{}.jpg'.format(str(index))
    with Img(filename=filename, resolution=300) as rendered:
        rendered.compression_quality = 50
        rendered.save(filename=target)
示例#15
0
def ocr_pdf_file(pdf_path):
    """OCR every page of a PDF (Portuguese) and return the cleaned text.

    The PDF's directory is passed to ``clear_dir``, each page is rendered
    at 300 dpi to ``<name>-NNNN.jpg`` in that directory, every ``*.jpg``
    found there is converted to greyscale in place and OCR-ed in page
    order, and the concatenated text is passed through
    ``text_preprocessing.clear_pdf_rtf``.

    NOTE(review): ``clear_dir`` is called on the directory that contains
    the PDF itself -- confirm what it removes.

    :param pdf_path: path of the PDF, using forward slashes.
    :return: the text returned by ``text_preprocessing.clear_pdf_rtf``.
    """
    directory, separator, base_name = pdf_path.rpartition("/")
    # A bare file name has no directory part; work in the current directory
    # rather than letting the prefix collapse to the filesystem root.
    img_path = directory + "/" if separator else "./"
    stem = base_name.replace(".pdf", "")

    clear_dir(img_path)

    with Img(filename=pdf_path, resolution=300) as img:
        for i, image in enumerate(img.sequence):
            img_id = str(i + 1).zfill(4)
            # Release each per-page image once it has been written.
            with Img(image) as page:
                page.save(filename=img_path + stem + "-" + img_id + '.jpg')

    # glob returns names in arbitrary order; the zero-padded page ids make
    # a plain sort follow the page order.
    files = sorted(glob.glob(img_path + "*.jpg"))

    pages = []
    for image_file in files:
        with Image.open(image_file) as colour:
            grey = colour.convert('L')
        grey.save(image_file)

        # To use the Portuguese language it was necessary to download the
        # trained model from:
        # https://github.com/tesseract-ocr/tessdata_best/blob/master/por.traineddata
        # and move it to /usr/share/tesseract-ocr/4.00/tessdata/
        with Image.open(image_file) as grey_page:
            read = pytesseract.image_to_string(grey_page,
                                               lang='por',
                                               config=tessdata_dir_config)
        pages.append(read + "\n")

    return text_preprocessing.clear_pdf_rtf("".join(pages))
示例#16
0
def converter_pdf(path_source,pdffilename,path_destination,imgfilename,dpi):
    """Render a PDF to an image file and return the image's path.

    Nothing is converted when ``pdffilename`` does not contain ``"pdf"``.

    :param path_source: directory prefix of the PDF, concatenated as-is.
    :param pdffilename: name of the PDF inside ``path_source``.
    :param path_destination: directory prefix of the image, concatenated as-is.
    :param imgfilename: name of the image to write.
    :param dpi: rendering resolution passed to Wand.
    :return: ``path_destination + imgfilename`` after a conversion, or
        ``None`` when the input was skipped.
    """
    # Skip anything that does not look like a PDF and say so explicitly,
    # instead of falling through to a return of an unbound name.
    if "pdf" not in pdffilename:
        return None

    filename = path_source + pdffilename
    image_name = path_destination + imgfilename

    with Img(filename=filename, resolution=dpi) as img:
        img.compression_quality = 99
        img.save(filename=image_name)
    return image_name
示例#17
0
 def save_img_with_wand(self, pdfName, output):
     """Render ``pdfName`` with Wand and save it to ``output``.

     The image is flattened onto a white background (alpha channel removed).
     Policy and cache errors raised by Wand are logged and swallowed.

     :param pdfName: path of the PDF to render.
     :param output: path the rendered image is written to.
     """
     try:
         with Img(filename=pdfName, resolution=self.resolution) as document:
             library.MagickResetIterator(document.wand)
             # Start the counter used in output file names at 1 instead of 0.
             document.scene = 1
             document.compression_quality = self.compressionQuality
             document.background_color = Color("white")
             document.alpha_channel = 'remove'
             document.save(filename=output)
     except (PolicyError, CacheError) as error:
         self.Log.error('Error during WAND conversion : ' + str(error))
示例#18
0
    def convertirPDFenJPG(self, ruta):
        """
        Turn a PDF into a series of images, one per page of the PDF.

        The PDF is read from ``MEDIA_URL + ruta`` at 150 dpi and saved under
        ``IMAGENES_PATH`` with the base name ``image_name.jpg``.

        :param ruta: location of the PDF, relative to ``MEDIA_URL``
        :return: None; the JPG files are written to disk
        """
        print('*******************************************')

        origen = MEDIA_URL+ruta
        destino = IMAGENES_PATH+'image_name.jpg'
        with Img(filename=origen, resolution=150) as documento:
            documento.save(filename=destino)
示例#19
0
def convert_pdf(fichier):
    """
    Convert a PDF file into an image file.

    :param fichier: <str> name of the PDF file
    :return: <str> name of the image file
    """
    # The same output name is reused when several files are processed in a row.
    image = 'image_name.jpg'
    with Img(filename=fichier, resolution=300) as rendu:
        rendu.compression_quality = 99
        rendu.save(filename=image)
    return image
示例#20
0
def processaPDF():
    """OCR ``processar.pdf`` from the working directory and return its text.

    The PDF is rendered at 300 dpi to ``processar.jpg``, the JPEG is OCR-ed,
    and both files are removed afterwards.

    :return: the recognised text, or ``''`` when any step fails.
    """
    try:
        with Img(filename='processar.pdf', resolution=300) as img:
            img.compression_quality = 99
            with open('processar.jpg', 'wb') as f:
                img.save(file=f)
        text = pytesseract.image_to_string('processar.jpg')
        os.remove('processar.jpg')
        os.remove('processar.pdf')
    except Exception:
        # Best effort: a failed conversion yields empty text, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        text = ''
    return text
示例#21
0
    def pyTesseract(self):
        """Render ``filename`` to JPEG page images and return their OCR text.

        NOTE(review): ``filename`` is not defined inside this method; it is
        presumably a module-level name -- confirm against the rest of the file.

        :return: the text of all pages joined with single spaces.
        """
        # ImageMagick must be installed for Wand to convert PDFs to images.
        from wand.image import Image as Img
        with Img(filename=filename, resolution=300) as img:
            img.compression_quality = 99
            img.save(filename='image_name.jpg')
            numPages = len(img.sequence)

        # A single-page document is saved under the plain name; only
        # multi-page documents get the "-<index>" suffix.
        if numPages == 1:
            page_files = ['image_name.jpg']
        else:
            page_files = ['image_name-{i}.jpg'.format(i=i)
                          for i in range(numPages)]

        data = [pt.image_to_string(page_file) for page_file in page_files]
        return ' '.join(map(str, data))
def image_to_text(file_name):
    """Render ``file_name`` to JPEG page images, OCR them and return the text.

    The images are written next to ``file_name`` (its last four characters
    replaced by ``.jpg``, with a ``-<index>`` suffix per page for multi-page
    documents), OCR-ed in page order and deleted afterwards. The first 100
    characters of the result are printed.

    :param file_name: path of the document to convert.
    :return: the concatenated text of all pages.
    """
    import glob
    import re

    base = file_name[:-4]
    with Img(filename=file_name, resolution=300) as img:
        img.compression_quality = 99
        img.save(filename=base + '.jpg')

    # Pick up only the files produced above ("<base>.jpg" or
    # "<base>-<n>.jpg"), wherever they were written, so unrelated JPEGs are
    # neither OCR-ed nor deleted.
    produced = re.compile(
        re.escape(os.path.basename(base)) + r'(?:-(\d+))?\.jpg')
    pages = []
    for path in glob.glob(glob.escape(base) + '*.jpg'):
        match = produced.fullmatch(os.path.basename(path))
        if match:
            pages.append((int(match.group(1) or 0), path))
    # Sort numerically so that page 10 comes after page 2.
    pages.sort()

    parts = []
    for _, path in pages:
        # Close the image before removing the file.
        with Image.open(path) as page:
            parts.append(pytesseract.image_to_string(page))
        os.remove(path)
    text = ''.join(parts)

    print(text[0:100])
    return text
示例#23
0
def converter_pdf(path_source, pdffilename, path_destination, imgfilename,
                  dpi):
    """Render a PDF below the working directory to an image file.

    Both paths are built as ``<cwd>/<directory prefix><file name>``.

    :param path_source: directory prefix of the PDF, relative to the cwd.
    :param pdffilename: name of the PDF.
    :param path_destination: directory prefix of the image, relative to the cwd.
    :param imgfilename: name of the image to write.
    :param dpi: rendering resolution passed to Wand.
    :return: the absolute path of the image that was written.
    """
    source_pdf = os.getcwd() + "/" + path_source + pdffilename
    image_name = os.getcwd() + "/" + path_destination + imgfilename

    with Img(filename=source_pdf, resolution=dpi) as rendered:
        # Drop the alpha channel, then set a white background.
        rendered.alpha_channel = 'remove'
        rendered.background_color = wandimg.Color('white')
        rendered.compression_quality = 99

        rendered.save(filename=image_name)
    return image_name
示例#24
0
def cvt_pdf2img(fileName):
    """Create the output folders for ``fileName`` and render the input PDF.

    ``<outputPath><fileName>/images`` and ``<outputPath><fileName>/OCRtext``
    are created when missing, then ``inputPath + inputFile`` is rendered at
    300 dpi and saved as ``page.jpg`` in the images folder.

    NOTE(review): the PDF that is read comes from the module-level
    ``inputPath``/``inputFile``, not from ``fileName`` -- confirm that this
    is intended.

    :param fileName: name used for the per-document output folder.
    """
    # Create the output image directory. ``exist_ok`` avoids the race
    # between checking for the folder and creating it.
    outputImage = outputPath + fileName + "/images"
    os.makedirs(outputImage, exist_ok=True)
    # Create the output OCR text directory.
    outputOCRtext = outputPath + fileName + "/OCRtext"
    os.makedirs(outputOCRtext, exist_ok=True)
    # Convert the PDF to an image stored in the output image directory.
    with Img(filename=inputPath + inputFile, resolution=300) as cvt2img:
        cvt2img.compression_quality = 99
        cvt2img.save(filename=outputImage + "/page.jpg")
示例#25
0
def pdfToJpegMultipleFiles(inputFile, outputFile):
    """Render the PDF ``inputFile`` and save it as ``outputFile``.

    :param inputFile: path of the PDF to render.
    :param outputFile: path passed to Wand's ``save``.
    :raises RuntimeError: when the input is not a PDF, or when Wand raises
        ``DelegateError`` while handling it.
    """
    try:
        with Img(filename=inputFile, resolution=IMAGE_RESOLUTION) as document:
            is_pdf = document.format == 'PDF'
            if not is_pdf:
                raise RuntimeError('Input file is not a PDF.')

            document.compression_quality = COMPRESSION_QUALITY

            document.save(filename=outputFile)

    # With a corrupt or empty PDF, freeing the image object is attempted
    # when the program exits, and that fails because the object is empty.
    # It cannot be caught, since it happens while the runtime is shutting
    # down, so an ugly stack trace appears on quit. Deleting the object
    # beforehand fails as well, after which the cleanup is attempted again
    # automatically on exit.
    except DelegateError:
        raise RuntimeError('PDF file appears corrupted.')
示例#26
0
def convertPDFToOCR(path):
    """Render a PDF to ``sample_scan*.jpg`` page images and return their OCR text.

    Existing ``sample_scan*`` files in the working directory are removed
    first. A single-page PDF is saved as ``sample_scan.jpg``; a multi-page
    PDF as ``sample_scan-<index>.jpg``, one file per page.

    NOTE(review): ``compression_quality`` is set to 0 here -- confirm that
    the lowest quality setting is intended for OCR input.

    :param path: path of the PDF to read.
    :return: the concatenated text of all pages.
    """
    os.system("rm sample_scan*")
    # Only the page count is needed from the reader; close the file right
    # after reading it.
    with open(path, 'rb') as pdfObject:
        pages = PyPDF2.PdfFileReader(pdfObject).numPages
    with Img(filename=path, resolution=300) as img:
        img.compression_quality = 0
        img.save(filename="sample_scan.jpg")

    if pages == 1:
        page_files = ['sample_scan.jpg']
    else:
        page_files = ['sample_scan-' + str(i) + '.jpg' for i in range(pages)]

    parts = []
    for page_file in page_files:
        with Image.open(page_file) as page:
            parts.append(pytesseract.image_to_string(page))
    return "".join(parts)
示例#27
0
def apply_ocr(pdf_path_file):
    """Render a PDF to JPEG page images, OCR them and return the text.

    The pages are written under ``../assets/img/`` with the base name
    ``TEMPS-IMG.jpg``. Every file below that directory whose name contains
    ``.jpg`` is then OCR-ed and deleted.

    :param pdf_path_file: path of the PDF to read.
    :return: the concatenated text of all images that were processed.
    """
    import re

    path_img = '../assets/img/TEMPS-IMG.jpg'
    path_dir_img = '../assets/img/'

    def page_order(name):
        # Compare digit runs numerically so "-10" sorts after "-2".
        return [int(token) if token.isdigit() else token
                for token in re.split(r'(\d+)', name)]

    # Convert pdf to jpg
    with Img(filename=pdf_path_file, resolution=300) as img:
        img.compression_quality = 99
        img.save(filename=path_img)

    # Apply OCR on it
    chunks = []
    for root, _dirs, names in os.walk(path_dir_img):
        for name in sorted(names, key=page_order):
            if '.jpg' in name:
                # Join with the directory actually being walked so files
                # found in subdirectories resolve to the right path.
                image_path = os.path.join(root, name)
                with Image.open(image_path) as page:
                    chunks.append(pytesseract.image_to_string(page))
                os.remove(image_path)

    return ''.join(chunks)
示例#28
0
    def convert_scanned_pdf_to_png(self):
        """
        Convert the scanned PDF in ``self.file`` to a PNG in the temp folder.

        The PDF is rendered at resolution 300 with compression quality 99.
        The PNG path (``ParseScannedPdf.temp_folder`` plus the PDF's base
        name with ``.pdf`` replaced by ``.png``) is stored in
        ``self.img_file`` and ``self.type_pdf`` is set to True.

        :return: None; the PNG is written to ``self.img_file``
        """
        # The ``with`` block releases the Wand image on every exit path,
        # including the ``sys.exit()`` below.
        with Img(filename=self.file, resolution=300) as img:  ### Opens scanned pdf file
            img.compression_quality = 99  ### Sets compression quality to 99

            ### Reassign self.img_file according to self.file
            try:
                self.img_file = self.file.split('/')[-1]
                self.img_file = self.img_file.replace('.pdf', '.png')
                self.img_file = os.path.join(ParseScannedPdf.temp_folder, self.img_file)
            except Exception as e:
                # Single-argument print calls behave the same on Python 2 and 3.
                print('Be sure to use forward slash when assigning the path of the pdf or img file.')
                print(e.args)
                sys.exit()

            self.type_pdf = True
            img.save(filename=self.img_file)  ### Converts the file to png
        # NOTE(review): presumably gives the file time to land on disk -- confirm.
        time.sleep(2)
示例#29
0
def PDFtoJPG(PDFpath, saveAs="[inputfilename].jpg"):
    """Save a JPG rendered from a PDF and return the JPG's path as a string.

    Multipage PDFs are split into multiple JPGs with the page number
    appended in the format "-#" (starting at zero, no leading zeros).

    :param PDFpath: path of the PDF; converted with ``str``.
    :param saveAs: output path; ``.jpg`` is appended when missing. The
        default places the JPG next to the PDF, with the PDF's extension
        replaced by ``.jpg``.
    :return: the output path, or ``None`` when the conversion fails.
    """
    import os
    # Requires ImageMagick, which in turn requires Ghostscript and path setup.
    from wand.image import Image as Img

    # convert potential paths into strings
    saveAs = str(saveAs)
    PDFpath = str(PDFpath)

    if saveAs == "[inputfilename].jpg":
        # Swap only the extension, so a ".pdf" elsewhere in the path is left
        # alone and an input without a ".pdf" suffix is never overwritten.
        target = os.path.splitext(PDFpath)[0] + ".jpg"
    elif saveAs.endswith(".jpg"):
        target = saveAs
    else:
        target = saveAs + ".jpg"

    try:
        with Img(filename=PDFpath, resolution=300) as pdf:
            pdf.compression_quality = 99
            # ``convert`` returns a new image; release it as well.
            with pdf.convert("jpg") as jpg:
                jpg.save(filename=target)
        return target
    except Exception:
        print("PDF file for jpg conversion not found at specified location:",
              PDFpath)
        return None
示例#30
0
def parse_contents(contents, filename, date):
    """Build the page fragment that shows an uploaded file and its OCR text.

    ``filename`` is rendered at 300 dpi to ``uploaded_file.jpg`` and OCR-ed
    in English.

    :param contents: upload contents; used as the image source, and its
        first 200 characters are shown as raw content.
    :param filename: path of the uploaded file; also shown as the heading.
    :param date: timestamp passed to ``datetime.datetime.fromtimestamp``.
    :return: an ``html.Div`` holding the heading, date, image, OCR text and
        raw-content preview.
    """
    with Img(filename=filename, resolution=300) as rendered:
        rendered.compression_quality = 99
        rendered.save(filename='uploaded_file.jpg')
        ocr_source = Image.open("uploaded_file.jpg")
        text = pytesseract.image_to_string(ocr_source, lang='eng')

    children = [
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),

        # HTML images accept base64-encoded strings in the same format
        # that the upload supplies.
        html.Img(src=contents),
        html.Hr(),
        html.H5("The OCR read words are below"),
        html.Div(text),
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        }),
    ]
    return html.Div(children)