Python convert_pdf2jpg 예제들, pdf2jpg.pdf2jpg.convert_pdf2jpg Python 예제들

예제 #1

0

파일 보기

    def extract_text(self):
        """OCR every page of ``self.filename`` and store sentences in ``self.english``.

        The PDF is rendered to JPEG pages under ``self.image_out_path``, each
        page is passed through Tesseract, and the combined text is split on
        ``"."`` into a list of stripped strings assigned to ``self.english``.
        ``self.image_out_path`` is deleted afterwards, even when a step fails.
        """
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs(self.image_out_path, exist_ok=True)

        def page_order(name):
            # Page images appear to be named "<page>_<pdf name>.jpg"; order by
            # the numeric prefix when there is one so page 10 follows page 9.
            prefix = name.partition("_")[0]
            if prefix.isdecimal():
                return (0, int(prefix), name)
            return (1, 0, name)

        try:
            pdf2jpg.convert_pdf2jpg(self.filename, self.image_out_path, dpi=300)
            print("Successfully saved images for each page for {}".format(
                self.image_out_path))

            # The page images are read back from "<pdf basename>_dir".
            pages_dir = os.path.join(
                self.image_out_path,
                os.path.basename(self.filename) + "_dir")

            english_text = []
            # os.listdir() returns entries in arbitrary order; sort them so
            # the extracted text follows the page order of the document.
            for filename in sorted(os.listdir(pages_dir), key=page_order):
                if not filename.endswith("jpg"):
                    continue
                with Image.open(os.path.join(pages_dir, filename)) as page:
                    text = str(pytesseract.image_to_string(page))
                # Re-join words that were hyphenated across a line break.
                english_text.append(text.replace('-\n', ''))

            corpus = " ".join(english_text)
            corpus = re.sub(r'\n+', '\n', corpus).strip()
            self.english = [part.strip() for part in corpus.split(".")]
            print("English Text Extracted is : {}".format(self.english))
        finally:
            # The rendered images are temporary; never leave them behind.
            shutil.rmtree(self.image_out_path)

예제 #2

0

파일 보기

파일: pdf.py 프로젝트: LucasVidigal98/Crawler-Enade

def convert_pdf(content_dict):
    """Convert the exam PDF of every key in ``content_dict`` to JPEG images.

    For each key the file ``Pdfs/Prova <key>.pdf`` is looked up; when it
    exists, all of its pages are rendered into the ``Images/`` directory.
    Missing files and conversion failures are reported on stdout and the
    loop moves on to the next key.

    Args:
        content_dict: mapping whose keys identify the exams to convert.
    """
    os.makedirs('Images', exist_ok=True)

    for key in content_dict:
        # Build the path once, always through str(), so non-string keys work
        # for both the existence check and the conversion.
        pdf_name = 'Pdfs/Prova ' + str(key) + '.pdf'

        # Check that the exam file exists.
        if not os.path.isfile(pdf_name):
            print('Erro ao converter ' + str(key))
            continue

        try:
            pdf2jpg.convert_pdf2jpg(pdf_name, 'Images/', pages='ALL')
            print('Convertendo Prova ' + str(key))
        except Exception:
            # Best effort: one broken PDF must not stop the batch.
            print('Erro ao converter Prova ' + str(key))

예제 #3

0

파일 보기

파일: pdf_utility.py 프로젝트: nareshrasamalla/python

def read_pdfpage_as_image(pdf, directory, cert_no):
    """Render ``pdf`` to JPEG pages and move them into ``directory``.

    The pages are converted at 150 dpi, then each image found in
    ``<directory>/<pdf basename>_dir`` is renamed to
    ``<directory>/<cert_no>_<random 8-digit number>[_<index>].jpg``; the
    ``_<index>`` part is only added when there is more than one image.
    The intermediate folder is removed at the end.

    Args:
        pdf: path of the PDF to convert.
        directory: folder that receives the renamed images.
        cert_no: string prefix used in the target file names.
    """
    tgt_img_dir = os.path.basename(pdf) + "_dir"

    pdf2jpg.convert_pdf2jpg(pdf, directory, dpi=150, pages="ALL")

    tgt_folder = directory + "/" + tgt_img_dir

    # List the folder once so the page count and the iteration always agree.
    img_list = os.listdir(tgt_folder)
    multi_page = len(img_list) > 1

    for index, fname in enumerate(img_list, start=1):
        src_file = tgt_folder + "/" + fname

        # A fresh random number is drawn for every image.
        random_number = random.randint(10000000, 99999999)
        tgt_path = directory + "/" + cert_no + "_" + str(random_number)

        if multi_page:
            tgt_path += "_" + str(index)
        tgt_path += ".jpg"

        file.rename_file(src_file, tgt_path)

    file.remove_directory(tgt_folder)

예제 #4

0

파일 보기

파일: pdf_line_bb.py 프로젝트: Sayantanmukherjee6/Visual-analysis-of-PDF-documents

def get_page_image_from_pdf(page_num, image_file_name, pdf_file_path):
	"""
	Render one PDF page to an image, save a copy and return it in grayscale.

	The page is converted into the "images/" folder, read back in colour,
	written to `image_file_name`, and that file is finally re-read with
	flag 0.
	"""
	page_label = str(page_num)
	# Convert only the requested page.
	pdf2jpg.convert_pdf2jpg(pdf_file_path, "images/", pages=page_label)
	# NOTE(review): this path assumes the converter writes to
	# "images/<pdf_file_path>/<page>_<pdf_file_path>.jpg" - confirm against
	# the installed pdf2jpg version.
	rendered_path = (
		"images/" + pdf_file_path + "/" + page_label + "_" + pdf_file_path + ".jpg"
	)
	colour_page = cv2.imread(rendered_path, cv2.IMREAD_COLOR)
	cv2.imwrite(image_file_name, colour_page)
	return cv2.imread(image_file_name, 0)

예제 #5

0

파일 보기

파일: Tkinter last Final OCR with GUI.py 프로젝트: vermavinay982/OCR-with-GUI

def ocrfinal():
    """Convert the selected PDF to images and OCR them into ``textdata``.

    Reads the module-level ``inputpath``, ``outputpath`` and ``langs``,
    reports progress on ``btn1`` and stdout, and appends the text recognised
    on every image to the module-level ``textdata``.
    """
    global textdata
    global inputpath
    global outputpath
    global langs

    print("Started OCRING")
    btn1.configure(text="Started OCRING")

    # Raw string: "\P" and "\T" are invalid escape sequences in a normal
    # literal (the resulting value is unchanged).
    # NOTE(review): presumably this should point at the tesseract executable
    # itself - confirm against how pytesseract is imported in this file.
    pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR"
    btn1.configure(text="pdf to images")
    print("pdf to images")
    pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL")
    # Only the top level of the image folder is inspected.
    path, _dirs, files = next(os.walk("PDFIMAGES/" + inputpath))
    btn1.configure(text="image to string")
    print("image to string")
    for image_name in files:
        btn1.configure(text=image_name)
        print(image_name)
        textdata = textdata + pytesseract.image_to_string(
            path + "/" + image_name, lang=str(langs[selected.get()]))

    # Writing the files in txt format.

    print("Saving the File")

예제 #6

0

파일 보기

파일: utility.py 프로젝트: PP-HashInclude/assessmentplatform

def convert_qpaper(inputfolder,
                   inputfilename,
                   outputfolder,
                   qpaperfoldername=""):
    """Render a question-paper PDF to JPEGs and list the produced images.

    Args:
        inputfolder: folder containing the PDF.
        inputfilename: file name of the PDF inside ``inputfolder``.
        outputfolder: folder handed to the converter as output location.
        qpaperfoldername: optional prefix folder for the returned paths.

    Returns:
        A list of ``(inputfilename, relative_jpg_path)`` tuples, one per
        generated image, where the path is
        ``[qpaperfoldername/]<output folder name>/<jpg file name>``.
    """
    source_pdf = os.path.join(inputfolder, inputfilename)
    result = pdf2jpg.convert_pdf2jpg(
        source_pdf, outputfolder, dpi=300, pages="ALL")
    print(result)

    # Only file and folder names are returned, never the full paths.
    prefix = qpaperfoldername
    if len(prefix) > 0:
        prefix = prefix + "/"

    conversion = result[0]
    jpg_folder = prefix + os.path.basename(conversion['output_pdfpath'])
    return [
        (inputfilename, jpg_folder + "/" + os.path.basename(jpg_path))
        for jpg_path in conversion['output_jpgfiles']
    ]

예제 #7

0

파일 보기

def _save_data_slovar(data_slovar):
    """Write the whole database dictionary back to ``data_slovar.json``."""
    with open("data_slovar.json", 'w', encoding='UTF-8') as q:
        json.dump(data_slovar, q)


def askfail():
    """Ask for a PDF, OCR its pages in Russian and store the text in the JSON database.

    The database ``data_slovar.json`` maps a PDF name to its recognised
    text. If the chosen PDF is already present, the freshly rendered images
    are discarded and an info box is shown. Otherwise every page image is
    OCRed, the texts are joined with single spaces and stored under the PDF
    name. The JSON file is rewritten only once the new content is known, so
    a failure during conversion or OCR no longer wipes the existing data.
    """
    try:
        with open("data_slovar.json", 'r', encoding='UTF-8') as q:
            data_slovar = json.load(q)
        print ("json был , скачаны файлы,  json обнулен и создан заново")
    except FileNotFoundError:
        data_slovar = {}
        print ("json не было ,json создан заново")

    try:
        os.mkdir("dump_for_jpg")
    except OSError:
        print ("Директория есть?")

    # MAIN PART OF THE PROGRAM STARTS HERE
    directory_of_pdf_fail = filedialog.askopenfilename()

    outputpath = os.getcwd() + '//' + 'dump_for_jpg'
    pdf2jpg.convert_pdf2jpg(directory_of_pdf_fail, outputpath, pages="ALL")

    # The first entry of the dump folder is taken as the freshly created
    # image folder; its last four characters are dropped to form the key.
    name_of_folder_with_jpg = os.listdir(outputpath)[0]
    folder_with_jpg = outputpath + '//' + name_of_folder_with_jpg
    pdf_key = name_of_folder_with_jpg[:-4]

    if pdf_key in data_slovar:
        _save_data_slovar(data_slovar)
        shutil.rmtree(folder_with_jpg, ignore_errors=False, onerror=None)
        messagebox.showinfo("Опа-па", "PDF фаил с таким именем уже присутствует в базе данных")
        return

    print("А ВЫХОДА ТО НЕ БЫЛО!!!")
    list_of_names_jpg = os.listdir(folder_with_jpg)

    print(list_of_names_jpg)
    print(folder_with_jpg)
    page_texts = []
    for jpg_name in list_of_names_jpg:
        input_path_of_jpg_to_ocr = folder_with_jpg + '//' + jpg_name
        with Image.open(input_path_of_jpg_to_ocr) as page:
            page_texts.append(pytesseract.image_to_string(page, lang='rus'))

    # The key is known to be absent here, so the stored value is simply all
    # page texts joined by a space (nothing is stored when there are no pages).
    if page_texts:
        data_slovar[pdf_key] = ' '.join(page_texts)

    # Remove the rendered images once they have been OCRed.
    shutil.rmtree(folder_with_jpg, ignore_errors=False, onerror=None)

    _save_data_slovar(data_slovar)
    print("Представление окончено")

예제 #8

0

파일 보기

def getImageOfPdf(inputpath):
    """Render every page of the PDF at ``inputpath`` and return the image folder.

    The folder is obtained from ``makedirectory`` called with the directory
    that contains the PDF; the conversion result is printed.
    """
    from pdf2jpg import pdf2jpg

    parent_dir = os.path.split(inputpath)[0]
    outputimagePath = makedirectory(parent_dir)
    # Convert all pages into the prepared folder.
    conversion = pdf2jpg.convert_pdf2jpg(
        inputpath, outputimagePath, pages="ALL")
    print(conversion)
    return outputimagePath

예제 #9

0

파일 보기

def extract_multipage_data(key, image):
    """Run ``extract_data`` on every page of a PDF blob and merge the results.

    The bytes in ``image`` are written to ``<tmp>/<key>.blob``, rendered to
    JPEG pages at 300 dpi inside ``<tmp>/<key>``, and each page is passed to
    ``extract_data``. The per-page PDFs are merged into a single document.

    Args:
        key: name used for the temporary file and folder.
        image: raw bytes of the PDF document.

    Returns:
        ``(text, pdf)`` where ``text`` is the space-joined page text encoded
        as UTF-8 and ``pdf`` is the merged PDF, base64-encoded.

    The temporary blob and folder are removed whether or not processing
    succeeds.
    """
    tmp_root = tempfile.gettempdir()
    input_file_path = os.path.join(tmp_root, f'{key}.blob')
    output_file_path = os.path.join(tmp_root, f'{key}')
    os.mkdir(output_file_path)
    try:
        # Persist the blob so the converter can read it from disk.
        with open(input_file_path, "wb") as blob_out:
            blob_out.write(image)

        result = convert_pdf2jpg(
            input_file_path, output_file_path, dpi=300, pages="ALL")

        all_text = []
        all_pdf = []
        for index, jpg_path in enumerate(result[0]['output_jpgfiles']):
            text, pdf = extract_data(Image.open(jpg_path))
            all_text.append(text)
            # Keep the per-page PDF on disk until the merge step.
            page_pdf_path = os.path.join(
                output_file_path, f'{key}_{index}.pdf')
            with open(page_pdf_path, 'wb') as fout:
                fout.write(pdf)
            all_pdf.append(page_pdf_path)
            # The page image is no longer needed.
            os.remove(jpg_path)

        combine_text = ' '.join(all_text).encode('utf-8')

        # Merge the single-page PDFs, deleting each one once it is added.
        pdf_writer = PdfFileWriter()
        for page_pdf_path in all_pdf:
            pdf_reader = PdfFileReader(page_pdf_path)
            for page_no in range(pdf_reader.getNumPages()):
                pdf_writer.addPage(pdf_reader.getPage(page_no))
            os.remove(page_pdf_path)

        merged_path = os.path.join(output_file_path, f'{key}_all.pdf')
        with open(merged_path, 'wb') as fout:
            pdf_writer.write(fout)
        with open(merged_path, 'rb') as fin:
            data = base64.b64encode(fin.read())
        os.remove(merged_path)

        return combine_text, data
    finally:
        # Always drop the temporary blob and the working folder.
        if os.path.exists(input_file_path):
            os.remove(input_file_path)
        if os.path.exists(output_file_path):
            shutil.rmtree(output_file_path)

예제 #10

0

파일 보기

파일: Script.py 프로젝트: arka7007/IE-from-Scanned-PDF

def get_required_text_scanned_pdf(count, file, sentence, corpus):
    """Search the OCR text of a scanned PDF for a line matching ``corpus``.

    Pages ``0 .. count-1`` are rendered and OCRed one after another. A line
    that contains both ``corpus[0]`` and ``corpus[1]`` is joined with up to
    two following lines; if that window also contains ``corpus[2]``
    (case-insensitively) it is appended to ``sentence``.

    Args:
        count: number of pages to scan.
        file: path of the PDF (also used to build the image path).
        sentence: list that receives the matching text (mutated in place).
        corpus: sequence of three search terms.

    Returns:
        ``sentence`` as soon as the first match is found; ``None`` when no
        page contains a match.
    """
    # Tesseract options are the same for every page.
    config = ('-l eng --oem 1 --psm 3')
    for i in range(0, count):
        pdf2jpg.convert_pdf2jpg(file, 'images/', pages=str(i))
        # NOTE(review): assumes the converter writes to
        # "images/<file>/<page>_<file>.jpg" - confirm for the pdf2jpg
        # version in use.
        im = cv2.imread('images/' + file + '/' + str(i) + '_' + file + '.jpg',
                        cv2.IMREAD_COLOR)
        text = pytesseract.image_to_string(im, config=config)
        text_list = text.splitlines()
        # The last line is never a start line, so a window always has a
        # successor line; the slice yields two or three lines without
        # needing a catch-all exception handler.
        for j in range(0, len(text_list) - 1):
            if corpus[0] in text_list[j] and corpus[1] in text_list[j]:
                s = " ".join(text_list[j:j + 3])
                # corpus[0] and corpus[1] are in text_list[j], hence in s.
                if corpus[2].lower() in s.lower():
                    sentence.append(s)
                    return sentence

예제 #11

0

파일 보기

def watermark(original_pdf, output_pdf, watermark_pdf):
    """
    Take the original pdf and do the following:
        - merge it with watermark pdf into an intermediary pdf
        - export the intermediary pdf to jpeg
        - build another pdf file from jpegs -> watermark + readonly
    The intermediary pdf and the jpeg folder live under /tmp and are removed
    when the function exits, whether it succeeded or failed.

    Args:
        original_pdf: PDF to be watermarked.
        output_pdf: destination handed to ``makePdf``.
        watermark_pdf: PDF whose first page is merged onto every page.
    Refs:
        - https://stackabuse.com/working-with-pdfs-in-python-adding-images-and-watermarks
        - http://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2
    """

    print("starting worker: {0}".format(threading.get_ident()))

    tmp_pdf_name = 'intermediary_' + str(uuid.uuid1()) + '_.pdf'
    tmp_pdf_path = "{0}/{1}".format("/tmp", tmp_pdf_name)
    jpegs_dir = "{0}/{1}_{2}".format("/tmp", "jpegs", str(uuid.uuid1()))

    try:
        # Named *_reader so the local does not shadow this function's name.
        watermark_reader = PdfFileReader(watermark_pdf)
        watermark_page = watermark_reader.getPage(0)
        pdf = PdfFileReader(original_pdf)
        pdf_writer = PdfFileWriter()

        for page in range(pdf.getNumPages()):
            pdf_page = pdf.getPage(page)
            pdf_page.mergePage(watermark_page)
            pdf_writer.addPage(pdf_page)

        with open(tmp_pdf_path, 'wb') as fh:
            pdf_writer.write(fh)

        pdf2jpg.convert_pdf2jpg(tmp_pdf_path, jpegs_dir, pages="ALL")

        # The page images are read from "<jpegs_dir>/<pdf name>_dir".
        images_dir = "{0}/{1}_{2}".format(jpegs_dir, tmp_pdf_name, "dir")
        images_list = [i for i in os.listdir(images_dir) if i.endswith(".jpg")]
        sort_nicely(images_list)
        makePdf(output_pdf, images_list, images_dir)
    finally:
        # Never leave the intermediary artefacts behind, even on failure.
        if os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)
        shutil.rmtree(jpegs_dir, ignore_errors=True)

예제 #12

0

파일 보기

파일: init.py 프로젝트: mkalicharan/optional

def pdf_to_jpg(inputpath, imagepath):
    """
    This function is used to convert PDF drawings into JPGs

    Every ``*.pdf`` file directly inside ``inputpath`` is rendered at 300 dpi
    (all pages) into ``imagepath``. When ``__debug__`` is false the progress
    bar and ``label_1`` are updated after each file.

    Arguments:
        inputpath {str} -- folder that contains the PDF files
        imagepath {str} -- folder handed to the converter as output location

    Raises:
        Any exception raised during conversion is logged and re-raised.
    """
    try:
        # os.path.join picks the platform's separator instead of a
        # hard-coded backslash, so the pattern also matches on non-Windows.
        for file in glob.glob(os.path.join(inputpath, "*.pdf")):
            logging.info(' Converting ' + file + " to JPG")
            pdf2jpg.convert_pdf2jpg(file, imagepath, dpi=300, pages="ALL")
            if not __debug__:
                progress_bar_increment()
                label_1.configure(text = "Converting " + os.path.basename(file) + " to JPG")
                label_1.update()
    except Exception:
        # NOTE(review): the original set a local `error_present = 1` here with
        # its `global` declaration commented out, which had no effect; if a
        # module-level flag is wanted it must be declared global.
        logging.error(' Error in "pdf_to_jpg" function : ')
        error_message = PrintException()
        logging.error(error_message)
        raise

예제 #13

0

파일 보기

파일: views.py 프로젝트: revanthgss/ebook

def home(request):
    """Create a first-page JPEG thumbnail for every uploaded PDF, then render the home page.

    For each ``*.pdf`` in the upload folder that has no ``<name>.jpg`` next
    to it, page 0 is rendered, moved to ``<name>.jpg`` and the converter's
    ``<pdf>_dir`` folder is removed.
    """
    branch = 'CSE'
    year = '2'
    upload_dir = os.path.join('media', branch, year)
    # NOTE(review): drops the last six characters of this file's directory
    # to reach the project root - presumably the app folder name plus a
    # separator; confirm.
    ENV_PATH = os.path.abspath(os.path.dirname(__file__))[:-6]
    upload_dir = os.path.join(ENV_PATH, upload_dir)
    for filename in os.listdir(upload_dir):
        stem, extension = os.path.splitext(filename)
        # Only PDFs can be converted; skip thumbnails and any other files
        # instead of handing them to the converter.
        if extension.lower() != '.pdf':
            continue
        path = os.path.join(upload_dir, filename)
        newpath = os.path.join(upload_dir, stem + '.jpg')
        if not os.path.exists(newpath):
            result = pdf2jpg.convert_pdf2jpg(path, upload_dir, pages="0")
            # Page 0 is picked up from "<pdf>_dir/0_<pdf name>.jpg".
            oldpath = os.path.join(path + '_dir', '0_' + filename + '.jpg')
            print(result)
            os.rename(oldpath, newpath)
            os.rmdir(path + '_dir')
    return render(request, 'home.html')

예제 #14

0

파일 보기

 def pdfTojpg(self, event):
     """Render page 0 of every PDF in ``self.eachPDF`` and analyse its OCR text.

     Each first-page image is OCRed; the text is combined with the number
     obtained from the matching ``self.eachPerTIF`` entry via
     ``self.analyze`` and ``self.checkTXT``. ``event`` is not used.
     """
     outputpath = self.outputpath
     conversions = [
         pdf2jpg.convert_pdf2jpg(str(pdf), outputpath, pages="0")
         for pdf in self.eachPDF
     ]
     summaries = []
     for position, conversion in enumerate(conversions):
         first_page = str(conversion[0]['output_jpgfiles'][0])
         text = pytesseract.image_to_string(first_page)
         hwb_demo = self.getNum(self.eachPerTIF[position])
         analysed = str(self.analyze(hwb_demo, text))
         checked = str(self.checkTXT(text))
         summaries.append(analysed + checked)

예제 #15

0

파일 보기

파일: pdf2img.py 프로젝트: Guissey/Jarvis

    def __call__(self, jarvis, s):
        """Convert the PDF named by ``s`` into JPEG images.

        ``s`` is the file path as delivered by Jarvis, i.e. with the dot
        before the extension removed. The images are written to a folder
        named like the PDF without its extension.

        Args:
            jarvis: Jarvis API object used for user feedback (``say``).
            s: path of the PDF file, ending in ``pdf``.
        """
        if not s:
            jarvis.say("please enter file path after calling the plugin")
        elif not s.endswith("pdf"):
            jarvis.say("Your file must be a .pdf file")
        else:
            # We have to add the '.' back because the Jarvis API removes it.
            # Only the trailing extension is touched, so a "pdf" elsewhere in
            # the path (e.g. a folder called "pdfs") is left alone.
            dest_path = s[:-len("pdf")]
            if dest_path.endswith('.'):
                # The dot survived after all; do not add a second one.
                dest_path = dest_path[:-1]
            source_path = dest_path + '.pdf'

            jarvis.say(source_path)
            jarvis.say(dest_path)
            pdf2jpg.convert_pdf2jpg(source_path, dest_path, pages="ALL")
            jarvis.say("file successfully converted")

예제 #16

0

파일 보기

def do_ocr(key, blob, root_dir):
    """OCR a document blob and return its text and hOCR output.

    ``blob`` is written to ``<root_dir>/<key>.blob``. If ``check_if_image``
    accepts that file it is processed directly; otherwise it is rendered to
    JPEG pages at 300 dpi inside ``<root_dir>/<key>`` first.

    Args:
        key: name used for the temporary file and folder.
        blob: raw bytes of the document.
        root_dir: folder in which the temporary artefacts are created.

    Returns:
        ``(text, hocr)``, both UTF-8 encoded, with the per-page results
        joined by a single space.

    The temporary blob and folder are removed whether or not processing
    succeeds.
    """
    input_file_path = os.path.join(root_dir, f'{key}.blob')
    output_file_path = os.path.join(root_dir, f'{key}')
    os.mkdir(output_file_path)
    try:
        # Persist the blob so it can be inspected and converted from disk.
        with open(input_file_path, "wb") as blob_out:
            blob_out.write(blob)

        if check_if_image(input_file_path):
            # Already an image: OCR the file itself.
            page_paths = [input_file_path]
        else:
            conversion = convert_pdf2jpg(
                input_file_path, output_file_path, dpi=300, pages="ALL")
            page_paths = conversion[0]['output_jpgfiles']

        all_text = []
        all_hocr = []
        for page_path in page_paths:
            text, hocr = extract_data_ocr(Image.open(page_path))
            all_text.append(text)
            all_hocr.append(hocr)

        combine_text = ' '.join(all_text).encode('utf-8')
        # hOCR pieces are decoded before joining and re-encoded as a whole.
        combine_hocr = ' '.join(
            piece.decode('utf-8') for piece in all_hocr).encode('utf-8')
        return combine_text, combine_hocr
    finally:
        # Always drop the temporary blob and the working folder.
        if os.path.exists(input_file_path):
            os.remove(input_file_path)
        if os.path.exists(output_file_path):
            shutil.rmtree(output_file_path)

예제 #17

0

파일 보기

파일: pdfconv.py 프로젝트: coolpulkit99/ocr_denoise_position_extraction

from pdf2jpg import pdf2jpg
# Source PDF and the folder handed to the converter as output location.
inputpath = "test.pdf"
outputpath = "convertedpdf"

# Render every page and show what the converter reports back.
result = pdf2jpg.convert_pdf2jpg(
    inputpath,
    outputpath,
    pages="ALL",
)
print(result)

예제 #18

0

파일 보기

파일: main.py 프로젝트: terrsoshi/TextTableLocalizationIP

                    "Please enter the full path of the pdf document \n(Eg. " +
                    r"C:\Users\Eugene\Desktop\CSC2014 Assignment Aug 2019\Test Files\Test PDF Documents\test.pdf) :"
                    + "\n\n")
                sourcepath = r"" + userpdf

                #Prompts user to enter full path of the destination folder for the converted images to be saved into.
                userfolder = input(
                    "Please enter the full path of the folder for images to be saved to \n(Eg. "
                    +
                    r"C:\Users\Eugene\Desktop\CSC2014 Assignment Aug 2019\Test Files\Test PDF Documents) :"
                    + "\n\n")
                destinationpath = r"" + userfolder

                #Convert all pages of user's pdf into separate images saved in jpg format.
                convert = pdf2jpg.convert_pdf2jpg(sourcepath,
                                                  destinationpath,
                                                  pages="ALL")

                #If the entered pdf document cannot be found, output_jpgfiles list in dictionary at index 0 of convert list will be empty. An exception is raised if this happens.
                if len(convert[0]['output_jpgfiles']) == 0:
                    raise Exception

                #Get the pdf file name from user's first input
                pdfsplit = sourcepath.split("\\")
                pdfname = pdfsplit[len(pdfsplit) - 1]

                #Read all converted images in jpg format in the destination folder and save all of it into the pages list
                pages = [
                    cv2.imread(image)
                    for image in glob.glob(r"" + destinationpath + "\\" +
                                           pdfname + "_dir\\*.jpg")

예제 #19

0

파일 보기

 def pdf_ocr(self):
     """OCR the invoice PDFs and the waybill PDFs and store the findings on ``self``.

     First pass (``self.eachpdfinv``): every PDF is rendered, at most its
     first three page images are OCRed, and the trade term found (one of
     ``termlist``, else ``'none'``) is appended to ``self.inv_list`` together
     with the ``'s<number>'`` tag of the matching ``self.newdir`` entry.

     Second pass (``self.eachPDF``): every PDF is rendered, at most its first
     three page images are OCRed, package count and weight are parsed from
     the text, and ``[tag, number, checked texts, pack, weight]`` is appended
     to ``self.pdflist``. ``self.pdfstring`` receives the string form of the
     per-PDF summaries (it stays ``''`` when ``self.eachPDF`` is empty).
     """
     self.pdfstring = ''
     self.pdflist = []
     termlist = ['EXW', 'CIP', 'CIF', 'FCA', 'FOB', 'DDU', 'DAP']
     # NOTE(review): results are appended to self.inv_list, which is not
     # initialised in this method - confirm it is created elsewhere.
     outputpath = self.inputpath
     for k in self.eachpdfinv:
         inputpath = str(k)
         # NOTE(review): s_num stays unset (or keeps the previous PDF's
         # value) when no entry of self.newdir matches - confirm a match is
         # guaranteed.
         for n in self.newdir:
             # Raw string: '\d' is an invalid escape in a normal literal.
             s_part = re.findall(r'\d+', n)[0]
             s_tr = 's' + s_part
             if s_part in k:
                 s_num = s_tr
                 break
         invres = (pdf2jpg.convert_pdf2jpg(inputpath,
                                           outputpath,
                                           pages="ALL"))
         invadd = invres[0]['output_jpgfiles']
         term = 'none'
         limit = 0
         for kkk in invadd:
             text = pytesseract.image_to_string(kkk)
             for a_min in termlist:
                 if a_min in text:
                     term = a_min
                     break
             # Stop after the third page image.
             if limit >= 2:
                 break
             limit += 1
         self.inv_list.append([s_num, term])
     a = 0
     b = []
     item = 0
     if self.eachPDF != []:
         outputpath = self.inputpath
         for i in self.eachPDF:
             inputpath = str(i)
             pack = 'none'
             weight = 'none'
             # NOTE(review): same caveat as s_num above applies to s_number.
             for n in self.newdir:
                 num_part = re.findall(r'\d+', n)[0]
                 s_tr = 's' + num_part
                 if num_part in i:
                     s_number = s_tr
                     break
             invres = (pdf2jpg.convert_pdf2jpg(inputpath,
                                               outputpath,
                                               pages="ALL"))
             invadd = invres[0]['output_jpgfiles']
             hwb_real = self.analyzeNum(self.eachPerName[a])
             forwardername = self.checkforwarder(self.eachPerName[a])
             textChecked = []
             limit = 0
             for page_image in invadd:
                 text = pytesseract.image_to_string(page_image)
                 # Distinct name so the outer loop variable is not shadowed.
                 for checked in self.checkTXT(text):
                     textChecked.append(checked)
                 if re.findall(r"\d+\s\d+k", text, re.I) != []:
                     # "<packages> <weight>k": both numbers are whole.
                     p_w = re.findall(r"\d+\s\d+k", text, re.I)[0]
                     numbers = re.findall(r"\d+", p_w, re.I)
                     pack = str(numbers[0])
                     weight = str(numbers[1])
                 elif re.findall(r"\d+\s\d+.{3,4}K", text, re.I) != []:
                     # "<packages> <weight>...K": the weight may carry a
                     # fractional part that shows up as a third number.
                     p_w = re.findall(r"\d+\s\d+.{3,4}K", text, re.I)[0]
                     numbers = re.findall(r"\d+", p_w, re.I)
                     pack = str(numbers[0])
                     weight = str(numbers[1])
                     if len(numbers) != 2:
                         weight += '.'
                         weight += str(numbers[2])
                 # Stop after the third page image.
                 if limit >= 2:
                     break
                 limit += 1
             textChecked.append(forwardername)
             b.append(s_number + ": " + str(hwb_real) + str(textChecked))
             self.pdflist.append(
                 [s_number, hwb_real, textChecked, pack, weight])
             item += 1
             a += 1
             print('PDF已完成 ' + str(item) + ' 单' + '\n运单号为： ' +
                   self.eachPerName[a - 1])
         self.pdfstring = str(b)

예제 #20

0

파일 보기

# -*- coding: utf-8 -*-
"""
Created on Wed Nov 21 15:51:00 2018

@author: Murali
"""

from pdf2jpg import pdf2jpg

# Convert the sample PDF; no `pages` or `dpi` argument is passed here.
result = pdf2jpg.convert_pdf2jpg(
    r'D:\Sargunan\Table\001.pdf',
    r'c:\temp\p',
)
print(result)

예제 #21

0

파일 보기

def main():
    """Convert the pages selected by "1, 2" of a fixed PDF and print the result."""
    source_pdf = r"C:\302015_c.pdf"
    target_dir = r"C:\Users\azkb075\Downloads"
    # The page selection is passed as one comma-separated string.
    print(pdf2jpg.convert_pdf2jpg(source_pdf, target_dir, pages="1, 2"))

예제 #22

0

파일 보기

def pdf_to_images(filename, inputpath, outputpath):
    """Render every page of ``<inputpath>/<filename>`` into ``outputpath``.

    Args:
        filename: name of the PDF file.
        inputpath: folder that contains the PDF (with or without a trailing
            separator).
        outputpath: folder handed to the converter as output location.
    """
    import os

    # os.path.join inserts the separator only when it is missing, so the
    # function works whether or not inputpath ends with one.
    inputpath = os.path.join(inputpath, filename)
    pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL")

예제 #23

0

파일 보기

파일: main.py 프로젝트: HarshitPatel25/OCR

import glob
import os

from pdf2image import convert_from_path
from pdf2jpg import pdf2jpg

inputpath = r"C:\New folder\New Vision Soft\OCR\PDF_JPG\PNB_BS.pdf"
outputpath = r"C:\New folder\New Vision Soft\OCR\PDF_JPG"

# To convert a single page
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="1")
print(result)

# To convert multiple pages
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="1,0,3")
print(result)

# To convert all pages
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="ALL")
print(result)

# Same document through pdf2image (500 is passed as the second argument).
pages = convert_from_path('PNB_BS.pdf', 500)

pdf_dir = glob.glob(r'C:\New folder\New Vision Soft\OCR\PDF_JPG\PNB_BS.pdf'
                    )  # your pdf folder path
img_dir = r'C:\New folder\New Vision Soft\OCR\PDF_JPG'

for pdf_ in pdf_dir:
    pages = convert_from_path(pdf_, 500)
    stem = os.path.splitext(os.path.basename(pdf_))[0]
    for page_number, page in enumerate(pages):
        # One file per page, placed inside img_dir: the page index in the
        # name keeps later pages from overwriting earlier ones, and
        # os.path.join supplies the separator the plain concatenation lacked.
        page.save(
            os.path.join(img_dir, "{}_{}.jpg".format(stem, page_number)),
            'JPEG')

예제 #24

0

파일 보기

파일: import.py 프로젝트: inshapardaz/tools

def extractJpegFromPng(inputFolder, outputFolder):
    """Pass ``inputFolder`` to the PDF-to-JPEG converter, writing to ``outputFolder``.

    NOTE(review): despite the name, the call made here is the PDF converter
    with all pages selected - confirm what callers pass as ``inputFolder``.
    """
    page_selection = "ALL"
    pdf2jpg.convert_pdf2jpg(inputFolder, outputFolder, pages=page_selection)

예제 #25

0

파일 보기

파일: api.py 프로젝트: yveronne/PVGenerator

def index():
    """Generate simulated polling-station reports (PVs), render, OCR and publish them.

    Steps, in order:

    1. Read ``number_party``, ``bureau_number``, ``enrolled_number`` and
       ``coalition_mode`` from ``request.args`` and store them on the
       ``variable`` module.
    2. Reset the ``PV_IMAGE`` / ``PV_PDF`` folders located next to this file.
    3. Build random per-bureau results and the PVs derived from them.
    4. Render every PV to a PDF with FPDF.
    5. Convert each PDF to a JPEG with pdf2jpg and flatten the output folders.
    6. OCR each image with pytesseract, write the parsed records to
       ``results.json`` and POST them to the API at ``localhost:3000``.

    Returns:
        ``str(status.HTTP_200_OK)`` on success. On failure a 2-tuple
        ``(str(status.HTTP_400_BAD_REQUEST), message)`` is returned instead;
        every exception raised after step 1 is caught and reported this way.
    """
    try:
        # Known party names; only the first `number_party` of them are used.
        tab = ['RDPC', 'MRC', 'SDF', 'FPD', 'ADD', 'UDC', 'UNIVERS', 'PURS', 'MCNC', 'ANDP', 'CPP', 'SCNC', 'MP']
        variable.PARTIES_LIST = tab[:int(request.args.get('number_party'))]
        print("Je récupère les partis ")
        print(variable.PARTIES_LIST)
        variable.BUREAU_VOTES_NUMBER = int(request.args.get('bureau_number'))
        variable.ENROLLED_PEOPLE_NUMBER = int(request.args.get('enrolled_number'))
        variable.COALITION_MODE = int(request.args.get('coalition_mode'))
    except Exception:
        # A missing or non-integer query argument ends up here.
        return str(status.HTTP_400_BAD_REQUEST), "The data is not in the real format"
    try:
        import shutil
        # Each folder is wiped and recreated only when it already exists.
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_PDF'))
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF'))

        

        import random

        from PV import PV
        # Imported after the assignments above, so these local names are bound
        # to the values just stored on the `variable` module.
        from variable import BUREAU_VOTES_NUMBER, ENROLLED_PEOPLE_NUMBER, PARTIES_LIST, COALITION_MODE, getCoalitionName

        from BureauVote import BureauVote
        from Fonction import Fonction

        bureauVotes_List = []
        print(PARTIES_LIST)
        repartitions = Fonction.generateRandomNumbers(BUREAU_VOTES_NUMBER, ENROLLED_PEOPLE_NUMBER)

        # distribute the numbers of enrolled voters among the polling stations
        for i in range(BUREAU_VOTES_NUMBER):
            bureauVotes_List.append(BureauVote('Bureau_de_Vote_' + str(i + 1), repartitions[i]))

        # record the real results of each party in each polling station, (RDPC, vote count)
        for bureau in bureauVotes_List:
            pv = PV("Reference")
            pv.party_results = Fonction.generateRandomNumbers(len(PARTIES_LIST), bureau.enrolled_persons)
            bureau.results = pv
            # bureau.results = [(PARTIES_LIST[i], votes) for i, votes in
            #                   enumerate(Fonction.generateRandomNumbers(len(PARTIES_LIST), bureau.enrolled_persons))]

        # this is the part where we start generating the PVs according to the fraud

        # print(Fonction.generatePVForBureauWithCoalitionMode(bureauVotes_List[0].results, COALITION_MODE, 2, [0, 6, 2]))
        # Here all the PVs are generated, with a possible coalition
        number_of_coalised_party = random.randint(1, len(PARTIES_LIST) - 1)
        coalised_group = Fonction.generateTabRandomElementsDiff(number_of_coalised_party, len(PARTIES_LIST))
        party_favorite = random.choice(coalised_group)
        for bureau in bureauVotes_List:
            bureau.all_pv = Fonction.generatePVForBureauWithCoalitionMode(bureau.results.party_results, COALITION_MODE, party_favorite, coalised_group)
        for bureau in bureauVotes_List:
            # Weighted coin flip: 1 is drawn with weight 0.65, 0 with weight 0.35.
            t = random.choices([0, 1], [0.35, 0.65])[0]
            if t:
                # The Elecam PV gets freshly generated numbers.
                pv = PV("Elecam")
                pv.party_results = Fonction.generateRandomNumbersWithPref(len(PARTIES_LIST), bureau.enrolled_persons, 0)
                bureau.elecam_pv = pv
            else:
                # The Elecam PV reuses the reference results of the bureau.
                pv = PV("Elecam")
                pv.party_results = bureau.results.party_results
                bureau.elecam_pv = pv
        for bureau in bureauVotes_List:
            print(bureau)
            for item in bureau.all_pv:
                print(item)

        # Create all the folders

        #import os

        for party in PARTIES_LIST:
            if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', party)):
                os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', party))

        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'REAL_RESULT')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'REAL_RESULT'))

        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'ELECAM')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'ELECAM'))

        # Generating PDF files
        from fpdf import FPDF

        width_cell, height_cell = 150, 40

        for bureau in bureauVotes_List:
            # NOTE: this loop variable rebinds the name `PV`, which until here
            # referred to the class imported above; the class is not
            # instantiated again later in this function.
            for PV in bureau.all_pv:
                pdf = FPDF(orientation='P', unit='pt', format='A4')
                pdf.add_page()
                pdf.set_font("Arial", size=16)
                pdf.multi_cell(0, 20, "PV_" + bureau.name, 0)
                pdf.set_top_margin(20)
                pdf.set_font("Arial", size=14)
                pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
                pdf.set_top_margin(20)
                pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
                pdf.set_top_margin(20)
                pdf.set_fill_color(193, 229, 252)
                # First table: one row per party with its vote count.
                pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
                pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
                for index, votes in enumerate(PV.party_results):
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.multi_cell(0, 20, "", 0)
                pdf.set_top_margin(20)
                pdf.set_fill_color(193, 229, 252)
                # Second table: one row per party with a scrutineer label.
                pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
                pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
                for index, votes in enumerate(PV.party_results):
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                    pdf.set_font("Arial", size=12)
                    pdf.cell(2 * width_cell, height_cell,
                            "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)

                # NOTE(review): this output path is relative to the current
                # working directory, whereas the folders above are created
                # next to this file - confirm both resolve to the same place.
                pdf.output('PV_PDF/' + str(PV.party_name) + '/PV_' + str(bureau.name) + '.pdf')

            # write the Elecam PV
            pdf = FPDF(orientation='P', unit='pt', format='A4')
            pdf.add_page()
            pdf.set_font("Arial", size=16)
            pdf.multi_cell(0, 20, "PV_ELECAM_" + bureau.name, 0)
            pdf.set_top_margin(20)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
            pdf.set_top_margin(20)
            pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
            pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.elecam_pv.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "", 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
            pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.elecam_pv.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=12)
                pdf.cell(2 * width_cell, height_cell,
                        "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)

            pdf.output('PV_PDF/ELECAM/PV_Elecam_' + str(bureau.name) + '.pdf')

            # write the reference PV
            pdf = FPDF(orientation='P', unit='pt', format='A4')
            pdf.add_page()
            pdf.set_font("Arial", size=16)
            pdf.multi_cell(0, 20, "PV_GAGNANT_" + bureau.name, 0)
            pdf.set_top_margin(20)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
            pdf.set_top_margin(20)
            pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
            pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.results.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "", 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
            pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.results.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=12)
                pdf.cell(2 * width_cell, height_cell,
                        "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)

            pdf.output('PV_PDF/REAL_RESULT/PV_Gagnant_' + str(bureau.name) + '.pdf')




        #import os
        from pdf2jpg import pdf2jpg

        from variable import PARTIES_LIST

        #import shutil
        # Start the image tree from scratch before converting.
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))


        for party in PARTIES_LIST:
            if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', party)):
                os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', party))

        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT'))

        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM'))

        # Convert every PDF under PV_PDF/<folder> into PV_IMAGE/<folder>.
        for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF')):
            for file in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder)):
                # pages = convert_from_path(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file), 500)
                # pages[0].save(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, file), 'JPEG')
                result = pdf2jpg.convert_pdf2jpg(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file),
                                                os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder), pages="0")
                print(result)

        print('Delete that folders')

        #import shutil

        # Flatten the tree: move the first file of each sub-folder one level
        # up under a shortened name, then remove the sub-folder.
        for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            for small_folders in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder)):
                file = os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders))[0]
                tab = file.split('.')
                # Drops the first two characters of the name and keeps only
                # the last extension - presumably stripping a page-number
                # prefix added by pdf2jpg; TODO confirm.
                new_file_name = tab[0][2:] + '.' + tab[-1]
                shutil.move(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders, file),
                            os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, new_file_name))
                # os.rmdir only succeeds when the sub-folder is empty by now.
                os.rmdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders))

        


        import json
        #import os

        import requests
        from PIL import Image
        from pytesseract import image_to_string

        from variable import PARTIES_LIST


        def removeNumbers(string_numbers):
            """Return the trailing run of digits of ``string_numbers``.

            The string is scanned from its end. When the last character is
            not a digit, or when every character is a digit, the string is
            returned unchanged. Despite the name, the digits are what is kept.
            """
            for i, el in enumerate(string_numbers):
                if not string_numbers[-int(i) - 1].isdigit():
                    return string_numbers[-int(i - 1) - 1:]
                    break
            return string_numbers


        # NOTE(review): a ',' is written after every record, including the
        # last one, so a non-empty file ends with ',]' and is not strict JSON.
        with open('results.json', 'w+') as f:
            f.write('[')
            # `bureau_index` enumerates the folders of PV_IMAGE (one per
            # party plus ELECAM and REAL_RESULT); `pv_index` enumerates the
            # images inside one folder.
            for bureau_index, folder in enumerate(os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))):
                for pv_index, image in enumerate(os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder))):
                    # OCR the image and keep only its non-blank lines.
                    img = [str(elt) for elt in
                        image_to_string(
                            Image.open(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, image))).split(
                            '\n') if not elt.replace(" ", "") == ""]
                    # The character swaps (O -> 0, I -> 1, | -> 1) presumably
                    # undo common OCR misreads of digits.
                    bureau_de_Vote = img[0].replace("O", '0').replace('I', '1').replace(" ", "").split("_")[-1]
                    enrolled_number = img[1].split(':')[-1].replace("I", "1").replace(" ", "")
                    #print(bureau_de_Vote)
                    #print(enrolled_number)
                    #print(img)
                    #print(folder + image)
                    # One OCR line per party, starting at line index 4; the
                    # vote count is the trailing run of digits on that line.
                    party = [int(removeNumbers(elt.replace(" ", "").replace("|", "1").replace("O", "0"))) for elt in
                            img[4:len(PARTIES_LIST) + 4]]
                    #print(party)
                    # NOTE(review): the multiplier 11 is not explained here;
                    # ids could collide if a folder holds more than 11 images.
                    global_id = 11 * bureau_index + pv_index
                    #print(folder)
                    if folder == "ELECAM":
                        real_id, party_name = "elecam", -1
                    elif folder == "REAL_RESULT":
                        real_id, party_name = "bon", -2
                    else:
                        real_id, party_name = global_id, PARTIES_LIST.index(folder)

                    scrutateur_format = dict(scrutineerId=global_id, scrutineerName=real_id, partyNumber=party_name)
                    pv_format = dict(pvId=global_id, pollingStation=removeNumbers(bureau_de_Vote), numberOfSuscribers=enrolled_number,
                                    numberOfVoters=enrolled_number, voices=party, partyNumber=party_name,
                                    scrutineer="resource:org.cloud.elections.Scrutineer#{}".format(global_id), scrutineerName=real_id)
                    data_format = dict(id=global_id, bureau=bureau_de_Vote, nbreInscrits=enrolled_number,
                                    nbreVotants=enrolled_number,
                                    voix=party, idScrutateur=global_id,
                                    nomScrutateur=real_id, parti=party_name)
                    json.dump(data_format, f)
                    f.write(',')
                    # The responses of both POST requests are not inspected.
                    r = requests.post('http://localhost:3000/api/Scrutineer', json=scrutateur_format)
                    r = requests.post('http://localhost:3000/api/Pv', json=pv_format)

            f.write(']')


        
        #print(variable.PARTIES_LIST)
        #print(variable.BUREAU_VOTES_NUMBER)
        #print(variable.ENROLLED_PEOPLE_NUMBER)
        #print(variable.COALITION_MODE)
        return str(status.HTTP_200_OK)
    except Exception as e:
        # Any failure in the pipeline above is reported as a 400 with its text.
        return str(status.HTTP_400_BAD_REQUEST), str(e)

예제 #26

0

파일 보기

파일: pdf2jpg.py 프로젝트: attolares/uppercase-dataset

from pdf2jpg import pdf2jpg

# Convert every page of ou1.pdf at 300 dpi; the output path argument is an
# empty string.
pdf2jpg.convert_pdf2jpg(
    "ou1.pdf",
    "",
    pages="ALL",
    dpi=300,
)

예제 #27

0

파일 보기

파일: crawler.py 프로젝트: marcosfons/brumadinho_crawler

def toImages(filename, outPath=local_dir + r"\imgs"):
    """Convert all pages of ``filename`` with pdf2jpg and print the result.

    Output goes to ``outPath``; its default is built once, at definition
    time, from the module-level ``local_dir``.
    """
    from pdf2jpg import pdf2jpg as converter
    print(converter.convert_pdf2jpg(filename, outPath, pages="ALL"))

예제 #28

0

파일 보기

def askdirectory():
    """OCR every PDF of a user-chosen folder and store the text in data_slovar.json.

    Previously stored results are loaded from ``data_slovar.json`` (an empty
    dict is used when the file does not exist). The user then picks a folder
    through ``filedialog.askdirectory()``. Each ``.pdf`` in it is converted
    to JPEG pages under ``dump_for_jpg`` in the current working directory,
    each page is OCR'd with pytesseract (``lang='rus'``) and the text is
    appended to the entry keyed by the image folder name minus its last four
    characters (presumably the ``_dir`` suffix added by pdf2jpg - confirm).
    Processed image folders are deleted as soon as they have been read.

    The JSON file is rewritten only after all PDFs were processed, so a
    cancelled dialog or a failure half-way through leaves the previously
    stored data untouched instead of an empty, truncated file.

    Returns:
        None. When the dialog is cancelled nothing is processed or written.
    """
    json_path = "data_slovar.json"
    try:
        with open(json_path, 'r', encoding='UTF-8') as q:
            data_slovar = json.load(q)
        print ("json был , скачаны файлы,  json обнулен и создан заново")
    except FileNotFoundError:
        data_slovar = {}
        print ("json не было ,json создан заново")

    of = filedialog.askdirectory()
    if not of:
        # The dialog yields an empty value when cancelled; there is nothing
        # to list or convert in that case.
        return
    list_of_folder = os.listdir(of)
    try:
        os.mkdir("dump_for_jpg")
    except OSError:
        print ("Директория есть?")

    list_of_pdf_files = [name for name in list_of_folder if name.endswith(".pdf")]
    print(list_of_pdf_files)
    print(len(list_of_pdf_files))

    outputpath = os.getcwd() + '//' + 'dump_for_jpg'
    for pdf_name in list_of_pdf_files:
        # Every '/' is doubled before the path is handed to pdf2jpg, exactly
        # as the original character-by-character loop did.
        alfa_inputpath = (of + "/" + pdf_name).replace('/', '//')
        pdf2jpg.convert_pdf2jpg(alfa_inputpath, outputpath, pages="ALL")

        # Read back every image folder currently in the dump directory.
        for name_of_folder_with_jpg in os.listdir(outputpath):
            path_remove = outputpath + '//' + name_of_folder_with_jpg
            key = name_of_folder_with_jpg[:-4]
            for jpg_name in os.listdir(path_remove):
                dir_of_jpg = path_remove + '//' + jpg_name
                # Close the image before its folder is deleted below.
                with Image.open(dir_of_jpg) as page_image:
                    string_of_image = pytesseract.image_to_string(page_image, lang='rus')
                if key in data_slovar:
                    data_slovar[key] = data_slovar[key] + ' ' + string_of_image
                else:
                    data_slovar[key] = string_of_image

            # Remove the processed image folder so that it is not OCR'd again
            # after the next PDF has been converted.
            shutil.rmtree(path_remove, ignore_errors=False, onerror=None)

    # Write the json file only now that every PDF has been processed.
    with open(json_path, 'w', encoding='UTF-8') as q:
        json.dump(data_slovar, q)
    print("Представление окончено")

예제 #29

0

파일 보기

파일: DarkModePdf.py 프로젝트: thechawla225/NIte-Mode

def pdf_to_images(filename, inputpath):
    """Convert all pages of ``filename`` found in ``inputpath``.

    The images are written back under the same ``inputpath`` directory.
    """
    pdf_path = os.path.join(inputpath, filename)
    pdf2jpg.convert_pdf2jpg(pdf_path, inputpath, pages="ALL")

예제 #30

0

파일 보기

if not os.path.exists(
        os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT')):
    os.makedirs(
        os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT'))

if not os.path.exists(
        os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM')):
    os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM'))

for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF')):
    for file in os.listdir(
            os.path.join(os.path.dirname(__file__), 'PV_PDF', folder)):
        # pages = convert_from_path(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file), 500)
        # pages[0].save(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, file), 'JPEG')
        result = pdf2jpg.convert_pdf2jpg(
            os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file),
            os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder),
            pages="0")
        print(result)

print('Delete that folders')

import shutil

for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
    for small_folders in os.listdir(
            os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder)):
        file = os.listdir(
            os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder,
                         small_folders))[0]
        tab = file.split('.')
        new_file_name = tab[0][2:] + '.' + tab[-1]