def extract_text(self):
    """OCR every page of ``self.filename`` and store the text in ``self.english``.

    The PDF is rendered to JPEG pages under ``self.image_out_path`` with
    pdf2jpg, each page image is passed through pytesseract, the page texts
    are joined, and the result is split on "." into stripped fragments.
    ``self.image_out_path`` is deleted once the text has been extracted.
    """

    def page_order(name):
        # The generated files are assumed to be named "<page>_<pdf name>.jpg"
        # (TODO confirm against the installed pdf2jpg version). Sorting on the
        # numeric prefix keeps the OCR output in page order, because
        # os.listdir returns entries in arbitrary order.
        prefix = name.split("_", 1)[0]
        if prefix.isdigit():
            return (0, int(prefix), name)
        return (1, 0, name)

    os.makedirs(self.image_out_path, exist_ok=True)
    pdf2jpg.convert_pdf2jpg(self.filename, self.image_out_path, dpi=300)
    print("Successfully saved images for each page for {}".format(
        self.image_out_path))

    pages_dir = os.path.join(self.image_out_path,
                             os.path.basename(self.filename) + "_dir")
    english_text = []
    for filename in sorted(os.listdir(pages_dir), key=page_order):
        if filename.endswith("jpg"):
            text = str(pytesseract.image_to_string(
                Image.open(os.path.join(pages_dir, filename))))
            # Re-join words that were hyphenated across a line break.
            english_text.append(text.replace('-\n', ''))

    corpus = " ".join(english_text)
    corpus = re.sub(r'\n+', '\n', corpus).strip()
    self.english = [fragment.strip() for fragment in corpus.split(".")]
    print("English Text Extracted is : {}".format(self.english))
    shutil.rmtree(self.image_out_path)
def convert_pdf(content_dict):
    """Convert 'Pdfs/Prova <key>.pdf' into JPEGs under 'Images/' for every key.

    Args:
        content_dict: mapping whose keys identify the exams; only the keys
            are used.

    A missing or unreadable PDF, or a failed conversion, is reported on
    stdout and the loop moves on to the next key.
    """
    try:
        os.mkdir('Images')
    except OSError:
        # Best effort: the directory usually exists already; any other
        # problem surfaces as a conversion error below.
        pass

    for key in content_dict:
        # str() is applied once so non-string keys work for the check and
        # for the conversion alike.
        pdf_name = 'Pdfs/Prova ' + str(key) + '.pdf'

        # Check that the exam PDF exists and can be opened.
        try:
            with open(pdf_name, 'rb'):
                pass
        except OSError:
            print('Erro ao converter ' + str(key))
            continue

        try:
            pdf2jpg.convert_pdf2jpg(pdf_name, 'Images/', pages='ALL')
            print('Convertendo Prova ' + str(key))
        except Exception:
            print('Erro ao converter Prova ' + str(key))
            continue
def read_pdfpage_as_image(pdf, directory, cert_no):
    """Render ``pdf`` to JPEG pages and move them into ``directory``.

    Each page image is renamed to ``<cert_no>_<random 8-digit number>.jpg``;
    when the PDF produced more than one image a 1-based ``_<index>`` is
    inserted before the extension. The intermediate ``<pdf name>_dir``
    folder is removed afterwards.

    Args:
        pdf: path of the PDF to convert.
        directory: output directory handed to pdf2jpg and target of the
            renamed images.
        cert_no: string prefix for the generated file names.
    """

    def page_order(name):
        # Files are assumed to be named "<page>_<pdf name>.jpg" (TODO
        # confirm); sort numerically so the index suffix follows page order
        # rather than the arbitrary order of os.listdir.
        prefix = name.split("_", 1)[0]
        if prefix.isdigit():
            return (0, int(prefix), name)
        return (1, 0, name)

    tgt_folder = directory + "/" + os.path.basename(pdf) + "_dir"
    pdf2jpg.convert_pdf2jpg(pdf, directory, dpi=150, pages="ALL")

    img_list = sorted(os.listdir(tgt_folder), key=page_order)
    multi_page = len(img_list) > 1
    for index, fname in enumerate(img_list, start=1):
        src_file = tgt_folder + "/" + fname
        # A fresh random number is drawn for every page.
        random_number = random.randint(10000000, 99999999)
        tgt_path = directory + "/" + cert_no + "_" + str(random_number)
        if multi_page:
            tgt_path = tgt_path + "_" + str(index)
        tgt_path = tgt_path + ".jpg"
        file.rename_file(src_file, tgt_path)
    file.remove_directory(tgt_folder)
def get_page_image_from_pdf(page_num, image_file_name, pdf_file_path):
    """Render one PDF page to an image file and load it back with flag 0.

    The page is converted into the "images/" folder, read in colour, written
    to ``image_file_name`` and finally re-read with ``cv2.imread(..., 0)``.
    """
    page = str(page_num)
    # Convert only the requested page.
    pdf2jpg.convert_pdf2jpg(pdf_file_path, "images/", pages=page)
    rendered_path = (
        "images/" + pdf_file_path + "/" + page + "_" + pdf_file_path + ".jpg"
    )
    colour_page = cv2.imread(rendered_path, cv2.IMREAD_COLOR)
    cv2.imwrite(image_file_name, colour_page)
    return cv2.imread(image_file_name, 0)
def ocrfinal():
    """Convert the PDF at global ``inputpath`` to images and OCR them.

    Progress is shown on ``btn1`` and on stdout. The text recognised on each
    image found directly under ``"PDFIMAGES/" + inputpath`` is appended to
    the global ``textdata``; the OCR language is
    ``langs[selected.get()]``.
    """
    global textdata
    global inputpath
    global outputpath
    global langs

    print("Started OCRING")
    btn1.configure(text="Started OCRING")
    # Raw string: "\P" and "\T" are invalid escape sequences in a normal
    # literal. The runtime value is unchanged.
    pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR"

    btn1.configure(text="pdf to images")
    print("pdf to images")
    pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL")
    # Only the first level of the walk is needed: the image folder itself.
    path, _, files = next(os.walk("PDFIMAGES/" + inputpath))

    btn1.configure(text="image to string")
    print("image to string")
    for image_name in files:
        btn1.configure(text=image_name)
        print(image_name)
        textdata = textdata + pytesseract.image_to_string(
            path + "/" + image_name, lang=str(langs[selected.get()]))
    # writing the files in txt format
    print("Saving the File")
def convert_qpaper(inputfolder, inputfilename, outputfolder, qpaperfoldername=""):
    """Convert a question-paper PDF to JPEGs and list the generated images.

    Returns a list of ``(inputfilename, relative_jpg_path)`` tuples where the
    relative path is ``[qpaperfoldername/]<pdf output folder name>/<jpg name>``.
    """
    source_pdf = os.path.join(inputfolder, inputfilename)
    result = pdf2jpg.convert_pdf2jpg(source_pdf, outputfolder, dpi=300, pages="ALL")
    print(result)

    # Drop the directory part of every path and keep file names only.
    if len(qpaperfoldername) > 0:
        prefix = qpaperfoldername + "/"
    else:
        prefix = qpaperfoldername
    jpg_folder = prefix + os.path.basename(result[0]['output_pdfpath'])
    return [
        (inputfilename, jpg_folder + "/" + os.path.basename(jpg_path))
        for jpg_path in result[0]['output_jpgfiles']
    ]
def askfail():
    """Ask for one PDF, OCR it (Russian) and add its text to data_slovar.json.

    The PDF chosen in the file dialog is rendered to JPEGs under
    ``<cwd>//dump_for_jpg``; the text of every page image is concatenated
    (space separated) under the key ``<image folder name>[:-4]``. If that key
    is already present the images are discarded and an info box is shown.

    The JSON file is rewritten only after processing has finished, so a
    failure during conversion or OCR no longer leaves it truncated.
    """
    db_path = "data_slovar.json"
    try:
        with open(db_path, 'r', encoding='UTF-8') as db_file:
            data_slovar = json.load(db_file)
        print("json был , скачаны файлы, json обнулен и создан заново")
    except FileNotFoundError:
        data_slovar = {}
        print("json не было ,json создан заново")

    def save_database():
        # Persist the dictionary in one short, always-closed write.
        with open(db_path, 'w', encoding='UTF-8') as db_file:
            json.dump(data_slovar, db_file)

    try:
        os.mkdir("dump_for_jpg")
    except OSError:
        print("Директория есть?")

    # MAIN PART OF THE PROGRAM STARTS HERE
    directory_of_pdf_fail = filedialog.askopenfilename()
    outputpath = os.getcwd() + '//' + 'dump_for_jpg'
    pdf2jpg.convert_pdf2jpg(directory_of_pdf_fail, outputpath, pages="ALL")

    name_of_folder_with_jpg = os.listdir(outputpath)[0]
    folder_with_jpg = outputpath + '//' + name_of_folder_with_jpg
    # Presumably strips a 4-character suffix of the generated folder name
    # (e.g. "_dir") - TODO confirm against the installed pdf2jpg version.
    key = name_of_folder_with_jpg[:-4]

    if key in data_slovar:
        save_database()
        shutil.rmtree(folder_with_jpg, ignore_errors=False, onerror=None)
        messagebox.showinfo("Опа-па", "PDF фаил с таким именем уже присутствует в базе данных")
        return

    print("А ВЫХОДА ТО НЕ БЫЛО!!!")
    list_of_names_jpg = os.listdir(folder_with_jpg)
    print(list_of_names_jpg)
    print(folder_with_jpg)
    for jpg_name in list_of_names_jpg:
        input_path_of_jpg_to_ocr = folder_with_jpg + '//' + jpg_name
        string_of_image = pytesseract.image_to_string(
            Image.open(input_path_of_jpg_to_ocr), lang='rus')
        if key in data_slovar:
            data_slovar[key] = data_slovar[key] + ' ' + string_of_image
        else:
            data_slovar[key] = string_of_image

    # Remove the generated page images.
    shutil.rmtree(folder_with_jpg, ignore_errors=False, onerror=None)
    save_database()
    print("Представление окончено")
def getImageOfPdf(inputpath):
    """Convert every page of the PDF at ``inputpath`` and return the image directory.

    The image directory is whatever ``makedirectory`` returns for the PDF's
    parent folder.
    """
    from pdf2jpg import pdf2jpg

    parent_dir = os.path.split(inputpath)[0]
    outputimagePath = makedirectory(parent_dir)
    # Run the conversion and echo what pdf2jpg reports.
    conversion_info = pdf2jpg.convert_pdf2jpg(inputpath, outputimagePath, pages="ALL")
    print(conversion_info)
    return outputimagePath
def extract_multipage_data(key, image):
    """Extract text and a combined, base64-encoded PDF from a PDF blob.

    ``image`` (bytes) is dumped to ``<tmp>/<key>.blob`` and rendered to page
    JPEGs in ``<tmp>/<key>``. Every page goes through ``extract_data``, the
    per-page PDFs are merged into one, and both temporary locations are
    removed in all cases.

    Returns:
        tuple: (UTF-8 encoded space-joined text, base64 of the merged PDF).
    """
    tmp_root = tempfile.gettempdir()
    blob_path = os.path.join(tmp_root, f'{key}.blob')
    work_dir = os.path.join(tmp_root, f'{key}')
    os.mkdir(work_dir)
    try:
        # Persist the incoming bytes so pdf2jpg can read them from disk.
        with open(blob_path, "wb") as blob_file:
            blob_file.write(image)

        conversion = convert_pdf2jpg(blob_path, work_dir, dpi=300, pages="ALL")

        page_texts = []
        page_pdfs = []
        for index, jpg_path in enumerate(conversion[0]['output_jpgfiles']):
            text, pdf = extract_data(Image.open(jpg_path))
            page_texts.append(text)
            single_page_pdf = os.path.join(work_dir, f'{key}_{index}.pdf')
            with open(single_page_pdf, 'wb') as fout:
                fout.write(pdf)
            page_pdfs.append(single_page_pdf)
            # The rendered page image is no longer needed.
            os.remove(jpg_path)

        combine_text = ' '.join(page_texts).encode('utf-8')

        # Merge the per-page PDFs, deleting each one once it has been added.
        pdf_writer = PdfFileWriter()
        for single_page_pdf in page_pdfs:
            pdf_reader = PdfFileReader(single_page_pdf)
            for page_number in range(pdf_reader.getNumPages()):
                pdf_writer.addPage(pdf_reader.getPage(page_number))
            os.remove(single_page_pdf)

        merged_pdf = os.path.join(work_dir, f'{key}_all.pdf')
        with open(merged_pdf, 'wb') as fout:
            pdf_writer.write(fout)
        with open(merged_pdf, 'rb') as fin:
            data = base64.b64encode(fin.read())
        os.remove(merged_pdf)
        return combine_text, data
    finally:
        # Always drop the dumped blob and the working directory.
        if os.path.exists(blob_path):
            os.remove(blob_path)
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
def get_required_text_scanned_pdf(count, file, sentence, corpus):
    """Collect OCR'd passages of a scanned PDF that mention the corpus terms.

    For each page ``0 .. count-1`` the page is rendered to
    ``images/<file>/<page>_<file>.jpg`` and OCR'd. Whenever a line (except
    the last one) contains ``corpus[0]`` and ``corpus[1]``, that line plus up
    to two following lines are joined with spaces; the passage is appended
    to ``sentence`` if it also contains ``corpus[2]`` case-insensitively.

    Args:
        count: number of pages to scan.
        file: PDF path, also used to build the image path.
        sentence: list that receives the matching passages (mutated).
        corpus: sequence of at least three search strings.

    Returns:
        The same ``sentence`` list.
    """
    config = '-l eng --oem 1 --psm 3'  # identical for every page
    for i in range(count):
        pdf2jpg.convert_pdf2jpg(file, 'images/', pages=str(i))
        im = cv2.imread('images/' + file + '/' + str(i) + '_' + file + '.jpg',
                        cv2.IMREAD_COLOR)
        text = pytesseract.image_to_string(im, config=config)
        text_list = text.splitlines()
        for j in range(len(text_list) - 1):
            if corpus[0] in text_list[j] and corpus[1] in text_list[j]:
                # The slice yields three lines, or two near the end of the
                # page, without relying on an exception for the short case.
                s = " ".join(text_list[j:j + 3])
                if (corpus[0] in s and corpus[1] in s
                        and corpus[2].lower() in s.lower()):
                    sentence.append(s)
    return sentence
def watermark(original_pdf, output_pdf, watermark_pdf):
    """
    Take the original pdf and do the following:
    - merge it with watermark pdf into an intermediary pdf
    - export the intermediary pdf to jpeg
    - build another pdf file from jpegs -> watermark + readonly

    The intermediary PDF and the JPEG directory live under /tmp and are
    removed even when one of the steps raises.

    Refs:
    - https://stackabuse.com/working-with-pdfs-in-python-adding-images-and-watermarks
    - http://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2
    """
    print("starting worker: {0}".format(threading.get_ident()))

    tmp_pdf_name = 'intermediary_' + str(uuid.uuid1()) + '_.pdf'
    tmp_pdf_path = "{0}/{1}".format("/tmp", tmp_pdf_name)
    jpegs_dir = "{0}/{1}_{2}".format("/tmp", "jpegs", str(uuid.uuid1()))
    # Folder in which pdf2jpg places the page images of the intermediary PDF.
    images_dir = "{0}/{1}_{2}".format(jpegs_dir, tmp_pdf_name, "dir")

    try:
        watermark_page = PdfFileReader(watermark_pdf).getPage(0)

        pdf = PdfFileReader(original_pdf)
        pdf_writer = PdfFileWriter()
        for page in range(pdf.getNumPages()):
            pdf_page = pdf.getPage(page)
            pdf_page.mergePage(watermark_page)
            pdf_writer.addPage(pdf_page)

        with open(tmp_pdf_path, 'wb') as fh:
            pdf_writer.write(fh)

        pdf2jpg.convert_pdf2jpg(tmp_pdf_path, jpegs_dir, pages="ALL")

        images_list = [
            i for i in os.listdir(images_dir) if i.endswith(".jpg")
        ]
        sort_nicely(images_list)
        makePdf(output_pdf, images_list, images_dir)
    finally:
        # Clean up temporary artefacts on success and on failure alike.
        if os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)
        if os.path.exists(jpegs_dir):
            shutil.rmtree(jpegs_dir)
def pdf_to_jpg(inputpath, imagepath):
    """
    This function is used to convert PDF drawings into JPGs

    Arguments:
        inputpath -- folder searched for "*.pdf" files (joined with a
            backslash, so a Windows-style path is expected)
        imagepath -- output directory handed to pdf2jpg

    Any exception is logged through ``logging.error`` and re-raised.
    """
    # NOTE(review): indentation was reconstructed from a whitespace-collapsed
    # source; confirm which statements belong to the `if not __debug__:` body.
    try:
        global step_now
        for file in glob.glob(inputpath + "\\" + "*.pdf"):
            logging.info(' Converting ' + file + " to JPG")
            pdf2jpg.convert_pdf2jpg(file, imagepath, dpi=300, pages="ALL")
            # __debug__ is False only when Python runs with -O.
            if not __debug__:
                progress_bar_increment()
                label_1.configure(text = "Converting " + os.path.basename(file) + " to JPG")
                label_1.update()
    except:
        #global error_present
        # With the `global` line commented out this only binds a local name.
        error_present = 1
        logging.error(' Error in "pdf_to_jpg" function : ')
        error_message = PrintException()
        logging.error(error_message)
        raise
def home(request):
    """Create a first-page JPEG next to every upload, then render home.html.

    For each entry in ``media/CSE/2`` that has no ``<name minus 4 chars>.jpg``
    sibling yet, page "0" is converted with pdf2jpg, the produced image is
    moved next to the source file and the temporary ``<file>_dir`` folder is
    removed.
    """
    branch, year = 'CSE', '2'
    # The last six characters of this module's directory are cut off to
    # obtain the base path (presumably the project root - verify).
    base_path = os.path.abspath(os.path.dirname(__file__))[:-6]
    upload_dir = os.path.join(base_path, os.path.join('media', branch, year))

    for filename in os.listdir(upload_dir):
        path = os.path.join(upload_dir, filename)
        newpath = os.path.join(upload_dir, filename[:-4] + '.jpg')
        if os.path.exists(newpath):
            continue
        result = pdf2jpg.convert_pdf2jpg(path, upload_dir, pages="0")
        pages_dir = path + '_dir'
        oldpath = os.path.join(pages_dir, '0_' + filename + '.jpg')
        print(result)
        os.rename(oldpath, newpath)
        os.rmdir(pages_dir)
    return render(request, 'home.html')
def pdfTojpg(self, event):
    """Convert page "0" of every PDF in ``self.eachPDF`` and OCR the image.

    All PDFs are converted first; afterwards the first generated JPEG of each
    conversion is OCR'd and combined with ``self.getNum``, ``self.analyze``
    and ``self.checkTXT``. The collected strings are not returned or stored.
    """
    target_dir = self.outputpath
    conversions = [
        pdf2jpg.convert_pdf2jpg(str(pdf), target_dir, pages="0")
        for pdf in self.eachPDF
    ]

    summaries = []
    for position, conversion in enumerate(conversions):
        first_jpg = str(str(conversion[0]['output_jpgfiles'][0]))
        text = pytesseract.image_to_string(first_jpg)
        hwb_demo = self.getNum(self.eachPerTIF[position])
        summaries.append(
            str(self.analyze(hwb_demo, text)) + str(self.checkTXT(text)))
def __call__(self, jarvis, s):
    """Convert the PDF named by ``s`` into JPEG images.

    Args:
        jarvis: object providing ``say`` for user feedback.
        s: file path as received from Jarvis, normally with the '.' before
            the extension removed.

    The images are written into a folder named like the PDF without its
    extension.
    """
    if not s:
        jarvis.say("please enter file path after calling the plugin")
    elif "pdf" not in s:
        jarvis.say("Your file must be a .pdf file")
    else:
        # We have to add the '.' back because the Jarvis API removes it.
        # Only the trailing extension is restored, so a "pdf" elsewhere in
        # the path (e.g. a folder called "pdfs") is left untouched.
        if s.endswith('.pdf'):
            source_path = s
        elif s.endswith('pdf'):
            source_path = s[:-len('pdf')] + '.pdf'
        else:
            # "pdf" is not at the end: keep the previous behaviour.
            source_path = s.replace('pdf', '.' + 'pdf')

        if source_path.endswith('.pdf'):
            dest_path = source_path[:-len('.pdf')]
        else:
            dest_path = source_path.replace('.pdf', '')

        jarvis.say(source_path)
        jarvis.say(dest_path)
        pdf2jpg.convert_pdf2jpg(source_path, dest_path, pages="ALL")
        jarvis.say("file successfully converted")
def do_ocr(key, blob, root_dir):
    """OCR a blob that is either an image or a PDF.

    ``blob`` is written to ``<root_dir>/<key>.blob``. If ``check_if_image``
    accepts that file it is processed directly, otherwise it is rendered to
    page JPEGs in ``<root_dir>/<key>``. The dumped file and the working
    directory are removed in all cases.

    Returns:
        tuple: (UTF-8 encoded space-joined text, UTF-8 encoded space-joined
        hOCR output).
    """
    blob_path = os.path.join(root_dir, f'{key}.blob')
    work_dir = os.path.join(root_dir, f'{key}')
    os.mkdir(work_dir)
    try:
        with open(blob_path, "wb") as handle:
            handle.write(blob)

        if check_if_image(blob_path):
            # Already an image: OCR the dumped file itself.
            image_paths = [blob_path]
        else:
            conversion = convert_pdf2jpg(blob_path, work_dir, dpi=300, pages="ALL")
            image_paths = conversion[0]['output_jpgfiles']

        texts = []
        hocrs = []
        for image_path in image_paths:
            text, hocr = extract_data_ocr(Image.open(image_path))
            texts.append(text)
            hocrs.append(hocr)

        combine_text = ' '.join(texts).encode('utf-8')
        decoded_hocr = [chunk.decode('utf-8') for chunk in hocrs]
        combine_hocr = ' '.join(decoded_hocr).encode('utf-8')
        return combine_text, combine_hocr
    finally:
        # Remove the dumped blob and any rendered pages.
        if os.path.exists(blob_path):
            os.remove(blob_path)
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
# Minimal pdf2jpg example: convert every page of "test.pdf" and print the
# value returned by convert_pdf2jpg.
from pdf2jpg import pdf2jpg

inputpath = r"test.pdf"  # PDF to convert
outputpath = r"convertedpdf"  # output directory handed to pdf2jpg
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL")
print(result)
"Please enter the full path of the pdf document \n(Eg. " + r"C:\Users\Eugene\Desktop\CSC2014 Assignment Aug 2019\Test Files\Test PDF Documents\test.pdf) :" + "\n\n") sourcepath = r"" + userpdf #Prompts user to enter full path of the destination folder for the converted images to be saved into. userfolder = input( "Please enter the full path of the folder for images to be saved to \n(Eg. " + r"C:\Users\Eugene\Desktop\CSC2014 Assignment Aug 2019\Test Files\Test PDF Documents) :" + "\n\n") destinationpath = r"" + userfolder #Convert all pages of user's pdf into separate images saved in jpg format. convert = pdf2jpg.convert_pdf2jpg(sourcepath, destinationpath, pages="ALL") #If the entered pdf document cannot be found, output_jpgfiles list in dictionary at index 0 of convert list will be empty. An exception is raised if this happens. if len(convert[0]['output_jpgfiles']) == 0: raise Exception #Get the pdf file name from user's first input pdfsplit = sourcepath.split("\\") pdfname = pdfsplit[len(pdfsplit) - 1] #Read all converted images in jpg format in the destination folder and save all of it into the pages list pages = [ cv2.imread(image) for image in glob.glob(r"" + destinationpath + "\\" + pdfname + "_dir\\*.jpg")
def pdf_ocr(self):
    """OCR the PDF files.

    Two passes are made:

    1. Every PDF in ``self.eachpdfinv`` is converted to JPEGs and up to three
       page images are OCR'd looking for one of the trade terms in
       ``termlist``; ``[s_num, term]`` is appended to ``self.inv_list``.
    2. Every PDF in ``self.eachPDF`` is converted and up to three page images
       are OCR'd; the results of ``self.checkTXT`` plus a package count and
       weight parsed with regular expressions are appended to
       ``self.pdflist``, and ``self.pdfstring`` receives ``str(b)``.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm the nesting against the real file.
    """
    self.pdfstring = ''
    self.pdflist = []
    termlist = ['EXW', 'CIP', 'CIF', 'FCA', 'FOB', 'DDU', 'DAP']
    # NOTE(review): this local is never used; results go to self.inv_list,
    # which must already exist on the instance.
    inv_list = []
    outputpath = self.inputpath
    for k in self.eachpdfinv:
        inputpath = str(k)
        # Find the entry of self.newdir whose first digit run occurs in the
        # PDF name. NOTE(review): s_num stays unbound (or stale from the
        # previous PDF) when nothing matches.
        for n in self.newdir:
            # NOTE(review): non-raw pattern; '\d' is an invalid string escape.
            s_part = re.findall('\d+', n)[0]
            s_tr = 's' + s_part
            if s_part in k:
                s_num = s_tr
                break
        invres = (pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL"))
        invadd = invres[0]['output_jpgfiles']
        term = 'none'
        limit = 0
        for kkk in invadd:
            text = pytesseract.image_to_string(kkk)
            # First entry of termlist found in the page text; a later page
            # overwrites a term found on an earlier page.
            for a_min in termlist:
                if a_min in text:
                    term = a_min
                    break
            # Stop after the third page image.
            if limit >= 2:
                break
            limit += 1
        ll = []
        ll.append(s_num)
        ll.append(term)
        self.inv_list.append(ll)
    a = 0  # index into self.eachPerName, advanced once per PDF
    b = []  # one summary string per PDF
    count_ = 0
    item = 0
    if self.eachPDF != []:
        outputpath = self.inputpath
        result = []
        for i in self.eachPDF:
            inputpath = str(i)
            pack = 'none'
            weight = 'none'
            # Same lookup as above. NOTE(review): s_number may be unbound or
            # stale when no entry matches.
            for n in self.newdir:
                num_part = re.findall(r'\d+', n)[0]
                s_tr = 's' + num_part
                if num_part in i:
                    s_number = s_tr
                    break
            invres = (pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL"))
            invadd = invres[0]['output_jpgfiles']
            count_ += 1
            hwb_real = self.analyzeNum(self.eachPerName[a])
            forwardername = self.checkforwarder(self.eachPerName[a])
            textChecked = []
            limit = 0
            for n in invadd:
                k = n
                text = pytesseract.image_to_string(k)
                # This rebinds the outer loop variable `i`; the outer loop is
                # unaffected because `i` is not read again in this iteration.
                for i in self.checkTXT(text):
                    textChecked.append(i)
                # "<digits> <digits>k": first number is the package count,
                # second the weight.
                if re.findall(r"\d+\s\d+k", text, re.I) != []:
                    p_w = re.findall(r"\d+\s\d+k", text, re.I)[0]
                    pack = str(re.findall(r"\d+", p_w, re.I)[0])
                    weight = str(re.findall(r"\d+", p_w, re.I)[1])
                # "<digits> <digits><3-4 chars>K": a third digit run, when
                # present, is appended after a '.' as the fractional part.
                elif re.findall(r"\d+\s\d+.{3,4}K", text, re.I) != []:
                    p_w = re.findall(r"\d+\s\d+.{3,4}K", text, re.I)[0]
                    pack = str(re.findall(r"\d+", p_w, re.I)[0])
                    weight = ''
                    if len(re.findall(r"\d+", p_w, re.I)) == 2:
                        weight += str(re.findall(r"\d+", p_w, re.I)[1])
                    else:
                        weight += str(re.findall(r"\d+", p_w, re.I)[1])
                        weight += '.'
                        weight += str(re.findall(r"\d+", p_w, re.I)[2])
                # Stop after the third page image.
                if limit >= 2:
                    break
                limit += 1
            textChecked.append(forwardername)
            b.append(s_number + ": " + str(hwb_real) + str(textChecked))
            self.pdflist.append(
                [s_number, hwb_real, textChecked, pack, weight])
            item += 1
            a += 1
            print('PDF已完成 ' + str(item) + ' 单' + '\n运单号为: ' +
                  self.eachPerName[a - 1])
        self.pdfstring = str(b)
    else:
        pass
# -*- coding: utf-8 -*- """ Created on Wed Nov 21 15:51:00 2018 @author: Murali """ from pdf2jpg import pdf2jpg result = pdf2jpg.convert_pdf2jpg('D:\\Sargunan\\Table\\001.pdf', 'c:\\temp\\p') print(result)
def main():
    """Convert the pages selected by "1, 2" of a fixed PDF and print the result."""
    source_pdf = r"C:\302015_c.pdf"
    target_dir = r"C:\Users\azkb075\Downloads"
    # Convert only the listed pages.
    conversion_info = pdf2jpg.convert_pdf2jpg(source_pdf, target_dir, pages="1, 2")
    print(conversion_info)
def pdf_to_images(filename, inputpath, outputpath):
    """Convert every page of ``inputpath + filename`` into JPEGs under ``outputpath``.

    The directory and file name are concatenated as-is, without inserting a
    path separator.
    """
    pdf_path = inputpath + filename
    pdf2jpg.convert_pdf2jpg(pdf_path, outputpath, pages="ALL")
# Demo script: convert PNB_BS.pdf to JPEGs, first with pdf2jpg and then with
# pdf2image.
import glob
import os

from pdf2image import convert_from_path
from pdf2jpg import pdf2jpg

inputpath = r"C:\New folder\New Vision Soft\OCR\PDF_JPG\PNB_BS.pdf"
outputpath = r"C:\New folder\New Vision Soft\OCR\PDF_JPG"

# To convert single page
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="1")
print(result)

# To convert multiple pages
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="1,0,3")
print(result)

# to convert all pages
result = pdf2jpg.convert_pdf2jpg(inputpath, outputpath, dpi=300, pages="ALL")
print(result)

pages = convert_from_path('PNB_BS.pdf', 500)

pdf_dir = glob.glob(r'C:\New folder\New Vision Soft\OCR\PDF_JPG\PNB_BS.pdf'
                    )  # your pdf folder path
img_dir = r'C:\New folder\New Vision Soft\OCR\PDF_JPG'

for pdf_ in pdf_dir:
    pages = convert_from_path(pdf_, 500)
    stem = os.path.splitext(os.path.basename(pdf_))[0]
    # One file per page, placed inside img_dir. Previously the name was glued
    # onto img_dir without a separator and every page overwrote the last.
    for page_number, page in enumerate(pages):
        page.save(
            os.path.join(img_dir, "{}_{}.jpg".format(stem, page_number)),
            'JPEG')
def extractJpegFromPng(inputFolder, outputFolder):
    """Convert all pages of a PDF to JPEG images with pdf2jpg.

    NOTE(review): despite the "Png"/"Folder" wording in the names,
    ``inputFolder`` is passed straight through as pdf2jpg's input path and
    ``outputFolder`` as its output path - confirm what callers pass.
    """
    pdf2jpg.convert_pdf2jpg(inputFolder, outputFolder, pages="ALL")
def index():
    """Generate simulated polling-station reports (PV), OCR them and post them.

    Steps, all driven by the request's query parameters ``number_party``,
    ``bureau_number``, ``enrolled_number`` and ``coalition_mode``:

    1. Store the parameters on the ``variable`` module.
    2. Recreate the PV_IMAGE and PV_PDF folders next to this file.
    3. Build the polling stations and their reference, coalition and
       "Elecam" results through ``Fonction``.
    4. Write one PDF per report with FPDF.
    5. Convert page "0" of every PDF to a JPEG and flatten the folders.
    6. OCR every image, write results.json and POST each report to the
       local API.

    Returns ``str(status.HTTP_200_OK)`` on success. On failure the return
    value is a 2-tuple ``(str(status.HTTP_400_BAD_REQUEST), message)``.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; confirm the nesting against the real file.
    """
    try:
        tab = ['RDPC', 'MRC', 'SDF', 'FPD', 'ADD', 'UDC', 'UNIVERS', 'PURS', 'MCNC', 'ANDP', 'CPP', 'SCNC', 'MP']
        variable.PARTIES_LIST = tab[:int(request.args.get('number_party'))]
        print("Je récupère les partis ")
        print(variable.PARTIES_LIST)
        variable.BUREAU_VOTES_NUMBER = int(request.args.get('bureau_number'))
        variable.ENROLLED_PEOPLE_NUMBER = int(request.args.get('enrolled_number'))
        variable.COALITION_MODE = int(request.args.get('coalition_mode'))
    except Exception:
        return str(status.HTTP_400_BAD_REQUEST), "The data is not in the real format"
    try:
        import shutil
        # Start from empty PV_IMAGE and PV_PDF folders.
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))
        os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_PDF'))
        os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF'))
        import random
        from PV import PV
        from variable import BUREAU_VOTES_NUMBER, ENROLLED_PEOPLE_NUMBER, PARTIES_LIST, COALITION_MODE, getCoalitionName
        from BureauVote import BureauVote
        from Fonction import Fonction
        bureauVotes_List = []
        print(PARTIES_LIST)
        repartitions = Fonction.generateRandomNumbers(BUREAU_VOTES_NUMBER, ENROLLED_PEOPLE_NUMBER)
        # Distribute the number of registered voters among the polling stations.
        for i in range(BUREAU_VOTES_NUMBER):
            bureauVotes_List.append(BureauVote('Bureau_de_Vote_' + str(i + 1), repartitions[i]))
        # Record the real results of every party in every polling station.
        for bureau in bureauVotes_List:
            pv = PV("Reference")
            pv.party_results = Fonction.generateRandomNumbers(len(PARTIES_LIST), bureau.enrolled_persons)
            bureau.results = pv
            # bureau.results = [(PARTIES_LIST[i], votes) for i, votes in
            #                   enumerate(Fonction.generateRandomNumbers(len(PARTIES_LIST), bureau.enrolled_persons))]
        # From here on the reports are generated according to the fraud mode.
        # print(Fonction.generatePVForBureauWithCoalitionMode(bureauVotes_List[0].results, COALITION_MODE, 2, [0, 6, 2]))
        # Generate all reports, possibly with a coalition.
        number_of_coalised_party = random.randint(1, len(PARTIES_LIST) - 1)
        coalised_group = Fonction.generateTabRandomElementsDiff(number_of_coalised_party, len(PARTIES_LIST))
        party_favorite = random.choice(coalised_group)
        for bureau in bureauVotes_List:
            bureau.all_pv = Fonction.generatePVForBureauWithCoalitionMode(bureau.results.party_results, COALITION_MODE, party_favorite, coalised_group)
        for bureau in bureauVotes_List:
            # With weight 0.65 the Elecam report gets freshly generated
            # numbers; otherwise it reuses the reference results.
            t = random.choices([0, 1], [0.35, 0.65])[0]
            if t:
                pv = PV("Elecam")
                pv.party_results = Fonction.generateRandomNumbersWithPref(len(PARTIES_LIST), bureau.enrolled_persons, 0)
                bureau.elecam_pv = pv
            else:
                pv = PV("Elecam")
                pv.party_results = bureau.results.party_results
                bureau.elecam_pv = pv
        for bureau in bureauVotes_List:
            print(bureau)
            for item in bureau.all_pv:
                print(item)
        # Create all output folders.
        #import os
        for party in PARTIES_LIST:
            if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', party)):
                os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', party))
        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'REAL_RESULT')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'REAL_RESULT'))
        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'ELECAM')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_PDF', 'ELECAM'))
        # Generating PDF files
        # NOTE(review): pdf.output below uses paths relative to the current
        # working directory, while the folders above are created next to
        # this file - confirm both are the same location at runtime.
        from fpdf import FPDF
        width_cell, height_cell = 150, 40
        for bureau in bureauVotes_List:
            # The loop variable rebinds the local name PV that was imported
            # above as a class; the class is not used again after this point.
            for PV in bureau.all_pv:
                pdf = FPDF(orientation='P', unit='pt', format='A4')
                pdf.add_page()
                pdf.set_font("Arial", size=16)
                pdf.multi_cell(0, 20, "PV_" + bureau.name, 0)
                pdf.set_top_margin(20)
                pdf.set_font("Arial", size=14)
                pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
                pdf.set_top_margin(20)
                pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
                pdf.set_top_margin(20)
                pdf.set_fill_color(193, 229, 252)
                # Results table: party name / number of votes.
                pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
                pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
                for index, votes in enumerate(PV.party_results):
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.multi_cell(0, 20, "", 0)
                pdf.set_top_margin(20)
                pdf.set_fill_color(193, 229, 252)
                # Signature table: party name / scrutineer label.
                pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
                pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
                for index, votes in enumerate(PV.party_results):
                    pdf.set_font("Arial", size=14)
                    pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                    pdf.set_font("Arial", size=12)
                    pdf.cell(2 * width_cell, height_cell, "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)
                pdf.output('PV_PDF/' + str(PV.party_name) + '/PV_' + str(bureau.name) + '.pdf')
            # Elecam report for this polling station.
            pdf = FPDF(orientation='P', unit='pt', format='A4')
            pdf.add_page()
            pdf.set_font("Arial", size=16)
            pdf.multi_cell(0, 20, "PV_ELECAM_" + bureau.name, 0)
            pdf.set_top_margin(20)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
            pdf.set_top_margin(20)
            pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
            pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.elecam_pv.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "", 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
            pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.elecam_pv.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=12)
                pdf.cell(2 * width_cell, height_cell, "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)
            pdf.output('PV_PDF/ELECAM/PV_Elecam_' + str(bureau.name) + '.pdf')
            # Reference (real result) report for this polling station.
            pdf = FPDF(orientation='P', unit='pt', format='A4')
            pdf.add_page()
            pdf.set_font("Arial", size=16)
            pdf.multi_cell(0, 20, "PV_GAGNANT_" + bureau.name, 0)
            pdf.set_top_margin(20)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "Nombre_de_Votants : " + str(bureau.enrolled_persons), 0)
            pdf.set_top_margin(20)
            pdf.multi_cell(0, 20, "Type_de_Coalition : " + str(getCoalitionName(COALITION_MODE)), 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Parti', 1, 0, 'C', 1)
            pdf.cell(width_cell, height_cell, 'Nombre_de_Voix', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.results.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(votes), 1, 1, 'L', 0)
            pdf.set_font("Arial", size=14)
            pdf.multi_cell(0, 20, "", 0)
            pdf.set_top_margin(20)
            pdf.set_fill_color(193, 229, 252)
            pdf.cell(width_cell, height_cell, 'Nom_du_Representant', 1, 0, 'C', 1)
            pdf.cell(2 * width_cell, height_cell, 'Signatures', 1, 1, 'C', 1)
            for index, votes in enumerate(bureau.results.party_results):
                pdf.set_font("Arial", size=14)
                pdf.cell(width_cell, height_cell, str(PARTIES_LIST[index]), 1, 0, 'L', 0)
                pdf.set_font("Arial", size=12)
                pdf.cell(2 * width_cell, height_cell, "Scrutateur_" + str(PARTIES_LIST[index]) + "_" + str(bureau.name), 1, 1, 'L', 0)
            pdf.output('PV_PDF/REAL_RESULT/PV_Gagnant_' + str(bureau.name) + '.pdf')
        #import os
        from pdf2jpg import pdf2jpg
        from variable import PARTIES_LIST
        #import shutil
        # PV_IMAGE is wiped a second time; the per-party makedirs calls below
        # recreate it.
        if os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            shutil.rmtree(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))
        for party in PARTIES_LIST:
            if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', party)):
                os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', party))
        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT'))
        if not os.path.exists(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM')):
            os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM'))
        # Convert the first page of every generated PDF into PV_IMAGE/<folder>.
        for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF')):
            for file in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder)):
                # pages = convert_from_path(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file), 500)
                # pages[0].save(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, file), 'JPEG')
                result = pdf2jpg.convert_pdf2jpg(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file),
                                                 os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder), pages="0")
                print(result)
        print('Delete that folders')
        #import shutil
        # Move the single image out of each per-PDF sub-folder and delete the
        # sub-folder.
        for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')):
            for small_folders in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder)):
                file = os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders))[0]
                tab = file.split('.')
                # Drop the first two characters of the base name (presumably
                # the "0_" page prefix - verify) and keep the last extension.
                new_file_name = tab[0][2:] + '.' + tab[-1]
                shutil.move(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders, file),
                            os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, new_file_name))
                os.rmdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders))
        import json
        #import os
        import requests
        from PIL import Image
        from pytesseract import image_to_string
        from variable import PARTIES_LIST

        def removeNumbers(string_numbers):
            # Scans from the end for the first non-digit character and returns
            # the trailing run of digits. When the last character is already
            # a non-digit the slice is [-0:], i.e. the whole string.
            for i, el in enumerate(string_numbers):
                if not string_numbers[-int(i) - 1].isdigit():
                    return string_numbers[-int(i - 1) - 1:]
                    break
            return string_numbers

        with open('results.json', 'w+') as f:
            # NOTE(review): each record is followed by ',' and the list is
            # closed with ']', so a non-empty file ends in ",]" and is not
            # valid JSON.
            f.write('[')
            for bureau_index, folder in enumerate(os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE'))):
                for pv_index, image in enumerate(os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder))):
                    # Non-blank OCR lines of the report image.
                    img = [str(elt) for elt in image_to_string(
                        Image.open(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, image))).split(
                        '\n') if not elt.replace(" ", "") == ""]
                    # Undo common OCR confusions (O/0, I/1) in the header lines.
                    bureau_de_Vote = img[0].replace("O", '0').replace('I', '1').replace(" ", "").split("_")[-1]
                    enrolled_number = img[1].split(':')[-1].replace("I", "1").replace(" ", "")
                    #print(bureau_de_Vote)
                    #print(enrolled_number)
                    #print(img)
                    #print(folder + image)
                    # Lines 4 .. 4+len(PARTIES_LIST) are read as the per-party rows.
                    party = [int(removeNumbers(elt.replace(" ", "").replace("|", "1").replace("O", "0"))) for elt in
                             img[4:len(PARTIES_LIST) + 4]]
                    #print(party)
                    # NOTE(review): bureau_index enumerates the folders of
                    # PV_IMAGE, so ids can collide once a folder holds more
                    # than 11 images.
                    global_id = 11 * bureau_index + pv_index
                    #print(folder)
                    if folder == "ELECAM":
                        real_id, party_name = "elecam", -1
                    elif folder == "REAL_RESULT":
                        real_id, party_name = "bon", -2
                    else:
                        real_id, party_name = global_id, PARTIES_LIST.index(folder)
                    scrutateur_format = dict(scrutineerId=global_id, scrutineerName=real_id, partyNumber=party_name)
                    pv_format = dict(pvId=global_id, pollingStation=removeNumbers(bureau_de_Vote),
                                     numberOfSuscribers=enrolled_number, numberOfVoters=enrolled_number,
                                     voices=party, partyNumber=party_name,
                                     scrutineer="resource:org.cloud.elections.Scrutineer#{}".format(global_id),
                                     scrutineerName=real_id)
                    data_format = dict(id=global_id, bureau=bureau_de_Vote, nbreInscrits=enrolled_number,
                                       nbreVotants=enrolled_number, voix=party, idScrutateur=global_id,
                                       nomScrutateur=real_id, parti=party_name)
                    json.dump(data_format, f)
                    f.write(',')
                    # The responses of both POST requests are not inspected.
                    r = requests.post('http://localhost:3000/api/Scrutineer', json=scrutateur_format)
                    r = requests.post('http://localhost:3000/api/Pv', json=pv_format)
            f.write(']')
        #print(variable.PARTIES_LIST)
        #print(variable.BUREAU_VOTES_NUMBER)
        #print(variable.ENROLLED_PEOPLE_NUMBER)
        #print(variable.COALITION_MODE)
        return str(status.HTTP_200_OK)
    except Exception as e:
        return str(status.HTTP_400_BAD_REQUEST), str(e)
# Convert every page of "ou1.pdf" at 300 dpi; an empty string is passed as
# the output path.
from pdf2jpg import pdf2jpg

pdf2jpg.convert_pdf2jpg("ou1.pdf", "", dpi=300, pages="ALL")
def toImages(filename, outPath=local_dir + r"\imgs"):
    """Convert every page of ``filename`` to JPEGs under ``outPath`` and print the result."""
    from pdf2jpg import pdf2jpg

    conversion_info = pdf2jpg.convert_pdf2jpg(filename, outPath, pages="ALL")
    print(conversion_info)
def askdirectory():
    """Ask for a folder, OCR every PDF in it (Russian) into data_slovar.json.

    Each ``*.pdf`` directly inside the chosen folder is rendered to JPEGs
    under ``<cwd>//dump_for_jpg``. The text of every page image is appended
    (space separated) to the entry ``<image folder name>[:-4]`` and the
    image folder is removed afterwards.

    The JSON file is rewritten only after all PDFs have been processed, so a
    failure during conversion or OCR no longer leaves it truncated.
    """
    db_path = "data_slovar.json"
    try:
        with open(db_path, 'r', encoding='UTF-8') as db_file:
            data_slovar = json.load(db_file)
        print("json был , скачаны файлы, json обнулен и создан заново")
    except FileNotFoundError:
        data_slovar = {}
        print("json не было ,json создан заново")

    of = filedialog.askdirectory()
    list_of_folder = os.listdir(of)
    try:
        os.mkdir("dump_for_jpg")
    except OSError:
        print("Директория есть?")

    list_of_pdf_files = [name for name in list_of_folder if name.endswith(".pdf")]
    print(list_of_pdf_files)
    print(len(list_of_pdf_files))

    outputpath = os.getcwd() + '//' + 'dump_for_jpg'
    for pdf_name in list_of_pdf_files:
        # Every '/' is doubled, exactly as the character-by-character loop did.
        alfa_inputpath = (of + "/" + pdf_name).replace('/', '//')
        pdf2jpg.convert_pdf2jpg(alfa_inputpath, outputpath, pages="ALL")

        # OCR and then delete every image folder currently in the dump.
        for name_of_folder_with_jpg in os.listdir(outputpath):
            path_remove = outputpath + '//' + name_of_folder_with_jpg
            # Presumably strips a 4-character suffix of the generated folder
            # name (e.g. "_dir") - TODO confirm.
            key = name_of_folder_with_jpg[:-4]
            for jpg_name in os.listdir(path_remove):
                dir_of_jpg = path_remove + '//' + jpg_name
                string_of_image = pytesseract.image_to_string(
                    Image.open(dir_of_jpg), lang='rus')
                if key in data_slovar:
                    data_slovar[key] = data_slovar[key] + ' ' + string_of_image
                else:
                    data_slovar[key] = string_of_image
            # Remove the processed page images.
            shutil.rmtree(path_remove, ignore_errors=False, onerror=None)

    # Persist the dictionary in one short, always-closed write.
    with open(db_path, 'w', encoding='UTF-8') as db_file:
        json.dump(data_slovar, db_file)
    print("Представление окончено")
def pdf_to_images(filename, inputpath):
    """Convert every page of ``inputpath/filename`` into JPEGs written under ``inputpath``."""
    pdf_path = os.path.join(inputpath, filename)
    # The source folder doubles as the output folder.
    pdf2jpg.convert_pdf2jpg(pdf_path, inputpath, pages="ALL")
if not os.path.exists( os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT')): os.makedirs( os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'REAL_RESULT')) if not os.path.exists( os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM')): os.makedirs(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', 'ELECAM')) for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_PDF')): for file in os.listdir( os.path.join(os.path.dirname(__file__), 'PV_PDF', folder)): # pages = convert_from_path(os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file), 500) # pages[0].save(os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, file), 'JPEG') result = pdf2jpg.convert_pdf2jpg( os.path.join(os.path.dirname(__file__), 'PV_PDF', folder, file), os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder), pages="0") print(result) print('Delete that folders') import shutil for folder in os.listdir(os.path.join(os.path.dirname(__file__), 'PV_IMAGE')): for small_folders in os.listdir( os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder)): file = os.listdir( os.path.join(os.path.dirname(__file__), 'PV_IMAGE', folder, small_folders))[0] tab = file.split('.') new_file_name = tab[0][2:] + '.' + tab[-1]