def splitPdfOnePageEach(file, wayToSaveFiles, sequential=0):
    """Split a PDF into one single-page PDF file per page.

    The pages are written to ``<wayToSaveFiles>/pdfs/<name>-<sequential>/``
    as ``1.pdf``, ``2.pdf``, ... where ``<name>`` is whatever
    ``funcoesUteis.getOnlyNameFile`` returns for the base name of ``file``
    (presumably the file name without its extension -- confirm against that
    helper).

    The function is best-effort: any exception (unreadable PDF, output
    directory already existing, I/O error, ...) is logged as a warning and
    the function returns normally.

    Args:
        file: Path of the PDF to split.
        wayToSaveFiles: Base directory under which the ``pdfs`` folder is
            created.
        sequential: Suffix appended to the output directory name; callers can
            vary it to keep output directories distinct.

    Returns:
        None.
    """
    # Local import so this block does not depend on the file's import header.
    import logging

    try:
        nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
        nameDirectoryToSave = f"{nameFile}-{sequential}"
        wayBaseToSaveFile = os.path.join(wayToSaveFiles, 'pdfs', nameDirectoryToSave)
        # Raises if the directory already exists; that case is reported by the
        # handler below and nothing is written.
        os.makedirs(wayBaseToSaveFile)

        with open(file, 'rb') as filePdf:
            pdfReader = PyPDF2.PdfFileReader(filePdf)
            for numberPage in range(pdfReader.getNumPages()):
                pdfWriter = PyPDF2.PdfFileWriter()
                pdfWriter.addPage(pdfReader.getPage(numberPage))
                # os.path.join instead of a hard-coded backslash, so the page
                # lands inside the output directory on every platform.
                wayPage = os.path.join(wayBaseToSaveFile, f'{numberPage + 1}.pdf')
                with open(wayPage, 'wb') as newPdfPerPage:
                    pdfWriter.write(newPdfPerPage)
    except Exception as e:
        # Still best-effort (never raises), but the failure is no longer
        # discarded silently.
        logging.getLogger(__name__).warning(
            'Could not split PDF "%s": %s', file, e
        )
def ImageToText(file, wayToSaveFile):
    """Run OCR on an image and save the recognized text as a UTF-8 file.

    The text is written to ``<wayToSaveFile>/<name>.txt`` where ``<name>`` is
    whatever ``funcoesUteis.getOnlyNameFile`` returns for the base name of
    ``file``.

    Args:
        file: Path of the image to read.
        wayToSaveFile: Directory in which the ``.txt`` file is written.

    Returns:
        None.

    Raises:
        Any exception raised while opening the image, running OCR or writing
        the output file is propagated to the caller.
    """
    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"

    # OCR runs before the output file is opened, so a failed recognition does
    # not leave behind an empty or truncated .txt file. The context manager
    # also releases the image handle.
    with Image.open(file) as image:
        content = ocr.image_to_string(image, lang='por')

    # The context manager guarantees the handle is closed even if the write
    # fails.
    with open(wayToSave, "w", encoding='utf-8') as textFile:
        textFile.write(content)
def PDFImgToText(file, wayToSaveFile):
    """Convert a PDF to a JPEG with ImageMagick, then OCR it to text.

    The image is written to ``<wayToSaveFile>/<name>.jpg`` by the ``magick``
    command-line tool at 300 dpi and then handed to ``ImageToText``, which
    writes the text file into the same directory.

    NOTE(review): ImageMagick typically writes numbered output files for
    multi-page input, in which case ``<name>.jpg`` would not exist -- confirm
    callers only pass single-page PDFs.

    Args:
        file: Path of the PDF to convert.
        wayToSaveFile: Directory receiving the intermediate ``.jpg`` and the
            resulting ``.txt``.

    Returns:
        None.
    """
    # Local import so this block does not depend on the file's import header.
    import subprocess

    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.jpg"

    # Argument list instead of a shell string: paths containing quotes, spaces
    # or shell metacharacters are passed to magick verbatim rather than being
    # interpreted by the shell. The exit status is deliberately not checked,
    # matching the previous behavior of proceeding to the OCR step regardless.
    subprocess.run(['magick', '-density', '300', file, wayToSave])

    ImageToText(wayToSave, wayToSaveFile)
def PDFToText(file, wayToSaveFile, mode="simple"):
    """Extract the text of a PDF into ``<wayToSaveFile>/<name>.txt``.

    The PDF is first read with ``slate`` to find out whether it carries any
    text. If ``funcoesUteis.treatTextField`` reduces that text to an empty
    string, the PDF is sent through the OCR path (``PDFImgToText``);
    otherwise the bundled ``pdftotext64.exe`` under ``<fileDir>/exe`` is run
    to produce the text file.

    Any exception raised along the way is caught and reported with ``print``;
    the function never raises from that section.

    Args:
        file: Path of the PDF to convert.
        wayToSaveFile: Directory in which the ``.txt`` file is written.
        mode: Name of the pdftotext option to pass, without the leading dash
            (default ``"simple"``).

    Returns:
        None.
    """
    # Local import so this block does not depend on the file's import header.
    import subprocess

    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"

    try:
        with open(file, 'rb') as filePdf:
            documents = slate.PDF(filePdf)
            # Single join instead of repeated += concatenation.
            textPdf = "".join(documents)

        if funcoesUteis.treatTextField(textPdf) == "":
            # No extractable text: fall back to OCR.
            PDFImgToText(file, wayToSaveFile)
        else:
            wayExe = os.path.join(fileDir, 'exe', 'pdftotext64.exe')
            # Argument list instead of a shell string: an install directory
            # containing spaces no longer splits the executable path, and
            # file names are never interpreted by the shell.
            subprocess.run([wayExe, f'-{mode}', file, wayToSave])
    except Exception as ex:
        print(f"Nao foi possivel transformar o arquivo \"{file}\". O erro é: {str(ex)}")