def convert(self, filename):
    '''Convert PDF file from sample path to output path.

    Reads ``<sample_path>/<filename>.pdf`` and writes
    ``<output_path>/<filename>.docx``; ``sample_path`` and ``output_path``
    are names resolved from the enclosing module scope.

    Args:
        filename: Base name of the file, without extension.
    '''
    source_pdf_file = os.path.join(sample_path, f'{filename}.pdf')
    docx_file = os.path.join(output_path, f'{filename}.docx')

    cv = Converter(source_pdf_file)
    try:
        cv.convert(docx_file)
    finally:
        # release the converter even when the conversion raises
        cv.close()
def init_test(self, filename):
    '''Prepare the two pages a test compares: parsed and benchmark.

    ``self.test`` receives the first page obtained by converting
    ``<sample_dir>/<filename>.pdf``; ``self.sample`` receives the first page
    after deserializing ``<layout_dir>/<filename>.json``.

    Args:
        filename: Base name of the sample, without extension.

    Returns:
        This object itself.
    '''
    sample_pdf = os.path.join(self.sample_dir, f'{filename}.pdf')
    target_docx = os.path.join(self.output_dir, f'{filename}.docx')

    # layout under test: convert the first page only and keep it
    parser = Converter(sample_pdf)
    parser.convert(target_docx, pages=[0])
    self.test = parser[0] # type: Page
    parser.close()

    # benchmark layout: rebuild it from the stored json description
    restorer = Converter(sample_pdf)
    stored_layout = os.path.join(self.layout_dir, f'{filename}.json')
    restorer.deserialize(stored_layout)
    self.sample = restorer[0] # type: Page

    return self
def init_test(self, filename):
    '''Prepare the two layouts a test compares: benchmark and parsed.

    ``self.sample`` is restored from ``<layout_dir>/<filename>.json``;
    ``self.test`` is the first layout produced by converting
    ``<sample_dir>/<filename>.pdf``.

    Args:
        filename: Base name of the sample, without extension.

    Returns:
        This object itself.
    '''
    # benchmark layout: rebuild it from the stored json description
    stored_layout = os.path.join(self.layout_dir, f'{filename}.json')
    with open(stored_layout, 'r') as fp:
        layout_dict = json.load(fp)
    self.sample = Layout().restore(layout_dict)

    # layout under test: make the docx for the first page only
    sample_pdf = os.path.join(self.sample_dir, f'{filename}.pdf')
    target_docx = os.path.join(self.output_dir, f'{filename}.docx')
    converter = Converter(sample_pdf)
    parsed = converter.make_docx(target_docx, pages=[0])
    self.test = parsed[0] # type: Layout
    converter.close()

    return self
def post(self, request, format=None):
    '''Convert an uploaded PDF to docx and answer with a link to the result.

    The upload is read from ``request.data['file']``, stored under a random
    name through ``default_storage`` and converted with ``Converter``.

    Args:
        request: Incoming request carrying the upload under the ``file`` key.
        format: Unused here.

    Returns:
        A 200 ``Response`` with ``success`` and ``file`` (download URL) on
        success, or an empty 400 ``Response`` when any step fails.
    '''
    data = request.data['file']
    print(data.__dict__)
    try:
        unique_filename = str(uuid.uuid4())
        docs_file_name_path = 'docx/' + unique_filename + '.docx'
        file_path = 'pdf/' + unique_filename + '.pdf'
        path = default_storage.save(file_path, ContentFile(data.read()))
        tmp_file = os.path.join(settings.MEDIA_ROOT, path)
        try:
            cv = Converter(tmp_file)
            try:
                cv.convert(docs_file_name_path, start=0, end=None)
            finally:
                # release the converter even when the conversion raises
                cv.close()
        finally:
            # the stored PDF is only an intermediate; drop it on failure too
            os.remove(tmp_file)
        # response = FileResponse(open(docs_file_name_path, 'rb'),filename=data._name+'.docx')
        # os.remove(docs_file_name_path)
        return Response(
            {
                'success': True,
                'file': 'https://api.pdfmake.com/media/' + docs_file_name_path
            },
            status=200)
    except Exception:
        # any failure is reported as a bad request; unlike a bare ``except``
        # this lets SystemExit / KeyboardInterrupt propagate
        return Response(status=400)
def extract(self, filename):
    '''Extract the text of a PDF by converting it to docx first.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        Whatever ``docx2txt.process`` returns for the intermediate docx.
    '''
    import os  # local import so the block does not rely on a module-level one

    # A private temporary directory gives a path that the converter can
    # freely create/replace on every platform and that is removed on exit,
    # instead of an open NamedTemporaryFile handle that was never closed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        docx_path = os.path.join(tmp_dir, 'converted.docx')

        # convert pdf to docx
        cv = Converter(filename)
        try:
            cv.convert(docx_path, start=0, end=None)
        finally:
            # release the converter even when the conversion raises
            cv.close()

        return docx2txt.process(docx_path)
def pdf_to_word(fileName):
    '''Convert a PDF file to a docx file stored next to it.

    The output path is the input path with its extension replaced by
    ``.docx``.

    Args:
        fileName: Path of the PDF file to convert.
    '''
    import os  # local import so the block does not rely on a module-level one

    pdf_file = fileName
    # Strip only the final extension. The previous regex cut the path at the
    # FIRST dot, which broke paths such as './a.pdf' or 'v1.2/a.pdf' and
    # raised IndexError for names without any dot.
    name = os.path.splitext(pdf_file)[0]
    docx_file = f'{name}.docx'

    cv = Converter(pdf_file)
    try:
        cv.convert(docx_file, start=0, end=None)
    finally:
        # release the converter even when the conversion raises
        cv.close()
def test_extracting_table(self):
    '''Test extracting contents from table.

    Extracts tables from ``<sample_dir>/demo-table.pdf`` with ``end=1`` and
    compares the last extracted table with the expected cell contents.
    '''
    filename = 'demo-table'
    pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')

    cv = Converter(pdf_file)
    try:
        tables = cv.extract_tables(end=1)
    finally:
        # the converter was previously never closed
        cv.close()
    print(tables)

    # compare the last table
    sample = [['Input', None, None, None, None, None],
              ['Description A', 'mm', '30.34', '35.30', '19.30', '80.21'],
              ['Description B', '1.00', '5.95', '6.16', '16.48', '48.81'],
              ['Description C', '1.00', '0.98', '0.94', '1.03', '0.32'],
              ['Description D', 'kg', '0.84', '0.53', '0.52', '0.33'],
              ['Description E', '1.00', '0.15', None, None, None],
              ['Description F', '1.00', '0.86', '0.37', '0.78', '0.01']]
    assert tables[-1] == sample
def pdf_to_word(entrada_pdf, q, adr, extension_out):
    '''Convert a PDF to docx and report the outcome through a queue.

    Args:
        entrada_pdf: Path of the PDF file to convert.
        q: Object with a ``put`` method; receives the docx path on success
            or the string ``"Error"`` on failure.
        adr: Value printed after the terminal is cleared.
        extension_out: Unused here; kept for interface compatibility.
    '''
    try:
        # Strip only the final extension. ``split(".")[0]`` cut the path at
        # the first dot and broke paths such as './a.pdf' or 'v1.2/a.pdf'.
        salida_doc = os.path.splitext(entrada_pdf)[0] + '.docx'

        cv = Converter(entrada_pdf)
        try:
            cv.convert(salida_doc, start=0, end=None)
        finally:
            # release the converter even when the conversion raises
            cv.close()

        os.system("clear")
        print(adr)
        q.put(salida_doc)
    except Exception:
        # failures are signalled to the consumer instead of raised; unlike a
        # bare ``except`` this lets SystemExit / KeyboardInterrupt propagate
        q.put("Error")
def convert_file(file_path, save_location=r"C:\Users\mdsak\Desktop\pdf-to-docx-converter\Document.docx"):
    '''Convert a PDF to docx and show a success label in the GUI.

    Args:
        file_path: Path of the PDF file to convert.
        save_location: Path of the docx file to write. Defaults to the
            location that used to be hard-coded, so existing callers are
            unaffected.
    '''
    cv = Converter(file_path)
    try:
        cv.convert(save_location, start=0, end=None)
    finally:
        # release the converter even when the conversion raises
        cv.close()

    # only reached when the conversion did not raise
    success_text = tk.Label(frame,
                            text="Pdf has been successfully converted",
                            font=(16),
                            bg="#263D42",
                            fg="#fff")
    success_text.pack()
def _callback_convert(self):
    '''Convert every selected PDF file into the selected output folder.

    Warns and stops when the PDF selection or the target folder is missing,
    asks for confirmation before replacing existing docx files, then
    converts the files one by one and reports how many succeeded and failed.
    '''
    # validate the selection and pick the matching warning, if any
    has_pdfs = bool(self.pdf_paths)
    has_folder = bool(self.docx_folder)
    if not has_pdfs and not has_folder:
        problem = ('Neither files or folder selected',
                   'Select PDF file or files for convert '
                   'and Select a folder for the converted files!')
    elif not has_pdfs:
        problem = ('Not files for convert selected',
                   'Select PDF file or PDF files for convert!')
    elif not has_folder:
        problem = ('Not files folder selected',
                   'Select a folder for the converted files!')
    else:
        problem = None

    if problem is not None:
        warn_title, warn_message = problem
        messagebox.showwarning(title=warn_title, message=warn_message)
        return

    # one target per source: same base name, '.docx' extension, chosen folder
    targets = [
        os.path.join(
            self.docx_folder,
            f'{os.path.splitext(os.path.basename(source))[0]}.docx')
        for source in self.pdf_paths
    ]

    # ask before overwriting anything that already exists
    if any(os.path.exists(target) for target in targets):
        confirmed = messagebox.askokcancel(
            title='Existed target file',
            message='Docx files with same target name are found under selected folder. '
                    'Do you want to continue and replace them?')
        if not confirmed:
            return

    # convert file by file; a failing file does not stop the others
    succeeded = 0
    failed = 0
    for source, target in zip(self.pdf_paths, targets):
        converter = Converter(source)
        try:
            converter.convert(target)
        except Exception as err:
            print(err)
            failed += 1
        else:
            succeeded += 1
        finally:
            converter.close()

    messagebox.showinfo(
        title='Convert Done!',
        message=f'Successful ({succeeded}), Failed ({failed}).')
def post(self, request, format=None):
    '''Convert an uploaded PDF to docx and send the docx back.

    The upload is read from ``request.data['file']``, stored under a random
    name through ``default_storage`` and converted with ``Converter``.

    Args:
        request: Incoming request carrying the upload under the ``file`` key.
        format: Unused here.

    Returns:
        An empty 400 ``Response`` when the upload's content type is not
        ``application/pdf``; otherwise a ``FileResponse`` for the docx.
    '''
    data = request.data['file']
    print(data.__dict__)
    if data.content_type != 'application/pdf':
        return Response(status=400)

    unique_filename = str(uuid.uuid4())
    docs_file_name_path = 'docx/' + unique_filename + '.docx'
    file_path = 'pdf/' + unique_filename + '.pdf'
    path = default_storage.save(file_path, ContentFile(data.read()))
    tmp_file = os.path.join(settings.MEDIA_ROOT, path)

    try:
        cv = Converter(tmp_file)
        try:
            cv.convert(docs_file_name_path, start=0, end=None)
        finally:
            # release the converter even when the conversion raises
            cv.close()

        response = FileResponse(open(docs_file_name_path, 'rb'),
                                filename=data._name + '.docx')
        # NOTE(review): the docx is removed while its handle is still open
        # for the response; this presumably relies on POSIX unlink
        # semantics - confirm for the deployment platform.
        os.remove(docs_file_name_path)
    finally:
        # the stored PDF is only an intermediate; drop it on failure too
        os.remove(tmp_file)
    return response
def main(): global trust_ratio # 读取参数信息,调整全局单句阈值 if len(sys.argv) < 2: print("参数错误") exit(-1) if len(sys.argv) == 3: trust_ratio = float(sys.argv[2]) # 读取需要进行比对的原文件 filepath = sys.argv[1] filename, _type = os.path.splitext(filepath) # 对 pdf 文件先行进行格式转换 if _type == '.pdf': convert_path = filename + '.docx' cv = Converter(filepath) cv.convert(convert_path, multi_processing=True) cv.close() filepath = convert_path doc1 = readDocx(filepath, True) # 读取本地数据库内的论文,并逐文件进行比对 print("加载数据库...") path = '.\\database\\word' filename_list = os.listdir(path) print(f'数据库共 {len(filename_list)} 篇论文') progress = ProgressBar(len(filename_list), fmt=ProgressBar.FULL) t1 = datetime.datetime.now() print('开始比对...') for filename in filename_list: if filename.endswith('.docx'): doc2 = readDocx(os.path.join(path, filename)) for i in range(len(doc1)): for j in range(len(doc2)): compareParagraph(doc1, doc2, i, j, filename) progress.current += 1 progress() t2 = datetime.datetime.now() progress.done() print('\n比对完成,总用时: ', t2 - t1)
def local_test(filename, make_test_case=False):
    '''Parse the first page of a local PDF and check the result.

    Works on ``<output>/<filename>.pdf`` and ``<output>/<filename>.docx``,
    where ``output`` is a name resolved from the enclosing module scope.

    Args:
        filename: Base name of the file, without extension.
        make_test_case: Passed through to ``check_result``.
    '''
    pdf_file = os.path.join(output, f'{filename}.pdf')
    docx_file = os.path.join(output, f'{filename}.docx')

    cv = Converter(pdf_file, docx_file)
    try:
        # process page by page
        for page in cv[0:1]:

            # print(page.rotation, page.rotationMatrix)
            # print(page.transformationMatrix)
            # print(page.rect, page.MediaBox, page.CropBox)

            # print(page.xref)
            # print(page.getContents())
            # print(cv.doc_pdf.xrefObject(page.xref))
            # page.cleanContents()
            # c = page.readContents().decode(encoding="ISO-8859-1")
            # with open('c.txt', 'w') as f:
            #     f.write(c)

            # print(cv.doc_pdf.xrefObject(94))

            # with open('x.svg', 'w') as f:
            #     f.write(page.getSVGimage(text_as_path=False))

            # parse layout
            cv.make_page(page)

            # # extract tables
            # tables = cv.extract_tables(page)
            # for table in tables:
            #     print(table)
    finally:
        # close pdf, also when parsing a page raises
        cv.close()

    # check results
    check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
def local_test(sub_path, filename, compare=False, make_test_case=False):
    '''Debug-parse the first page of a local PDF and optionally check it.

    Works on ``<output>/<sub_path>/<filename>.pdf`` and the matching
    ``.docx``, where ``output`` is a name resolved from the enclosing module
    scope.

    Args:
        sub_path: Sub-directory below ``output`` holding the files.
        filename: Base name of the file, without extension.
        compare: When true, ``check_result`` is called after parsing.
        make_test_case: Passed through to ``check_result``.
    '''
    pdf_file = os.path.join(output, sub_path, f'{filename}.pdf')
    docx_file = os.path.join(output, sub_path, f'{filename}.docx')
    page_index = 0

    cv = Converter(pdf_file)
    try:
        # only referenced by the debugging snippets kept below
        page = cv.fitz_doc[page_index]

        # print(page.rotation, page.rotationMatrix)
        # print(page.transformationMatrix)
        # print(page.rect, page.MediaBox, page.CropBox)

        # print(page.xref)
        # print(page.getContents())
        # print(cv.doc_pdf.xrefObject(page.xref))
        # page.cleanContents()
        # c = page.readContents().decode(encoding="ISO-8859-1")
        # with open('c.txt', 'w') as f:
        #     f.write(c)

        # print(cv.doc_pdf.xrefObject(6))
        # print(cv.doc_pdf._getXrefString(7))

        # with open('x.svg', 'w') as f:
        #     f.write(page.getSVGimage(text_as_path=False))

        # parse layout
        cv.debug_page(page_index, docx_file)

        # # extract tables
        # tables = cv.extract_tables([page_index])
        # for table in tables:
        #     print(table)
    finally:
        # close pdf, also when parsing the page raises
        cv.close()

    # check results
    if compare:
        check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
def Start_PDF_Word(self):
    '''Convert every ``.pdf`` file inside the selected folder to docx.

    When ``self.filename`` is not a directory a dialog is shown and nothing
    else happens. Otherwise a notice dialog is shown, ``config.cfg`` is
    read, and each ``.pdf`` entry of the folder is converted to a docx with
    the same base name in that folder; a line is appended to
    ``self.textedit_one`` after each conversion.
    '''
    # guard: a folder is required
    if not os.path.isdir(self.filename):
        QMessageBox.question(win, '温馨提示!', '请导入一个内含PDF文件的文件夹!!!',
                             QMessageBox.Yes | QMessageBox.No,
                             (QMessageBox.Yes))
        return

    QMessageBox.question(win, '温馨提示!',
                         '程序开始执行时,因为计算量大可能会导致卡顿,这是正常现象,请不要乱点,耐心稍等一会儿!!!',
                         QMessageBox.Yes | QMessageBox.No,
                         (QMessageBox.Yes))

    # the 'default' section is looked up although its values are not used
    config_parser = ConfigParser()
    config_parser.read('config.cfg', encoding='utf-8')
    config = config_parser['default']

    for entry in os.listdir(self.filename):
        base_name, extension_name = os.path.splitext(entry)
        if extension_name == '.pdf':
            source_pdf = self.filename + '/' + entry
            word_file = self.filename + '/' + base_name + '.docx'

            converter = Converter(source_pdf)
            converter.convert(word_file)
            converter.close()

            # append the report at the end of the text box
            self.textedit_one.moveCursor(QTextCursor.End)
            self.textedit_one.insertPlainText(
                f'\nPDF文件已成功转换图片文件,请前往桌面查看!!!\n\n生成路径为:{word_file}\n')
from pdf2docx import Converter

# input PDF and output docx, both relative to the working directory
pdf_file = 'imzalancaklar.pdf'
docx_file = 'cv.docx'

# convert pdf to docx
cv = Converter(pdf_file)
try:
    cv.convert(docx_file, start=0, end=None)
finally:
    # release the converter even when the conversion raises
    cv.close()
import os
import sys

# Take PDF's path as input
pdf = input("Enter the path to your file: ")
# Explicit check instead of ``assert``, which is stripped under ``python -O``.
# The stray ``open(pdf, 'r+')`` handle was never used or closed and has been
# removed.
if not os.path.exists(pdf):
    sys.exit("File not found at, " + str(pdf))

# Ask for custom name for the word doc
doc_name_choice = input(
    "Do you want to give a custom name to your file ?(Y/N)")
if doc_name_choice in ('Y', 'y'):
    # User input
    doc_name = input("Enter the custom name : ") + ".docx"
else:
    # Use the same name as pdf
    # Get the file name from the path provided by the user
    pdf_name = os.path.basename(pdf)
    # Get the name without the extension .pdf
    doc_name = os.path.splitext(pdf_name)[0] + ".docx"

# Convert PDF to Word
cv = Converter(pdf)
try:
    # Path to the directory: the docx is written next to the PDF
    path = os.path.dirname(pdf)
    cv.convert(os.path.join(path, doc_name), start=0, end=None)
finally:
    # release the converter even when the conversion raises
    cv.close()
# reported only after the conversion finished without raising
print("Word doc created!")
print('Comparing with sample pdf...') if compare_layput(pdf_file, docx_pdf_file, output_file, threshold=0.7): print('Fully matched.') else: print(f'Please convert {docx_file} to {docx_pdf_file} in advance.') if __name__ == '__main__': script_path = os.path.abspath(__file__) # current script path output = os.path.dirname(script_path) filename = 'test' pdf_file = os.path.join(output, f'{filename}.pdf') docx_file = os.path.join(output, f'{filename}.docx') cv = Converter(pdf_file, docx_file) # process page by page for page in cv[0:1]: # print(page.rotation, page.rotationMatrix) # print(page.transformationMatrix) # print(page.rect, page.MediaBox, page.CropBox) # print(page.xref) # print(page.getContents()) # print(cv.doc_pdf.xrefObject(page.xref)) # page.cleanContents() # c = page.readContents().decode(encoding="ISO-8859-1") # with open('c.txt', 'w') as f: # f.write(c)
def upload():
    '''Store an uploaded PDF together with JPEG and docx renditions of it.

    The upload is taken from ``request.files['inputFile']`` and saved into
    ``app.config['UPLOAD_FOLDER']``. For a file name ending in ``pdf`` the
    first page is rendered to JPEG bytes, the PDF is converted to docx, and
    everything is stored in a ``FileContents`` row.

    Returns:
        An error string when the file name does not end in an accepted
        format, otherwise a redirect to ``/download/<id>`` of the new row.
    '''
    file = request.files['inputFile']
    # NOTE(review): security - the client-supplied file name is used as a
    # path component without sanitisation (path traversal risk). Sanitise it
    # before exposing this endpoint to untrusted users.
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], (file.filename)))

    # Checks if the file format is acceptable
    VALID_FORMATS = {"pdf"}  # , "PNG", "docx", "jpg"}
    original_format = next(
        (fmt for fmt in VALID_FORMATS if file.filename.endswith(fmt)), None)
    if original_format is None:
        return "Error: Wrong Format. Please upload a PDF file."

    # the stored name drops the format suffix and the character before it
    # NOTE(review): ``file.save`` above has presumably consumed the upload
    # stream, so ``file.read()`` may return no data here; ``data_pdf`` is
    # re-read from disk below - confirm.
    newFile = FileContents(name=file.filename[:-len(original_format) - 1],
                           data=file.read())

    # TODO: Implement convert logic
    newFile.data_pdf = newFile.data
    # newFile.data_png = newFile.data
    # newFile.data_docx = newFile.data

    # Need to upload newFile.data to the folder TMP

    # ------------convert starts here-----------
    if original_format == "pdf":
        # NOTE(review): files are read from 'tmp/' although the upload was
        # saved to UPLOAD_FOLDER - assumes both are the same directory.
        pdf_file = 'tmp/' + newFile.name + ".pdf"
        docx_file = 'tmp/' + newFile.name + ".docx"

        # -----PDF TO JPG-----
        pages = convert_from_path(pdf_file, 500)
        img = pages[0]
        # Save the first page as jpeg into an in-memory buffer and keep its
        # bytes; the buffer is closed when the block ends
        with BytesIO() as buf:
            img.save(buf, 'jpeg')
            image_bytes = buf.getvalue()

        # Upload file to DB
        newFile.data_png = image_bytes

        # -----PDF TO DOCX-----
        # convert pdf to docx
        cv = Converter(pdf_file)
        try:
            cv.convert(docx_file, start=0, end=None)
        finally:
            # release the converter even when the conversion raises
            cv.close()

        # read both files back through handles that are closed right away
        with open(docx_file, 'rb') as docx_handle:
            newFile.data_docx = docx_handle.read()
        with open(pdf_file, 'rb') as pdf_handle:
            newFile.data_pdf = pdf_handle.read()

    # files = glob.glob('tmp')
    # newFile.data_pdf = files[0]
    # for f in files:
    #     os.remove(f)

    db.session.add(newFile)
    db.session.commit()

    return redirect('/download/' + str(newFile.id))
from docx2pdf import convert
from pdf2docx import Converter
import sys
import os
from PIL import Image
from fpdf import FPDF

# The first command-line argument selects the conversion mode; the following
# arguments are the input path and, where needed, the output path.

# docx -> pdf
if sys.argv[1] == 'd-p':
    convert(sys.argv[2])
    print("convert success")

# pdf -> docx
if sys.argv[1] == 'p-d':
    cv = Converter(sys.argv[2])
    try:
        # make sure the target file exists, closing the handle right away
        # (it used to be left open)
        with open(sys.argv[3], "a"):
            pass
        cv.convert(sys.argv[3], start=0, end=None)
    finally:
        # release the converter even when the conversion raises
        cv.close()
    print("convert success")

# image -> image; the output path is passed to ``save``
if sys.argv[1] == 'i-i':
    filename = sys.argv[2]
    img = Image.open(filename)
    img.save(sys.argv[3])
    print("success")

# same open-and-save steps under the 'i-b' mode name
if sys.argv[1] == 'i-b':
    filename = sys.argv[2]
    img = Image.open(filename).save(sys.argv[3])
    print("success")

# NOTE(review): nothing beyond reading the input path is visible for this
# mode in this view - the branch may be incomplete.
if sys.argv[1] == 't-p':
    filename = sys.argv[2]
def pdf_to_word(pdf_file_path, word_file_path):
    '''Convert a PDF file to a docx file.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path of the docx file to write.
    '''
    cv = Converter(pdf_file_path)
    try:
        cv.convert(word_file_path)
    finally:
        # release the converter even when the conversion raises
        cv.close()