Exemplo n.º 1
0
 def convert(self, filename):
     '''Convert PDF file from sample path to output path.

     Reads ``<sample_path>/<filename>.pdf`` and writes
     ``<output_path>/<filename>.docx``.

     Args:
         filename: base name of the file, without extension.
     '''
     source_pdf_file = os.path.join(sample_path, f'{filename}.pdf')
     docx_file = os.path.join(output_path, f'{filename}.docx')
     cv = Converter(source_pdf_file)
     try:
         cv.convert(docx_file)
     finally:
         # close the converter even when the conversion raises
         cv.close()
Exemplo n.º 2
0
    def init_test(self, filename):
        '''Initialize parsed layout and benchmark layout.

        Sets ``self.test`` to the first page obtained by converting
        ``<sample_dir>/<filename>.pdf`` and ``self.sample`` to the first page
        restored from ``<layout_dir>/<filename>.json``.

        Args:
            filename: base name of the sample, without extension.

        Returns:
            ``self``, so that calls can be chained.
        '''
        pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
        docx_file = os.path.join(self.output_dir, f'{filename}.docx')
        layout_file = os.path.join(self.layout_dir, f'{filename}.json')

        # parsed layout: first page only
        cv = Converter(pdf_file)
        try:
            cv.convert(docx_file, pages=[0])
            self.test = cv[0]  # type: Page
        finally:
            cv.close()

        # restore sample layout
        cv = Converter(pdf_file)
        try:
            cv.deserialize(layout_file)
            self.sample = cv[0]  # type: Page
        finally:
            # NOTE(review): the page is kept after close(), exactly as it is
            # for the parsed layout above -- confirm pages stay usable.
            cv.close()

        return self
Exemplo n.º 3
0
    def init_test(self, filename):
        '''Initialize parsed layout and benchmark layout.

        Sets ``self.sample`` to the layout restored from
        ``<layout_dir>/<filename>.json`` and ``self.test`` to the first
        layout produced by converting ``<sample_dir>/<filename>.pdf``.

        Args:
            filename: base name of the sample, without extension.

        Returns:
            ``self``, so that calls can be chained.
        '''
        # restore sample layout
        layout_file = os.path.join(self.layout_dir, f'{filename}.json')
        with open(layout_file, 'r') as f:
            raw_dict = json.load(f)
        self.sample = Layout().restore(raw_dict)

        # parsed layout: first page only
        pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
        docx_file = os.path.join(self.output_dir, f'{filename}.docx')
        cv = Converter(pdf_file)
        try:
            layouts = cv.make_docx(docx_file, pages=[0])
            self.test = layouts[0]  # type: Layout
        finally:
            # close the converter even when parsing fails
            cv.close()

        return self
Exemplo n.º 4
0
    def post(self, request, format=None):
        '''Convert an uploaded PDF to docx and answer with its download URL.

        The upload is read from ``request.data['file']``, stored under a
        random name, converted, and the stored PDF is removed again.

        Returns:
            A 200 ``Response`` carrying ``success`` and the ``file`` URL, or
            an empty 400 ``Response`` when any step of the conversion fails.
        '''
        data = request.data['file']
        print(data.__dict__)
        try:
            unique_filename = str(uuid.uuid4())
            docs_file_name_path = 'docx/' + unique_filename + '.docx'
            file_path = 'pdf/' + unique_filename + '.pdf'
            path = default_storage.save(file_path, ContentFile(data.read()))
            tmp_file = os.path.join(settings.MEDIA_ROOT, path)

            try:
                cv = Converter(tmp_file)
                try:
                    cv.convert(docs_file_name_path, start=0, end=None)
                finally:
                    cv.close()
            finally:
                # the stored upload is only an intermediate file: remove it
                # whether or not the conversion succeeded
                os.remove(tmp_file)

            return Response(
                {
                    'success': True,
                    'file':
                    'https://api.pdfmake.com/media/' + docs_file_name_path
                },
                status=200)

        except Exception:
            # any failure is reported as a bad request; SystemExit and
            # KeyboardInterrupt are deliberately not swallowed
            return Response(status=400)
 def extract(self, filename):
     '''Return the result of ``docx2txt.process`` for a PDF file.

     The PDF is first converted to a docx written at the path of a
     temporary file, which is then handed to ``docx2txt``.

     Args:
         filename: path of the PDF file.
     '''
     # the context manager closes the temporary file on every exit path
     with tempfile.NamedTemporaryFile() as wordfile:
         # convert pdf to docx
         cv = Converter(filename)
         try:
             cv.convert(wordfile.name, start=0, end=None)
         finally:
             cv.close()
         output = docx2txt.process(wordfile.name)
     return output
Exemplo n.º 6
0
def pdf_to_word(fileName):
    """Convert a PDF file to a docx file placed next to it.

    The output path is the input path with its final extension replaced
    by ``.docx``.

    Args:
        fileName: path of the PDF file.
    """
    import os

    pdf_file = fileName
    # Strip only the final extension to build the docx path. Cutting at the
    # first dot breaks paths such as './a.pdf' or 'my.report.pdf'.
    name = os.path.splitext(pdf_file)[0]
    docx_file = f'{name}.docx'

    cv = Converter(pdf_file)
    try:
        cv.convert(docx_file, start=0, end=None)
    finally:
        # close the converter even when the conversion raises
        cv.close()
Exemplo n.º 7
0
    def test_extracting_table(self):
        '''test extracting contents from table.

        Extracts the tables of ``demo-table.pdf`` with ``end=1`` and
        compares the last one with the expected cell contents.
        '''
        filename = 'demo-table'
        pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
        cv = Converter(pdf_file)
        try:
            tables = cv.extract_tables(end=1)
        finally:
            # the converter was previously left open
            cv.close()
        print(tables)

        # compare the last table
        sample = [['Input', None, None, None, None, None],
                  ['Description A', 'mm', '30.34', '35.30', '19.30', '80.21'],
                  ['Description B', '1.00', '5.95', '6.16', '16.48', '48.81'],
                  ['Description C', '1.00', '0.98', '0.94', '1.03', '0.32'],
                  ['Description D', 'kg', '0.84', '0.53', '0.52', '0.33'],
                  ['Description E', '1.00', '0.15', None, None, None],
                  ['Description F', '1.00', '0.86', '0.37', '0.78', '0.01']]
        assert tables[-1] == sample
Exemplo n.º 8
0
def pdf_to_word(entrada_pdf, q, adr, extension_out):
    """Convert a PDF to docx and report the outcome through ``q``.

    Args:
        entrada_pdf: path of the PDF file to convert.
        q: object with a ``put`` method (presumably a queue -- verify
            against the caller); receives the docx path on success or the
            string ``"Error"`` on failure.
        adr: value printed after a successful conversion.
        extension_out: not used by this function.
    """
    try:
        # replace only the final extension; splitting at the first dot
        # breaks paths such as './file.pdf'
        salida_doc = os.path.splitext(entrada_pdf)[0] + '.docx'
        cv = Converter(entrada_pdf)
        try:
            cv.convert(salida_doc, start=0, end=None)
        finally:
            cv.close()
        os.system("clear")
        print(adr)
        q.put(salida_doc)
    except Exception:
        # failures are signalled to the consumer instead of being raised
        q.put("Error")
def convert_file(
        file_path,
        save_location=r"C:\Users\mdsak\Desktop\pdf-to-docx-converter\Document.docx"):
    """Convert a PDF to docx and show a success label in ``frame``.

    Args:
        file_path: path of the PDF file to convert.
        save_location: path of the docx file to write; defaults to the
            location that used to be hard-coded in the function body.
    """
    cv = Converter(file_path)
    try:
        cv.convert(save_location, start=0, end=None)
    finally:
        # close the converter even when the conversion raises
        cv.close()

    # only reached when the conversion did not raise
    success_text = tk.Label(frame,
                            text="Pdf has been successfully converted",
                            font=(16),
                            bg="#263D42",
                            fg="#fff")
    success_text.pack()
Exemplo n.º 10
0
    def _callback_convert(self):
        '''Starts the convert of the file or files.

        Validates that PDF files and a target folder were selected, asks
        for confirmation before overwriting existing docx files, converts
        every PDF and finally reports the number of successes and failures.
        '''
        # input check
        if not self.pdf_paths and not self.docx_folder:
            messagebox.showwarning(
                title='Neither files or folder selected',
                message='Select PDF file or files for convert '
                'and Select a folder for the converted files!')
            return

        if not self.pdf_paths:
            messagebox.showwarning(
                title='Not files for convert selected',
                message='Select PDF file or PDF files for convert!')
            return

        if not self.docx_folder:
            messagebox.showwarning(
                title='Not files folder selected',
                message='Select a folder for the converted files!')
            return

        # collect docx files to convert to
        docx_paths = []
        for pdf_path in self.pdf_paths:
            base_name = os.path.basename(pdf_path)
            name, _ = os.path.splitext(base_name)
            docx_paths.append(os.path.join(self.docx_folder, f'{name}.docx'))

        if any(os.path.exists(path) for path in docx_paths) and \
            not messagebox.askokcancel(title='Existed target file',
                message='Docx files with same target name are found under selected folder. '
                        'Do you want to continue and replace them?'):
            return

        # now, do the converting work
        num_succ, num_fail = 0, 0
        for pdf_path, docx_path in zip(self.pdf_paths, docx_paths):
            cv = None
            try:
                # opening the file is part of the guarded work, so that one
                # unreadable PDF counts as a failure instead of aborting
                # the whole batch
                cv = Converter(pdf_path)
                cv.convert(docx_path)
            except Exception as e:
                print(e)
                num_fail += 1
            else:
                num_succ += 1
            finally:
                if cv is not None:
                    cv.close()

        messagebox.showinfo(
            title='Convert Done!',
            message=f'Successful ({num_succ}), Failed ({num_fail}).')
Exemplo n.º 11
0
    def post(self, request, format=None):
        '''Convert an uploaded PDF to docx and stream the result back.

        The upload is read from ``request.data['file']``; anything whose
        content type is not ``application/pdf`` is rejected.

        Returns:
            A ``FileResponse`` with the converted document, or an empty 400
            ``Response`` for a non-PDF upload.
        '''
        data = request.data['file']
        print(data.__dict__)
        if data.content_type != 'application/pdf':
            return Response(status=400)

        unique_filename = str(uuid.uuid4())
        docs_file_name_path = 'docx/' + unique_filename + '.docx'
        file_path = 'pdf/' + unique_filename + '.pdf'
        path = default_storage.save(file_path, ContentFile(data.read()))
        tmp_file = os.path.join(settings.MEDIA_ROOT, path)

        try:
            cv = Converter(tmp_file)
            try:
                cv.convert(docs_file_name_path, start=0, end=None)
            finally:
                cv.close()
            response = FileResponse(open(docs_file_name_path, 'rb'),
                                    filename=data._name + '.docx')
            # NOTE(review): the docx is removed while its handle is still
            # open for the response; this relies on the platform allowing
            # removal of open files -- confirm for the deployment target.
            os.remove(docs_file_name_path)
        finally:
            # the stored upload is removed even when the conversion fails
            os.remove(tmp_file)
        return response
Exemplo n.º 12
0
def main():
    """Compare a document against every docx file of the local database.

    Command line: ``argv[1]`` is the document to check and, when exactly
    two arguments are given, ``argv[2]`` replaces the global
    ``trust_ratio``. A ``.pdf`` input is converted to docx first.

    Raises:
        SystemExit: with code -1 when no document path is given.
    """
    global trust_ratio

    # read the arguments and adjust the global per-sentence threshold
    if len(sys.argv) < 2:
        print("参数错误")
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist
        sys.exit(-1)
    if len(sys.argv) == 3:
        trust_ratio = float(sys.argv[2])

    # read the original file that has to be compared
    filepath = sys.argv[1]
    filename, _type = os.path.splitext(filepath)
    # a pdf file is converted to docx first
    if _type == '.pdf':
        convert_path = filename + '.docx'
        cv = Converter(filepath)
        try:
            cv.convert(convert_path, multi_processing=True)
        finally:
            cv.close()
        filepath = convert_path
    doc1 = readDocx(filepath, True)

    # read the papers of the local database and compare them file by file
    print("加载数据库...")
    path = '.\\database\\word'
    filename_list = os.listdir(path)
    print(f'数据库共 {len(filename_list)} 篇论文')
    progress = ProgressBar(len(filename_list), fmt=ProgressBar.FULL)
    t1 = datetime.datetime.now()
    print('开始比对...')
    for db_filename in filename_list:
        if db_filename.endswith('.docx'):
            doc2 = readDocx(os.path.join(path, db_filename))
            # compareParagraph takes the paragraph indices, so the loops
            # run over index ranges
            for i in range(len(doc1)):
                for j in range(len(doc2)):
                    compareParagraph(doc1, doc2, i, j, db_filename)
        # every directory entry advances the bar, docx or not
        progress.current += 1
        progress()
    t2 = datetime.datetime.now()
    progress.done()
    print('\n比对完成,总用时: ', t2 - t1)
Exemplo n.º 13
0
def local_test(filename, make_test_case=False):
    """Parse the first page of ``<output>/<filename>.pdf`` and check it.

    Args:
        filename: base name of the PDF, without extension.
        make_test_case: forwarded unchanged to ``check_result``.
    """
    pdf_file = os.path.join(output, f'{filename}.pdf')
    docx_file = os.path.join(output, f'{filename}.docx')

    cv = Converter(pdf_file, docx_file)
    try:
        # process page by page
        for page in cv[0:1]:

            # print(page.rotation, page.rotationMatrix)
            # print(page.transformationMatrix)
            # print(page.rect, page.MediaBox, page.CropBox)

            # print(page.xref)
            # print(page.getContents())
            # print(cv.doc_pdf.xrefObject(page.xref))
            # page.cleanContents()
            # c = page.readContents().decode(encoding="ISO-8859-1")
            # with open('c.txt', 'w') as f:
            #     f.write(c)

            # print(cv.doc_pdf.xrefObject(94))

            # with open('x.svg', 'w') as f:
            #     f.write(page.getSVGimage(text_as_path=False))

            # parse layout
            cv.make_page(page)

            # # extract tables
            # tables = cv.extract_tables(page)
            # for table in tables:
            #     print(table)
    finally:
        cv.close()  # close pdf, also when parsing raises

    # check results
    check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
Exemplo n.º 14
0
def local_test(sub_path, filename, compare=False, make_test_case=False):
    """Debug the first page of ``<output>/<sub_path>/<filename>.pdf``.

    Args:
        sub_path: sub-directory of ``output`` holding the PDF.
        filename: base name of the PDF, without extension.
        compare: when true, run ``check_result`` on the produced docx.
        make_test_case: forwarded unchanged to ``check_result``.
    """
    pdf_file = os.path.join(output, sub_path, f'{filename}.pdf')
    docx_file = os.path.join(output, sub_path, f'{filename}.docx')

    page_index = 0
    cv = Converter(pdf_file)
    try:
        # only referenced by the commented-out debugging snippets below
        page = cv.fitz_doc[page_index]

        # print(page.rotation, page.rotationMatrix)
        # print(page.transformationMatrix)
        # print(page.rect, page.MediaBox, page.CropBox)

        # print(page.xref)
        # print(page.getContents())
        # print(cv.doc_pdf.xrefObject(page.xref))
        # page.cleanContents()
        # c = page.readContents().decode(encoding="ISO-8859-1")
        # with open('c.txt', 'w') as f:
        #     f.write(c)

        # print(cv.doc_pdf.xrefObject(6))
        # print(cv.doc_pdf._getXrefString(7))

        # with open('x.svg', 'w') as f:
        #     f.write(page.getSVGimage(text_as_path=False))

        # parse layout
        cv.debug_page(page_index, docx_file)

        # # extract tables
        # tables = cv.extract_tables([page_index])
        # for table in tables:
        #     print(table)
    finally:
        cv.close()  # close pdf, also when debugging raises

    # check results
    if compare:
        check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
Exemplo n.º 15
0
    def Start_PDF_Word(self):
        '''Convert every ``.pdf`` file in the folder ``self.filename`` to docx.

        Each docx is written next to its PDF and a line is appended to
        ``self.textedit_one`` per converted file. When ``self.filename`` is
        not a directory only a message box is shown.
        '''
        if not os.path.isdir(self.filename):
            QMessageBox.question(win, '温馨提示!', '请导入一个内含PDF文件的文件夹!!!',
                                 QMessageBox.Yes | QMessageBox.No, (QMessageBox.Yes))
            return

        QMessageBox.question(win, '温馨提示!', '程序开始执行时,因为计算量大可能会导致卡顿,这是正常现象,请不要乱点,耐心稍等一会儿!!!',
                             QMessageBox.Yes | QMessageBox.No, (QMessageBox.Yes))
        config_parser = ConfigParser()
        config_parser.read('config.cfg', encoding='utf-8')
        # NOTE(review): `config` is not used below; the lookup is kept
        # because it raises KeyError when the section is missing --
        # confirm whether that check is intended.
        config = config_parser['default']
        for file in os.listdir(self.filename):
            file_name, extension_name = os.path.splitext(file)
            if extension_name != '.pdf':
                continue
            pdf_file = self.filename + '/' + file
            word_file = self.filename + '/' + file_name + '.docx'

            cv = Converter(pdf_file)
            try:
                cv.convert(word_file)
            finally:
                # close the converter even when the conversion raises
                cv.close()
            self.textedit_one.moveCursor(QTextCursor.End)
            self.textedit_one.insertPlainText(
                f'\nPDF文件已成功转换图片文件,请前往桌面查看!!!\n\n生成路径为:{word_file}\n')
Exemplo n.º 16
0
from pdf2docx import Converter

# input PDF and the docx file to create from it
pdf_file = 'imzalancaklar.pdf'
docx_file = 'cv.docx'

# convert pdf to docx
cv = Converter(pdf_file)
try:
    cv.convert(docx_file, start=0, end=None)
finally:
    # close the converter even when the conversion raises
    cv.close()
Exemplo n.º 17
0
import os
import sys

# Take PDF's path as input
pdf = input("Enter the path to your file: ")
if not os.path.exists(pdf):
    # explicit check: an `assert` is stripped when Python runs with -O
    sys.exit("File not found at, " + str(pdf))

# Ask for custom name for the word doc
doc_name_choice = input(
    "Do you want to give a custom name to your file ?(Y/N)")

if doc_name_choice in ('Y', 'y'):
    # User input
    doc_name = input("Enter the custom name : ") + ".docx"

else:
    # Use the same name as pdf
    # Get the file name from the path provided by the user
    pdf_name = os.path.basename(pdf)
    # Get the name without the extension .pdf
    doc_name = os.path.splitext(pdf_name)[0] + ".docx"

# Path to the directory: the docx is written next to the PDF
path = os.path.dirname(pdf)

# Convert PDF to Word
cv = Converter(pdf)
try:
    cv.convert(os.path.join(path, doc_name), start=0, end=None)
finally:
    # close the converter even when the conversion raises
    cv.close()
print("Word doc created!")
Exemplo n.º 18
0
        print('Comparing with sample pdf...')
        if compare_layput(pdf_file, docx_pdf_file, output_file, threshold=0.7):
            print('Fully matched.')
    else:
        print(f'Please convert {docx_file} to {docx_pdf_file} in advance.')


if __name__ == '__main__':

    script_path = os.path.abspath(__file__)  # current script path
    output = os.path.dirname(script_path)
    filename = 'test'
    pdf_file = os.path.join(output, f'{filename}.pdf')
    docx_file = os.path.join(output, f'{filename}.docx')

    cv = Converter(pdf_file, docx_file)

    # process page by page
    for page in cv[0:1]:

        # print(page.rotation, page.rotationMatrix)
        # print(page.transformationMatrix)
        # print(page.rect, page.MediaBox, page.CropBox)

        # print(page.xref)
        # print(page.getContents())
        # print(cv.doc_pdf.xrefObject(page.xref))
        # page.cleanContents()
        # c = page.readContents().decode(encoding="ISO-8859-1")
        # with open('c.txt', 'w') as f:
        #     f.write(c)
Exemplo n.º 19
0
def upload():
    """Store an uploaded PDF together with JPEG and DOCX renditions.

    The file sent as ``inputFile`` is validated, saved in the configured
    upload folder, rendered (first page only) to JPEG and converted to
    docx; all three payloads are stored in a new ``FileContents`` row.

    Returns:
        A redirect to ``/download/<id>`` of the new row, or an error
        string when the upload is not a ``.pdf`` file.
    """
    file = request.files['inputFile']

    # NOTE(review): the client-supplied name ends up in a file-system path.
    # basename() only strips directory components; consider
    # werkzeug.utils.secure_filename for full sanitising.
    filename = os.path.basename(file.filename or '')

    # Check that the file format is acceptable before anything is written
    VALID_FORMATS = {"pdf"}  # , "PNG", "docx", "jpg"}
    stem, extension = os.path.splitext(filename)
    original_format = extension[1:]
    if not stem or original_format not in VALID_FORMATS:
        return "Error: Wrong Format. Please upload a PDF file."

    # Every later step works on the file that was actually saved, instead
    # of assuming the upload folder is "tmp/".
    upload_dir = app.config['UPLOAD_FOLDER']
    saved_path = os.path.join(upload_dir, filename)
    file.save(saved_path)

    # Read the payload back from disk: the upload stream has already been
    # consumed by save(), so file.read() would return no data here.
    with open(saved_path, 'rb') as saved:
        payload = saved.read()

    newFile = FileContents(name=stem, data=payload)
    newFile.data_pdf = payload

    # ------------convert starts here-----------

    if original_format == "pdf":

        # -----PDF TO JPG-----
        pages = convert_from_path(saved_path, 500)
        img = pages[0]

        # Render the first page as jpeg into an in-memory buffer
        with BytesIO() as buf:
            img.save(buf, 'jpeg')
            image_bytes = buf.getvalue()

        # Upload file to DB
        newFile.data_png = image_bytes

        # -----PDF TO DOCX-----
        docx_file = os.path.join(upload_dir, stem + ".docx")

        # convert pdf to docx
        cv = Converter(saved_path)
        try:
            cv.convert(docx_file, start=0, end=None)
        finally:
            cv.close()
        with open(docx_file, 'rb') as converted:
            newFile.data_docx = converted.read()

    # TODO: remove the temporary files once their content is in the DB

    db.session.add(newFile)
    db.session.commit()
    return redirect('/download/' + str(newFile.id))
Exemplo n.º 20
0
from docx2pdf import convert
from pdf2docx import Converter
import sys
import os
from PIL import Image
from fpdf import FPDF

# Command-line dispatch: argv[1] selects the conversion, argv[2] is the
# input file and argv[3] (where used) the output file.

# docx -> pdf
if sys.argv[1] == 'd-p':
    convert(sys.argv[2])
    print("convert success")

# pdf -> docx
if sys.argv[1] == 'p-d':
    cv = Converter(sys.argv[2])
    try:
        # create the output file if it does not exist yet, without keeping
        # the handle open
        with open(sys.argv[3], "a"):
            pass
        cv.convert(sys.argv[3], start=0, end=None)
    finally:
        cv.close()
    print("convert success")

# image -> image (target format is chosen by the output file name)
if sys.argv[1] == 'i-i':
    filename = sys.argv[2]
    with Image.open(filename) as img:
        img.save(sys.argv[3])
    print("success")

# same operation as 'i-i', selected by a different keyword
if sys.argv[1] == 'i-b':
    filename = sys.argv[2]
    with Image.open(filename) as img:
        img.save(sys.argv[3])
    print("success")

if sys.argv[1] == 't-p':
    filename = sys.argv[2]
Exemplo n.º 21
0
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert the PDF at ``pdf_file_path`` to a docx at ``word_file_path``."""
    cv = Converter(pdf_file_path)
    try:
        cv.convert(word_file_path)
    finally:
        # close the converter even when the conversion raises
        cv.close()