async def hmm(event):
    """Convert a replied-to PDF file into a .docx document and send it back.

    Expects the command message to be a reply to a PDF; downloads the media,
    converts it with pdf2docx, uploads the result, then cleans up.
    """
    if not event.reply_to_msg_id:
        await event.reply("Reply to any Pdf File.")
        return
    # Send the status message exactly once (the original sent it twice,
    # overwriting `hmmu` and leaking an extra chat message).
    hmmu = await event.reply("hmm... Please Wait...🚶")
    lol = await event.get_reply_message()
    # Download the replied media into the configured temp directory.
    starky = await borg.download_media(lol.media, Config.TMP_DOWNLOAD_DIRECTORY)
    pdf_file = starky
    docx_file = './fridaybot/DOWNLOADS/FRIDAYOT.docx'
    parse(pdf_file, docx_file, start=0, end=None)
    await borg.send_file(
        event.chat_id,
        docx_file,
        # f-prefix dropped: the caption has no placeholders.
        caption="*PDF Converted Into Docx by Friday bot. Get your Friday From @FRIDAYOT."
    )
    # Clean up both the downloaded PDF and the generated docx.
    os.remove(pdf_file)
    os.remove(docx_file)
    await event.delete()
def pdf2docx_pdf_html(input_pdf, input_docx_location):
    """Convert a PDF to docx on disk, then render that docx as parsed HTML.

    Returns a BeautifulSoup tree of the mammoth-generated HTML.
    """
    parse(input_pdf, input_docx_location)
    # style_map keeps bold runs as <b> elements in the emitted HTML.
    html_text = mammoth.convert_to_html(input_docx_location, style_map="b => b").value
    return BeautifulSoup(html_text, 'html.parser')
def test_multi_pages(self):
    '''test converting pdf with multi-pages.

    Uses the sample name in both paths: the original left `filename` unused
    and its f-strings carried no placeholder.
    '''
    filename = 'demo'
    pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
    docx_file = os.path.join(self.output_dir, f'{filename}.docx')
    # Convert pages 1..4 (pdf2docx's `end` is exclusive).
    parse(pdf_file, docx_file, start=1, end=5)
    # check file
    assert os.path.isfile(docx_file)
def pdftodocx():
    """Prompt for a PDF file and convert it to '<name>_converted.docx'."""
    pdf_path = filedialog.askopenfilename(title='Select a PDF File',
                                          filetypes=[('PDF File', '*.pdf')])
    if pdf_path == '':
        # Dialog was cancelled; nothing to convert.
        return
    pdf_docx['text'] = 'Converting...'
    from pdf2docx import parse
    parse(pdf_path, pdf_path.replace('.pdf', '_converted.docx'))
    messagebox.showinfo('Done!', 'PDF to Docx converted successfully.')
    # Restore the button label once conversion finishes.
    pdf_docx['text'] = 'PDF to Docx'
def pdf_to_docx(file, pages):
    """Convert a single page of *file* to docx; return the output path.

    *pages* is 1-based for callers; pdf2docx expects 0-based indices.
    Returns None when conversion fails (the original's best-effort behaviour
    is kept, but the bare ``except`` no longer swallows SystemExit and
    KeyboardInterrupt).
    """
    try:
        # NOTE(review): `converted_docx` is a module-level name defined
        # elsewhere in this file — confirm it is set before this is called.
        parse(file, converted_docx, pages=[pages - 1])
        return converted_docx
    except Exception:
        # Best-effort: signal failure with an implicit None.
        return None
def saveasdocx(self):
    """Convert the selected PDF to docx and optionally open the result."""
    global filename
    source_pdf = self.filename[0]
    target_docx = source_pdf.replace('.pdf', '_converted.docx')
    parse(source_pdf, target_docx)
    if self.allowOpen.checkState():
        # User ticked "open after conversion".
        os.startfile(target_docx)
    # Reset UI state: untick the checkbox and disable the convert button.
    self.allowOpen.setCheckState(False)
    self.convertPDF.setDisabled(True)
def pdf_to_docx_to_html(self, input_pdf, page_no):  # 3
    """Convert one PDF page to a cached docx and return it as parsed HTML.

    *page_no* is 1-based. Returns a BeautifulSoup tree; an empty tree when
    the page is out of range or the docx already exists.
    """
    docx_name = f'{self.temp_directory.name}/{page_no}.docx'
    # Close the pdfplumber handle deterministically — the original opened it
    # and never closed it (file handle leak).
    with pdfplumber.open(input_pdf) as pdf_obj:
        page_in_range = page_no - 1 in range(len(pdf_obj.pages))
    if not os.path.exists(docx_name) and page_in_range:
        parse(input_pdf, docx_name, start=page_no - 1, end=page_no)
        x = mammoth.convert_to_html(docx_name, style_map="b => b").value
        html = BeautifulSoup(x, 'html.parser')
    else:
        # NOTE(review): when the docx already exists this returns empty HTML
        # instead of re-reading the cached file — behaviour preserved as-is.
        x = ""
        html = BeautifulSoup(x, 'html.parser')
    return html
def to_docx(self, docx_file=None):
    """Convert this PDF to a docx file.

    When *docx_file* is omitted, the output path is the source file with its
    suffix replaced by ``.docx``.
    """
    check_install_package('pdf2docx')
    from pdf2docx import parse
    source = self.src_file
    target = docx_file if docx_file is not None else source.with_suffix('.docx')
    # pdf2docx reports progress through logging rather than printing.
    parse(str(source), str(target))
def convert_pdf2docx(input_file: str, pages: Tuple = None):
    """Convert *input_file* with pdf2docx, optionally limited to *pages*."""
    if pages:
        # Keep only numeric page tokens, coerced to int for pdf2docx.
        pages = [int(token) for token in list(pages) if token.isnumeric()]
    result = parse(pdf_file=input_file, pages=pages)
    print("###### Conversion Complete #######")
    return result
def edit_pdf_file(file_name):
    """Convert the uploaded PDF *file_name* to docx and render the edit page."""
    source_path = os.path.join(uploads_dir, file_name)
    # NOTE: the 'ouput' spelling is kept byte-for-byte — templates/URLs may
    # depend on this exact name.
    output_name = f'{file_name}-ouput.docx'
    parse(source_path, os.path.join(uploads_dir, output_name), start=0, end=None)
    return render_template(
        "file-edit.html",
        filename=str(file_name),
        file_path=source_path,
        output=output_name,
    )
def main(self):
    """Produce a .docx next to the source file.

    For a 'pdf' extension the pages' raw text is appended to a
    ``.docx``-named plain-text file via PyPDF2; otherwise pdf2docx performs
    a real conversion of the first page and its result is returned.
    """
    if self.fileext == "pdf":
        # Both handles are now closed deterministically — the original
        # leaked the read handle from open(self.filepath, "rb").
        with open(self.filepath, "rb") as pdf_handle:
            pdf = PyPDF2.PdfFileReader(pdf_handle)
            # "a" append mode is kept: repeated runs accumulate text.
            with open(self.filepath.replace(".pdf", ".docx"), "a") as file:
                for page in pdf.pages:
                    file.write(page.extractText())
    else:
        new_filename = self.filepath.replace(".pdf", ".docx")
        # start/end bound the conversion to the first page only.
        return parse(self.filepath, new_filename, start=0, end=1)
def ResumeAnalyzer(applicant_resume_pdf, job_description_pdf):
    """Return the % cosine similarity between a resume and a job description.

    Both PDFs are converted to docx at the configured paths, their text is
    extracted, and a bag-of-words cosine similarity is computed.
    """
    parse(applicant_resume_pdf, resume_docx_path, start=0, end=None)
    parse(job_description_pdf, job_desc_docx_path, start=0, end=None)
    corpus = [docx2txt.process(resume_docx_path),
              docx2txt.process(job_desc_docx_path)]
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    # Off-diagonal entry [0][1] is resume-vs-description similarity.
    similarity = cosine_similarity(counts)[0][1] * 100
    return round(similarity, 2)
def pdf_word():
    """Flask endpoint: convert an uploaded PDF to docx and return its URL.

    The upload's base name is md5-hashed so arbitrary user names map to a
    safe on-disk file name.
    """
    upload_file = request.files['pdf-word']
    if upload_file.filename != '':
        docx_extension = ".docx"
        filename, file_extension = os.path.splitext(upload_file.filename)
        md5_obj = hashlib.md5()
        md5_obj.update(filename.encode('utf-8'))
        filename_hash = md5_obj.hexdigest()
        full_filename = secure_filename(filename_hash) + file_extension
        docx_filename = secure_filename(filename_hash) + docx_extension
        upload_file.save(os.path.join(DIR, full_filename))
        # Build the output path with os.path.join like every other path here.
        # The original concatenated DIR + name directly, which writes to the
        # wrong location whenever DIR lacks a trailing separator — and then
        # url_for points at a file that was never created.
        docx_path = os.path.join(DIR, docx_filename)
        parse(os.path.join(DIR, full_filename), docx_path, start=0)
        return jsonify({
            'error': False,
            'file': url_for('uploaded_file', filename=docx_filename)
        })
    else:
        return jsonify({'error': True, 'message': 'Please provide the file'})
async def hmm(event):
    """Turn a replied-to PDF into a docx and post it back to the chat."""
    if not event.reply_to_msg_id:
        await event.edit("Reply to any Pdf File.")
        return
    await event.edit("hmm... Please Wait...🚶")
    replied = await event.get_reply_message()
    # Pull the PDF into the configured temp directory.
    downloaded = await borg.download_media(replied.media,
                                           Config.TMP_DOWNLOAD_DIRECTORY)
    await event.edit("hmm... Please Wait..")
    docx_file = "./virtualuserbot/DOWNLOADS/Infinity_Bots.docx"
    parse(downloaded, docx_file, start=0, end=None)
    await borg.send_file(
        event.chat_id,
        docx_file,
        caption=
        f"*PDF Converted Into Docx by VirtualUserbot. Credits to @FRIDAYOT.",
    )
    # Remove both the downloaded PDF and the generated docx, then the command.
    os.remove(downloaded)
    os.remove(docx_file)
    await event.delete()
def convert(request, pk):
    """Django view: convert a stored PDF to docx, extract its text, and
    translate it to Hindi, logging each stage to stdout.
    """
    document = get_object_or_404(Document, pk=pk)
    print(document.document)
    translator = Translator()
    # Build the static-file paths once instead of re-concatenating the same
    # literal four times; [:-4] strips the '.pdf' suffix as before.
    pdf_path = "D:\\audio\\audio\\static\\" + str(document.document)
    docx_path = pdf_path[:-4] + ".docx"
    parse(pdf_path, docx_path, start=0)
    print("PDF --> DOCX")
    # extract text
    text = docxpy.process(docx_path)
    print("DOCX --> TXT")
    print(text)
    print("Translating...")
    # (The original constructed a second Translator here; one instance suffices.)
    translation = translator.translate(text, dest="hi")
    print("English --> Hindi")
    print(translation.text)
    context = {'document': document}
    return render(request, 'audio_detail.html', context)
def pdf_convert_doc(inputf: str, outputf: str, page: Tuple = None):
    """Convert a PDF to docx with pdf2docx and print a short summary.

    The original body referenced names that do not exist (``pages``,
    ``input``, ``output``, ``summery``) and a wrong keyword ``doc_pth`` —
    every call raised NameError/TypeError. It now consistently uses its own
    parameters and pdf2docx's actual ``docx_file`` keyword.
    """
    pages = None
    if page:
        # Keep only numeric tokens, coerced to int for pdf2docx.
        pages = [int(x) for x in list(page) if x.isnumeric()]
    result = parse(pdf_file=inputf, docx_file=outputf, pages=pages)
    summary = {'file': inputf, 'doc': outputf, 'pages': str(pages)}
    # Printing Summary
    print(
        "## Summary ########################################################")
    print("\n".join("{}:{}".format(x, y) for x, y in summary.items()))
    print(
        "###################################################################")
    return result
def convert_pdf2docx(input_file: str, output_file: str, pages: Tuple = None):
    """Converts pdf to docx and prints a summary of the run."""
    if pages:
        pages = [int(i) for i in list(pages) if i.isnumeric()]
    # pdf2docx.parse's output keyword is ``docx_file`` — the original's
    # ``docx_with_path=`` raised TypeError: unexpected keyword argument.
    result = parse(pdf_file=input_file, docx_file=output_file, pages=pages)
    summary = {
        "File": input_file,
        "Pages": str(pages),
        "Output File": output_file
    }
    # Printing Summary
    print("## Summary ########################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in summary.items()))
    print("###################################################################")
    return result
async def starky(event):
    # Bulk tool: download every PDF from a channel (argument or current chat),
    # convert each to docx, post the results back, then delete the work dirs.
    un = event.pattern_match.group(1)
    # Two throw-away working directories: one for downloads, one for docx output.
    rndm = uuid.uuid4().hex
    frid = uuid.uuid4().hex
    diro = f"./{rndm}/"
    dirb = f"./{frid}/"
    os.makedirs(diro)
    os.makedirs(dirb)
    media_count = 0
    text_count = 0
    # Target channel: explicit argument wins, else the current chat.
    if un:
        chnnl = un
    else:
        chnnl = event.chat_id
    await event.edit(f"**Fetching All Files From This Channel**")
    try:
        # Fetch at most the latest 3000 messages.
        chnnl_msgs = await borg.get_messages(chnnl, limit=3000)
    except:
        await event.edit(
            "**Unable To fetch Messages !** \n`Please, Check Channel Details And IF THere Are Any Media :/`"
        )
        return
    total = int(chnnl_msgs.total)
    await event.edit(f"**Downloading {total} Media/Messages**")
    # Download every media attachment; count plain-text messages separately.
    for d in chnnl_msgs:
        if d.media:
            media_count += 1
            await borg.download_media(d.media, diro)
        if d.text:
            text_count += 1
    await event.edit(
        f"**Total Media :** `{total}` \n**Downloaded Media :** `{media_count}` \n**Total Texts :** `{text_count}` \n**Now Converting Files.**"
    )
    # Convert each downloaded PDF to a docx with a random 9-char name.
    Azx = glob.glob(f"{diro}*.pdf")
    for friday in Azx:
        N = 9
        res = ''.join(
            random.choices(string.ascii_uppercase + string.digits, k=N))
        pdf_file = friday
        docx_file = f'{dirb}{str(res)}.docx'
        parse(pdf_file, docx_file, start=0, end=None)
    # Send every converted docx back to the invoking chat.
    Ax = glob.glob(f"{dirb}*.docx")
    for pop in Ax:
        await borg.send_file(
            event.chat_id,
            pop,
            caption=
            f"**Total Media :** `{total}` \n**Downloaded Media :** `{media_count}` \n**Total Texts :** `{text_count}` \n**By @fridayot**"
        )
    # Clean up: remove every file in both work dirs, then the dirs themselves.
    Azx = glob.glob(f"{diro}*")
    Azpx = glob.glob(f"{dirb}*")
    for x in Azx:
        os.remove(x)
    for pop in Azpx:
        os.remove(pop)
    os.rmdir(diro)
    os.rmdir(dirb)
def pdf_to_word(pdf_file):
    """Convert *pdf_file* to a docx named 'word.docx' in the CWD."""
    parse(pdf_file, 'word.docx')
def convert_pdf_to_word(self, document_path):
    """Convert the PDF at *document_path* to a docx under MEDIA_ROOT.

    Returns the path of the converted document.
    """
    name, extension = self.get_document_name_and_extension(document_path)
    target_path = os.path.join(MEDIA_ROOT, name + '.' + 'docx')
    parse(document_path, target_path)
    return target_path
'results-{}'.format(str(datetime.now()).replace(':', '-')[:19])) try: os.mkdir(curr_result_dir) except OSError: print(curr_result_dir + " exists in the system") pdf_path = curr_result_dir + r'\ocr.pdf' pdf = canvas.Canvas(pdf_path, bottomup=0, pagesize=(img.shape[1], img.shape[0])) pdf.setTitle('OCR Results' + str(datetime.now())[:10]) px, py, ph = None, None, None for label, (x, y, w, h) in zip(labels, boxs): if px is None and py is None: px, py = x, y pw, ph = w, h if py + ph > y: pdf.setFont('Times-Bold', ph) pdf.drawString(x, py + ph, label) else: pdf.setFont('Times-Bold', h) pdf.drawString(x, y + h, label) px, py, ph = x, y, h pdf.save() parse(pdf_path, curr_result_dir + r'\ocr.docx', start=0, end=None) print('Results path {}'.format(result_dir)) cv2.imwrite(curr_result_dir + r'\overview.png', overview)
def pdf_to_docx(pdf_file, page, docx_location):
    """Convert one page of *pdf_file* into *docx_location* and return it.

    *page* is 1-based; pdf2docx expects 0-based page indices.
    """
    zero_based_pages = [page - 1]
    parse(pdf_file, docx_location, pages=zero_based_pages)
    return docx_location
def index(request):
    # Main ingestion view: handles manuscript upload + metadata auto-population
    # ("UPLOAD & POPULATE") and the final push to cloud storage ("SUBMIT").
    if not request.user.is_authenticated:
        print("Trying to open without Signning In")
        request.session["bar"] = "Please Login In and Try Again"
        return redirect('/login')
    global unique_id
    logs = []
    if (request.method == 'POST'):
        if request.POST.get("button") == "UPLOAD & POPULATE":
            #deleteUpNDownloads(unique_id)
            print("Username : {} | Started to Process".format(request.user.username))
            unique_id = "81948"  # str(random.randint(10000,99999))#"unique_id"
            # if database.child("ingested data").get().val():
            # Re-roll the reference id until it is unused in the database.
            while unique_id in database.child("ingested data").get().val().keys():
                unique_id = str(random.randint(10000,99999))#"unique_id"
            print("Reference ID : ", unique_id)
            ppValues = {}
            try:
                # Create the per-submission folder tree (manuscript/images/others);
                # any OS error is deliberately ignored (best effort).
                if not os.path.exists(os.path.join(UPLOAD_LOCATION, unique_id)):
                    os.mkdir(os.path.join(UPLOAD_LOCATION, unique_id))
                if not os.path.exists(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript")):
                    os.mkdir(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript"))
                if not os.path.exists(os.path.join(UPLOAD_LOCATION, unique_id, "images")):
                    os.mkdir(os.path.join(UPLOAD_LOCATION, unique_id, "images"))
                if not os.path.exists(os.path.join(UPLOAD_LOCATION, unique_id, "others")):
                    os.mkdir(os.path.join(UPLOAD_LOCATION, unique_id, "others"))
            except:
                pass
            logs.append("Auto Populate Done")
            print("Uploading Files and Parse")
            for file in request.FILES.getlist("files"):
                if file.name.split(".")[-1] in ALLOWED_MS_FORMATS :
                    word_count, text = 0, ""
                    logs.append("File : {} - Successfully Uploaded".format(str(file.name)))
                    # Stream the uploaded manuscript to disk chunk by chunk.
                    with open(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name), 'wb+') as destination:
                        for i, chunk in enumerate(file.chunks()):
                            destination.write(chunk)
                        destination.close()
                    if file.name.split(".")[-1] == "docx":
                        # docx input: also produce a PDF copy; count words on the docx.
                        convert(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name), os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name.split(".")[0] + ".pdf"))
                        word_count, text = word_counter(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name))
                    else:
                        # pdf input: convert to a temporary docx just for word counting.
                        pdf2docx.parse(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name), os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name.split(".")[0] + ".docx"))
                        word_count, text = word_counter(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name.split(".")[0] + ".docx"))
                        os.unlink(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name.split(".")[0] + ".docx"))
                    # Run the text-analysis loggers on the extracted text.
                    createRnPlogs(text, unique_id)
                    gDF = createSpellLogs(text, unique_id)
                    createIzYzLogs(gDF, unique_id)
                    createGrammarLogs(text, unique_id)
                    createCommaLogs(text, unique_id)
                    # client.process(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript"), os.path.join(UPLOAD_LOCATION, unique_id, "manuscript"), 10, "processHeaderDocument", False, 1, 0, True, False)
                    coI, fund = 0, 0
                    # Scan TEI section headings for conflict-of-interest / funding flags.
                    # NOTE(review): the path uses the literal string "unique_id" as the
                    # folder, not the unique_id variable — looks like a bug; confirm.
                    headingList = minidom.parse(os.path.join(UPLOAD_LOCATION, "unique_id", "manuscript", file.name.split(".")[0] + ".tei.xml")).getElementsByTagName('head')
                    for head in headingList:
                        if head.firstChild:
                            if "conflict of interest" in head.firstChild.data.lower():
                                coI = 1
                            if "funding" in head.firstChild.data.lower():
                                fund = 1
                    # Same literal "unique_id" path here — see note above.
                    with open(os.path.join(UPLOAD_LOCATION, "unique_id", "manuscript", file.name.split(".")[0] + ".tei.xml"), 'rb') as tei:
                        soup = bs4.BeautifulSoup(tei, 'lxml')
                        # Each metadata field is best-effort: fall back to ""/[] on failure.
                        try:
                            a_title = soup.title.getText()
                        except:
                            a_title = ""
                        try:
                            a_type = soup.title.get_attribute_list("type")[0]
                        except:
                            a_type = ""
                        try:
                            date = soup.date.getText()
                        except:
                            date = ""
                        try:
                            authors = ([a.persname.getText(" ") for a in soup.analytic.findAll("author")] if soup.analytic.parent.parent.name=="sourcedesc" else [])
                        except:
                            authors = []
                        try:
                            abstract = soup.abstract.getText(separator=' ', strip=True)
                        except:
                            abstract = ""
                        try:
                            n_tables = len(list(dict.fromkeys(soup.findAll("table"))))
                        except:
                            n_tables = ""
                        try:
                            n_figures = sum([1 for f in soup.findAll("figure") if ("fig" in f.get_attribute_list("xml:id")[0] and type(list(f.children)[0])==bs4.element.NavigableString)])
                        except:
                            n_figures = ""
                        try:
                            doi = soup.find('idno', type='DOI').getText()
                        except:
                            doi = ""
                    mail = request.user.email
                    c_interest = coI
                    funding = fund
                    # Values rendered back into the form for the user to review.
                    ppValues = {"Content_Text": text, "Ref_ID": unique_id, "Mail_ID": mail, "Article_Title": a_title, "Article_Type": a_type, "Published_Date": date, "Authors": authors, "No_of_Figures": n_figures, "No_of_Tables": n_tables, "Abstract": abstract, "Special_Instructions": "none", "DOI": doi, "Conflict_of_Interest": c_interest, "Funding": fund, "Word_Count": word_count}
                    if file.name.split(".")[-1] == "docx":
                        # Drop the intermediate PDF created for docx inputs.
                        os.unlink(os.path.join(UPLOAD_LOCATION, unique_id, "manuscript", file.name.split(".")[0] + ".pdf"))
                else:
                    logs.append("File : {} - Upload Error : Not Supported".format(str(file.name)))
            # Save uploaded images (allowed formats only).
            for file in request.FILES.getlist("images"):
                if file.name.split(".")[-1] in ALLOWED_IM_FORMATS:
                    logs.append("Image : {} - Successfully Uploaded".format(str(file.name)))
                    with open(os.path.join(UPLOAD_LOCATION, unique_id, "images", file.name), 'wb+') as destination:
                        for i, chunk in enumerate(file.chunks()):
                            destination.write(chunk)
                else:
                    logs.append("Image : {} - Upload Error : Not Supported".format(str(file.name)))
            # Save any other attachments without format checking.
            for file in request.FILES.getlist("others"):
                logs.append("Other : {} - Successfully Uploaded".format(str(file.name)))
                with open(os.path.join(UPLOAD_LOCATION, unique_id, "others", file.name), 'wb+') as destination:
                    for i, chunk in enumerate(file.chunks()):
                        destination.write(chunk)
            context = {'logs':"\n".join(logs)}
            context.update(ppValues)
            return render(request, '../templates/ingPage.html', context)
        if request.POST.get("button") == "SUBMIT":
            # Collect the (possibly user-edited) metadata from the form.
            mail = request.POST['mail_id']
            doi = request.POST['doi']
            aTitle = request.POST['article_title']
            aType = request.POST['article_type']
            abstract = request.POST['abstract']
            date = request.POST['published_date']
            author = request.POST.getlist("authors")
            figures = request.POST['no_of_figures']
            tables = request.POST['no_of_tables']
            instruct = request.POST['spl_instruct']
            cInterest = request.POST['c_Interest']
            funding = request.POST['funding']
            refID = request.POST['unique_id']
            wCount = request.POST['word_count']
            message = request.POST['message']
            # text = request.POST['content_text']
            username = request.user.username
            dbValues = {"username": username, "mail id": mail, "message": message, "doi":doi, "word count":wCount,"article title": aTitle, "article type": aType, "published date": date, "authors": author, "no of figures": figures, "no of tables": tables, "abstract": abstract, "special instructions":instruct, "conflict of interest":cInterest, "funding information":funding}
            print("Values to be saved in Database : ", dbValues)
            # Reject the submission when the declared figure count does not
            # match the number of uploaded images.
            if not len(os.listdir(os.path.join(UPLOAD_LOCATION, refID, "images"))) == int(figures):
                ppValues = {"Ref_ID": refID, "Mail_ID": mail, "Article_Title": aTitle, "Article_Type": aType, "Published_Date": date, "Authors": author, "No_of_Figures": "", "No_of_Tables": tables, "Abstract": abstract, "Special_Instructions": instruct, "DOI": doi, "Conflict_of_Interest": cInterest, "Funding": funding, "Word_Count": wCount}
                context = {'logs':"Submission Error : Image count", "alert":"#ff8888"}
                context.update(ppValues)
                return render(request, '../templates/ingPage.html', context)
                # return render(request, '../templates/ingPage.html', {"logs":"Submission Error : Image count", "alert":"#ff8888"})
            # Push every uploaded file to cloud storage, then delete the local copy.
            for folder in os.listdir(os.path.join(UPLOAD_LOCATION, refID)):
                for file in os.listdir(os.path.join(UPLOAD_LOCATION, refID, folder)):
                    path_on_cloud = CLOUD_LOCATION
                    path_on_local = os.path.join(UPLOAD_LOCATION, refID, folder, file)
                    storage.child(path_on_cloud).child(refID).child(folder).child(file).put(path_on_local)
                    os.unlink(path_on_local)
            database.child(CLOUD_LOCATION).child(refID).set(dbValues)
            # Send a confirmation mail if the mail service comes up.
            if mailService.start():
                logs.append("Confirmation mail has been sent to " + str(mail))
                mailService.send(mail, "Confirmation mail for your Submission", "Amnet Systems\nReference ID\t:\t{} \nYour request has been submitted successfully".format(refID))
            logs.append("Database : Updated Successfully")
            context = {'logs':"\n".join(logs), "alert":"#99ff99"}
            return render(request, '../templates/ingPage.html', context)
    # GET (or unrecognised POST): render the empty ingestion page.
    context = {'logs': "", "alert":"#D6EAF8", "Conflict_of_Interest": 0, "Funding": 0}
    return render(request, '../templates/ingPage.html', context)
def ultimate_converter(give_path, get_folder, get_format, type=False, cut_text=False, crop_gif=['00:00:00.00', '00:00:01.00', 1], tags=False):
    # Universal file converter: routes give_path -> get_folder/<name>.<get_format>
    # through type-specific branches (text / image / audio / video) and writes a
    # JSON sidecar describing the produced file(s).
    # NOTE(review): `type` shadows the builtin, and crop_gif is a mutable
    # default (read-only here, so benign) — worth fixing upstream.
    ask_continue = True
    # check paths: retry each path relative to BASE_DIR before giving up.
    if not os.path.exists(get_folder) and ask_continue:
        get_folder = BASE_DIR + '/' + get_folder
        print(get_folder)
        if not os.path.exists(get_folder):
            print('path end are not exist')
            ask_continue = False
    if not os.path.exists(give_path) and ask_continue:
        give_path = BASE_DIR + '/' + give_path
        print(give_path)
        if not os.path.exists(give_path):
            print('path start are not exist')
            ask_continue = False
    if ask_continue:
        # give format: last word-run in the path is taken as the extension.
        pattern = re.compile(r'\w+')
        give_format = pattern.findall(give_path)[-1]
        print(give_format)
        if get_format != give_format:
            print('corect format')
            # get name: microsecond timestamp + one random digit.
            ticks = time.time()
            get_name = str(round(ticks * 1000000)) + str(random.randint(0, 9))
            # generate path
            get_path = get_folder + get_name + '.' + get_format
            print(get_path)
            # get object: the format/type lookup table.
            type_arr = get_obj()
            if not type:
                # available type: collect every category that accepts the input format.
                check_arr = []
                if give_format in type_arr['text']['get']:
                    check_arr.append('text')
                if give_format in type_arr['image']['get']:
                    check_arr.append('image')
                if give_format in type_arr['audio']['get']:
                    check_arr.append('audio')
                if give_format in type_arr['video']['get']:
                    check_arr.append('video')
                if give_format in type_arr['object']['get']:
                    check_arr.append('object')
                if give_format in type_arr['subtitles']['get']:
                    check_arr.append('subtitles')
                if give_format in type_arr['code']['get']:
                    check_arr.append('code')
                if give_format in type_arr['font']['get']:
                    check_arr.append('font')
                if give_format in type_arr['archive']['get']:
                    check_arr.append('archive')
                print(check_arr)
                # select type: exit if none match; ask interactively when ambiguous.
                if len(check_arr) == 0:
                    sys.exit()
                elif len(check_arr) > 1:
                    i = False
                    while i == False:
                        try:
                            select_type = input(
                                'select type format like index:')
                            select_type = int(select_type)
                            # IndexError here re-prompts via the except branch.
                            check_arr[select_type]
                            i = True
                        except:
                            print('wrong type')
                else:
                    select_type = 0
                    print('only one type')
                type = check_arr[select_type]
            # check is format in array
            #print([get_format,type_arr[type]['get']])
            #print([give_format,type_arr[type]['give']])
            if get_format in type_arr[type]['get'] and give_format in type_arr[
                    type]['give']:
                print('corect format')
                json_settings = []
                # Type TEXT
                if type == 'text':
                    import pypandoc
                    print('type text')
                    if give_format == 'pdf':
                        print('give_file is pdf file')
                        # convert pdf to docx
                        from pdf2docx import parse
                        # get new path
                        get_doc_path = get_folder + get_name + '.docx'
                        parse(give_path, get_doc_path, start=0, end=None)
                        # convert docx to format
                        if get_format != 'docx':
                            output = pypandoc.convert_file(get_doc_path,
                                                           get_format,
                                                           outputfile=get_path)
                            # delete old file
                            os.remove(get_doc_path)
                    else:
                        output = pypandoc.convert_file(give_path,
                                                       get_format,
                                                       outputfile=get_path)
                # Type IMAGE
                elif type == 'image':
                    print('type image')
                    from PIL import Image
                    arr = get_obj('doctoimg') + get_obj('img_layers')
                    arr2 = get_obj('vid')
                    if give_format in arr:
                        # Multi-page/layered input: rasterize every page to PNG
                        # in its own folder, then re-encode if needed.
                        print('give_file is pdf file')
                        from wand.image import Image as wi
                        # create new path
                        path = get_folder + get_name
                        os.mkdir(path)
                        pdf = wi(filename=give_path, resolution=300)
                        pdfimage = pdf.convert("png")
                        i = 1
                        # create images
                        for img in pdfimage.sequence:
                            page = wi(image=img)
                            img_path = path + '/' + str(i) + ".png"
                            page.save(filename=img_path)
                            # create info
                            file_info = {
                                'filename': str(i),
                                'format': get_format,
                                'path': get_folder + get_name + '/'
                            }
                            # cut text: OCR the image and attach the text to the info.
                            if cut_text:
                                text = get_text_image(get_path, 600, 120)
                                print(text)
                                file_info['text'] = []
                                file_info['text'].append({
                                    'x': 0,
                                    'y': 0,
                                    'width': 0,
                                    'height': 0,
                                    'text': text
                                })
                            # add tags
                            if tags:
                                file_info['tags'] = tags
                            # add to json
                            json_settings.append(file_info)
                            # change format: re-encode the PNG when another format was asked.
                            if get_format != 'png':
                                image = Image.open(img_path)
                                arr = get_obj('img_jpg')
                                if get_format in arr:
                                    image = image.convert('RGB')
                                cur_path = path + '/' + str(
                                    i) + '.' + get_format
                                image.save(cur_path)
                                # delete old file
                                os.remove(img_path)
                            i += 1
                    elif give_format in arr2 and get_format == 'gif':
                        print('crop gif')
                        # 1,22.65 = 1 min 22 sec 65 milisec
                        movie_start = crop_gif[0]
                        movie_end = crop_gif[1]
                        try:
                            size = crop_gif[2]
                        except:
                            size = 1
                        from moviepy.editor import VideoFileClip
                        clip = (VideoFileClip(give_path).subclip(
                            (movie_start), (movie_end)).resize(size))
                        clip.write_gif(get_path)
                    else:
                        # Plain single image conversion.
                        image = Image.open(give_path)
                        arr = get_obj('img_jpg')
                        if get_format in arr:
                            image = image.convert('RGB')
                        image.save(get_path)
                        # create info
                        file_info = {
                            'filename': get_name,
                            'format': get_format,
                            'path': get_folder
                        }
                        # cut text
                        if cut_text:
                            text = get_text_image(get_path, 600, 120)
                            print(text)
                            # add text
                            file_info['text'] = []
                            file_info['text'].append({
                                'x': 0,
                                'y': 0,
                                'width': 0,
                                'height': 0,
                                'text': text
                            })
                        # add to json
                        if tags:
                            file_info['tags'] = tags
                        json_settings.append(file_info)
                # Type AUDIO
                elif type == 'audio':
                    import ffmpy
                    print('type audio')
                    if get_format == 'mp3':
                        from moviepy.editor import VideoFileClip
                        # create new path
                        get_mp3_path = get_folder + get_name + '.mp3'
                        # Create file: extract the audio track to mp3 first.
                        videoclip = VideoFileClip(give_path)
                        audioclip = videoclip.audio
                        audioclip.write_audiofile(get_mp3_path)
                        audioclip.close()
                        videoclip.close()
                        # change format
                        if get_format != 'mp3':
                            ff = ffmpy.FFmpeg(inputs={get_mp3_path: None},
                                              outputs={get_path: None})
                            ff.run()
                            # delete old file
                            os.remove(get_mp3_path)
                    else:
                        ff = ffmpy.FFmpeg(inputs={give_path: None},
                                          outputs={get_path: None})
                        ff.run()
                # Type VIDEO
                elif type == 'video':
                    print('type video')
                    if get_format == 'gif':
                        # 1,22.65 = 1 min 22 sec 65 milisec
                        movie_start = crop_gif[0]
                        movie_end = crop_gif[1]
                        from moviepy.editor import VideoFileClip
                        clip = (VideoFileClip(give_path).subclip(
                            (movie_start), (movie_end)).resize(1))
                        clip.write_gif(get_path)
                    else:
                        import ffmpy
                        ff = ffmpy.FFmpeg(inputs={give_path: None},
                                          outputs={get_path: None})
                        ff.run()
                # create standart info: fallback sidecar entry when no branch added one.
                if len(json_settings) == 0:
                    # create info
                    file_info = {
                        'filename': get_name,
                        'format': get_format,
                        'path': get_folder,
                        'page': 0
                    }
                    if tags:
                        file_info['tags'] = tags
                    json_settings.append(file_info)
                # Write JSON
                json_file = open(get_folder + get_name + '.json', 'w+')
                json_file.write(json.dumps(json_settings))
                json_file.close()
            else:
                print('wrong format')
            return get_path
        else:
            print('format should not be the same')
    else:
        print('ask_continue False')
def pdf_to_docx(pdf_file, pages, docx_file):
    """Convert a single 1-based page of *pdf_file* into *docx_file*.

    Returns the docx path for chaining.
    """
    # pdf2docx wants 0-based page numbers.
    page_index = pages - 1
    parse(pdf_file, docx_file, pages=[page_index])
    return docx_file
def pdf2docx(self):
    """Convert the current data file to docx and repoint state at the result."""
    target = self.data_file_path + '.docx'
    pdf2docx.parse(self.data_file_path, target)
    # Subsequent processing should operate on the converted document.
    self.data_file_path = target
    self.filetype = 'docx'
    return None
def PdfConverted():
    """Convert the currently selected PDF to a fixed docx path, then clear
    the selection so the same file is not converted twice."""
    parse(root.filename, "C:\\Users\\Shravan Sheri\\Documents\\file.docx")
    root.filename = ""
def pdf_to_docx(file, pages, input_docx_location):
    """Write the 1-based page *pages* of *file* to *input_docx_location*.

    Returns the output location.
    """
    # pdf2docx takes 0-based page indices.
    parse(file, input_docx_location, pages=[pages - 1])
    return input_docx_location
def pdfToDocx(file_path, to_path):
    """Convert every page of the PDF at *file_path* into a docx at *to_path*."""
    doc = fitz.open(file_path)
    try:
        count = doc.pageCount
    finally:
        # Release the PyMuPDF handle — the original left it open.
        doc.close()
    # pdf2docx treats `end` as exclusive: end=count includes the final page
    # (the original's `count - 1` silently dropped the last page).
    parse(file_path, to_path, start=0, end=count)
def pdf_to_word(pdf_file):
    """Convert *pdf_file* to a docx file named 'word.docx' in the CWD."""
    # convert to docx
    word_file = 'word.docx'
    parse(pdf_file, word_file)