def docx_to_html(self, file, method=None):
    """Convert a .docx file to HTML.

    UNC paths (starting with a backslash) are read over SMB; anything else is
    treated as relative to ``document_location`` on the local filesystem.

    :param file: UNC share path or local relative file name
    :param method: unused; kept for interface compatibility
    :return: the generated HTML string
    """
    def _read_share(path):
        # One SMB read attempt; credentials come from module globals.
        with smbclient.open_file(r"{}".format(path), mode='rb',
                                 username=smb_username, password=smb_password) as f:
            return mammoth.convert_to_html(f).value

    print('entering docx to html')
    if file.startswith('\\'):
        print('connecting to SMB share')
        try:
            try:
                html = _read_share(file)
            except Exception:
                # was a bare `except:` — a stale cached connection can fail;
                # reset the cache and retry once
                smbclient.reset_connection_cache()
                html = _read_share(file)
            print('file found')
        finally:
            smbclient.reset_connection_cache()
    else:
        print('local')
        file = document_location + file
        html = mammoth.convert_to_html(file).value
    return html
def table_extraction(Path):
    """Extract every table from a docx as nested lists of cell text.

    Returns (tables, soup): tables is a list of tables, each a list of rows,
    each a list of cell strings; soup is the parsed HTML document.
    """
    soup = BeautifulSoup(mammoth.convert_to_html(Path).value, "html.parser")

    def _cell_text(td):
        # Mark bold runs and line breaks before stripping tags, then restore
        # the bold markers as literal <b>/</b> in the plain text.
        marked = (str(td)
                  .replace('<strong>', 'start_bold')
                  .replace('</strong>', 'end_bold')
                  .replace("<br>", '\n')
                  .replace("<br/>", '\n'))
        plain = BeautifulSoup(marked, 'html.parser').text
        return (plain.replace('start_bold', '<b>')
                     .replace('end_bold', '</b>')
                     .replace('\u200b', '')
                     .replace('\xa0', '')
                     .strip())

    tables = []
    for table_tag in soup.find_all('table'):
        rows = []
        for tr in table_tag.find_all('tr'):
            cells = [_cell_text(td) for td in tr.find_all('td') if td.text]
            if cells:
                rows.append(cells)
        if rows:
            tables.append(rows)
    return tables, soup
def processDoc(documentPath, htmlDirectory):
    """Convert one .docx to HTML, strip identifying info, and file it.

    Files containing the "LC Class" marker go to goodFiles/, others to
    badFiles/. Errors are reported but do not propagate.
    """
    basename = os.path.basename(documentPath)[0:-5].replace(" ", "_")
    goodDirectory = htmlDirectory + "goodFiles/"
    badDirectory = htmlDirectory + "badFiles/"
    try:
        with open(documentPath, "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file)
        soup = BeautifulSoup(result.value, 'html.parser')
        # Patterns of personal/identifying info to remove.
        regExpList = [
            r'Coffman',
            r'149 Atlantic',
            r'Swampscott',
            r'\$\d*\.\d\d'  # strip dollar amounts, etc.
        ]
        stripInfo(soup, regExpList)
        html = str(soup)
        # determine if file is annotated and place accordingly
        if "LC Class" in html:
            htmlPath = goodDirectory + basename + '.html'
        else:
            htmlPath = badDirectory + basename + '.html'
        with open(htmlPath, 'w') as fp:
            fp.write(html)
    except Exception:
        # was a bare `except:` which also caught SystemExit/KeyboardInterrupt
        print("html conversion error: " + documentPath)
def converter(in_file, out_file):
    """Convert a .docx input to the format implied by out_file's extension.

    HTML goes through mammoth locally; other supported formats go through
    convertapi. Prints True on success, False plus a reason otherwise.
    """
    in_file = os.path.abspath(in_file)
    out_file = os.path.abspath(out_file)
    in_ext = get_extension(in_file)
    out_ext = get_extension(out_file)

    if in_ext != "docx":
        print(False)
        print("Invalid input extension")
        return

    if out_ext == "html":
        # Local conversion: no external service needed for HTML.
        with open(in_file, "rb") as docx_file:
            generated = mammoth.convert_to_html(docx_file).value
        with open(out_file, "w+") as out:
            out.write(generated)
        print(True)
    elif out_ext in ("jpg", "pdf", "pdfa", "png", "tiff", "txt", "zip"):
        # Remote conversion via convertapi for binary formats.
        conversion = convertapi.convert(out_ext, {'File': in_file}, from_format=in_ext)
        conversion.file.save(out_file)
        print(True)
    else:
        print(False)
        print("Invalid output extension")
def convert(docfile, htmlfile):
    """Render *docfile* (.docx) to UTF-8 encoded HTML written to *htmlfile*.

    :param docfile: path to the input .docx
    :param htmlfile: path the HTML bytes are written to
    """
    # Context managers guarantee both handles are closed even if the
    # conversion raises (the original leaked them on error).
    with open(docfile, 'rb') as f, open(htmlfile, 'wb') as b:
        document = mammoth.convert_to_html(f)
        b.write(document.value.encode('utf8'))
def parseTables(self, table, index, soup):
    # Flatten one Word table into dicts on self.data (one per column, keyed by
    # the Field* names) and build a secondary dict from a fixed block of cells.
    # NOTE(review): layout reconstructed from collapsed source — confirm loop
    # boundaries against the original file.
    keys = ("Field1", "Field2", "Field3", "Field4", "Field5", "Field6",
            "Field7", "Field8", "Field9", "Field10", "Field11", "Field12",
            "Field13")
    subKeys = ("Sub-Field-1", "Sub-Field-2", "Sub-Field-3", "Sub-Field-4",
               "Sub-Field-5", "Sub-Field-6", "Sub-Field-7", "Sub-Field-8",
               "Sub-Field-9", "Sub-Field-10", "Sub-Field-11", "Sub-Field-12",
               "Sub-Field-13", "Sub-Field-14", "Sub-Field-15", "Sub-Field-16",
               "Sub-Field-17", "Sub-Field-18", "Sub-Field-19", "Sub-Field-20",
               "Sub-Field-21")
    for i, column in enumerate(table.columns):
        # NOTE(review): this generator is never consumed, so no HTML conversion
        # actually happens here — looks like dead code.
        result = (mammoth.convert_to_html(cell) for cell in column.cells)
        text = (cell.text.strip() for cell in column.cells)
        # Columns 0 and 2 are skipped (presumably labels / sub-field columns).
        if i == 0:
            continue
        if i == 2:
            continue
        row_data = dict(zip(keys, text))
        self.data.append(row_data)
    sub = []
    # self.progress.setValue(index)
    print(index)
    # Sub-fields occupy a fixed block: rows 13..33 of column 2.
    for x in range(21):
        sub.append(table.cell(13 + x, 2).text)
    # NOTE(review): sub_data is built but never stored or returned — confirm intent.
    sub_data = dict(zip(subKeys, sub))
def docx_to_html(cls, filepath):
    """Convert the .docx at *filepath* to HTML; print and return the markup.

    :param filepath: path to the input .docx
    :return: the generated HTML string (previously discarded)
    """
    with open(filepath, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    html = result.value  # The generated HTML
    print(html)
    # Any messages, such as warnings during conversion
    print(result.messages)
    # Returning the HTML makes the method usable programmatically; callers
    # that ignored the old None return are unaffected.
    return html
def doctorPython(self):
    # Qt slot: parse every table of the selected .docx via parseTables and
    # dump the accumulated self.data to data.json, guarding against re-entry
    # with the self.butt flag. Layout reconstructed from collapsed source.
    # self.worker.moveToThread(self.thread)
    print('here')
    # self.worker.start()
    # self.thread.started.connect(self.worker.doctorPython)
    # self.thread.start()
    # self.worker.emit('hello there')
    # self.worker.dropped.connect(self.btn_click)
    # self.doctorPython(self.doc)
    # self.thread.start()
    self.b1.setEnabled(False)
    # 'hello' appears to be the sentinel for "no document selected";
    # self.butt guards against double-clicks while processing.
    if (self.doc != 'hello' and self.butt == False):
        self.butt = True
        document = Document(self.doc)
        # NOTE(review): `table` is assigned but never used.
        table = document.tables[0]
        with open(self.doc, "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file)
            html = result.value  # The generated HTML
            messages = result.messages  # Any messages, such as warnings during conversion
            soup = BeautifulSoup(html, "lxml")
        print('hello hello')
        for x in range(len(document.tables)):
            print('doctor')
            # QtCore.QCoreApplication.processEvents()
            self.parseTables(document.tables[x], x, soup)
        # Reveal the output file in Explorer (Windows-only).
        subprocess.Popen(r'explorer /select,"data.json"')
        # Truncate then rewrite data.json with the collected rows.
        open('data.json', 'w').close()
        with open('data.json', 'w') as outfile:
            json.dump(self.data, outfile)
        self.butt = False
def explicit_style_map_is_combined_with_embedded_style_map():
    """An explicit style_map argument is layered on top of the map embedded in the docx."""
    docx_path = test_path("embedded-style-map.docx")
    with open(docx_path, "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, style_map="r => strong")
        assert_equal("<h1><strong>Walking on imported air</strong></h1>", conversion.value)
        assert_equal([], conversion.messages)
def read_meta_at_path(self, filepath):
    """Read a two-column docx table into a {concept_key: concept_value} dict.

    The left-hand cell of each table row names a concept; the right-hand
    cell holds its value (plus optional footnotes). Conversion warnings are
    routed to self.debug.
    """
    meta = {}
    with open(filepath, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file, **self.get_mammoth_options())
        html = result.value
        if len(result.messages) > 0:
            self.debug('Messages while reading ' + filepath)
            for message in result.messages:
                self.debug(str(message))
    # Wrap in <body> so pyquery has a single root to query against.
    d = pq('<body>' + html + '</body>')
    self.clean_html(d)
    # First-column paragraphs are the concept-name cells.
    concept_columns = d.find('table > tr > td:first-child > p')
    for concept_column_left in concept_columns:
        concept_name = self.parse_concept_name(concept_column_left, d)
        if self.is_concept_name_valid(concept_name):
            # Sibling <td> in the same row holds the concept's value.
            concept_column_right = d(concept_column_left).parent().siblings('td')
            footnotes = self.parse_footnotes(concept_column_right, d)
            if self.is_concept_value_valid(concept_column_right, d):
                concept_value = self.parse_concept_value(concept_column_right, d)
                if len(footnotes) > 0:
                    concept_value += self.wrap_footnotes(footnotes)
                concept_key = self.get_concept_key(concept_name)
                meta[concept_key] = concept_value
    return meta
def docx2html(infile):
    """Convert *infile* (.docx) to pretty-printed HTML, logging converter messages."""
    with open(infile, 'rb') as fp:
        conversion = mammoth.convert_to_html(fp)
    for message in conversion.messages:
        print("Mammoth %s: %s" % (message.type, message.message))
    return prettyprint_html(conversion.value, infile)
def convert_text(filename):
    """Convert the post/page content using the converters.

    Dispatch on the extension found in *filename* and return the converted
    body followed by two newlines; unknown types yield "NULL".
    """
    # Each branch opens its own handle inside `with` — the original opened
    # the file once up-front, never closed it, and opened it even for the
    # .docx branch (which reads a different path entirely).
    if ".md" in filename:
        with open(filename, "r") as fh:
            text_cont1 = "\n" + markdown.markdown(fh.read()) + "\n"
    elif ".docx" in filename:
        with open(os.path.join(cwd, "content", filename), "rb") as docx_file:
            text_cont1 = "\n" + mammoth.convert_to_html(docx_file).value + "\n"
    elif ".tile" in filename:
        with open(filename, "r") as fh:
            text_cont1 = "\n" + textile.textile(fh.read()) + "\n"
    elif ".jade" in filename:
        with open(filename, "r") as fh:
            text_cont1 = "\n" + pyjade.simple_convert(fh.read()) + "\n"
    elif ".rst" in filename:
        with open(filename, "r") as fh:
            text_cont1 = "\n" + \
                publish_parts(fh.read(), writer_name='html')['html_body'] + "\n"
    elif ".html" in filename or ".txt" in filename:
        # Already-HTML and plain-text content passes through unchanged.
        with open(filename, "r") as fh:
            text_cont1 = fh.read()
    else:
        print(filename + " is not a valid file type!")
        text_cont1 = "NULL"
    return text_cont1 + "\n\n"
def post(self, request, format=None):
    """Receive an uploaded doc/docx/txt file, convert it to raw text plus
    HTML, forward both to the analysis service, and return its response.
    """
    # tipo_analisis = request.POST['tipo_analisis']
    uploaded_file = request.FILES['file']
    file_name = uploaded_file.name
    # [-1] instead of [1]: robust to dots inside the base name.
    file_extension = file_name.split(".")[-1]
    with open('backendFondecyt/Docs/' + file_name, 'wb+') as destination:
        for chunk in uploaded_file.chunks():
            destination.write(chunk)
    if (file_extension == "doc"):
        file_name = self.converDocToDocx(file_name)
    if (file_extension == "doc" or file_extension == "docx"):
        with open('backendFondecyt/Docs/' + file_name, "rb") as docx_file:
            rawText = mammoth.extract_raw_text(docx_file).value
            # BUG FIX: extract_raw_text consumed the stream; without this
            # rewind convert_to_html would read an empty/exhausted file.
            docx_file.seek(0)
            html = mammoth.convert_to_html(docx_file).value
    if (file_extension == "txt"):
        with open('backendFondecyt/Docs/' + file_name, "r",
                  encoding="utf-8") as txt_file:
            rawText = txt_file.read()
            html = ""
            # BUG FIX: read() left the pointer at EOF, so the line loop below
            # produced no paragraphs; rewind before iterating.
            txt_file.seek(0)
            for line in txt_file:
                stripped_line = line.rstrip()
                if (stripped_line.strip() != ""):
                    html += "<p>" + line + "</p>"
    payload = {'texto': rawText, 'html': html}
    data = requests.post('http://redilegra.com/general', data=payload)
    data = json.loads(data.text.encode('utf8'))
    os.remove('backendFondecyt/Docs/' + file_name)
    return Response(data, status.HTTP_201_CREATED)
def images_stored_outside_of_document_are_included_in_output():
    """Externally-stored images are inlined into the HTML as base64 data URIs."""
    expected = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""
    with open(test_path("external-picture.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal(expected, conversion.value)
        assert_equal([], conversion.messages)
def embedded_style_map_is_used_if_present():
    """A style map embedded via embed_style_map drives the conversion."""
    with _copy_of_test_data("single-paragraph.docx") as source:
        mammoth.embed_style_map(source, "p => h1")
        conversion = mammoth.convert_to_html(fileobj=source, ignore_empty_paragraphs=False)
        assert_equal("<h1>Walking on imported air</h1>", conversion.value)
        assert_equal([], conversion.messages)
def docx2json(fileName):
    """Split a docx into <h1>-delimited sections, writing each section as an
    .html file and a {title, body} .json file named after its heading.
    """
    with open(path + "\\docFiles\\" + fileName + ".docx", "rb") as docx_file:
        # convert to HTML
        markup = mammoth.convert_to_html(docx_file).value
    # split() drops the delimiter, so re-attach <h1> to every piece.
    sections = ["<h1>" + piece for piece in markup.split("<h1>")]
    # sections[0] is whatever preceded the first heading — skip it.
    for section in sections[1:]:
        soup = BeautifulSoup(section, 'html.parser')
        title = soup.h1.string
        # Alphanumeric-only heading text, used as the output file stem.
        cleanh1 = re.sub('[^0-9a-zA-Z]+', ' ', str(soup.h1.string))
        data = {'title': title}
        with open(path + "\\htmlFiles\\" + cleanh1 + ".html", "w") as html_out:
            html_out.write(str(soup))
        print("Converstion of " + fileName + ".docx completed")
        # Drop the heading itself from the JSON body.
        for heading in soup.find_all('h1'):
            heading.replaceWith('')
        data['body'] = str(soup)
        print("Converstion of JSON for " + fileName + ".docx started")
        with open(path + "\\jsonFiles\\" + cleanh1 + ".json", "w") as json_out:
            json_out.write(json.dumps(data))
        print("Converstion of JSON for " + fileName + ".docx completed")
def sendNotifyUpload(req):
    # Django view: an authenticated type-2 user uploads a docx whose HTML
    # rendering is stored as a Notify record. Responds with a JSON status
    # (0 = ok, 1 = error, -1 = not logged in) and a message on failure.
    # NOTE(review): non-POST requests fall through with `result` undefined
    # and return None — confirm routing only allows POST.
    if(req.method == 'POST'):
        try:
            result = {'status': 1}
            username = req.POST.get('username')
            token = req.POST.get('token')
            f = req.FILES.get('file')
            title = req.POST.get('title')
            user = models.User.objects.get(username=username)
            if(token == getToken(user, token_exp_time)):
                if(user.user_type == 2):
                    updateToken(user)
                    # do real work here
                    f.seek(0)
                    converted = mammoth.convert_to_html(f)
                    html = converted.value
                    # NOTE(review): `html` is unused; the HTML is stored in
                    # the `link` field — confirm the field name is intended.
                    models.Notify.objects.create(
                        title=title, link=converted.value)
                    result['status'] = 0
                else:
                    result['message'] = '无操作权限'
            else:
                result['status'] = -1
                result['message'] = '用户未登录'
        except Exception as e:
            print(e)
            result['message'] = '请求无效'
        finally:
            # `finally: return` deliberately swallows any in-flight exception
            # and always answers with the JSON status.
            return JsonResponse(result)
def docx2html(path: str):
    """Convert every .docx under *path* (inside DEFAULT_FOLDER) to .html.

    :param path: directory (relative to DEFAULT_FOLDER) containing docx files
    """
    import mammoth
    if check_path(path):
        return
    path = os.path.join(DEFAULT_FOLDER, path)
    # Refuse to operate outside the sandbox folder.
    if not path.startswith(DEFAULT_FOLDER):
        return
    for entry in ListDir(path):
        current = os.path.join(path, entry)
        # Anything that is not a .docx is simply removed.
        if not entry.endswith('.docx'):
            drop_file(current)
            continue
        with open(full_path(current), 'rb') as docx_file:
            try:
                converted = mammoth.convert_to_html(docx_file)
            except Exception as err:
                drop_file(current)
                logger.info('[ERROR]: %s' % err)
                continue
        #messages = converted.messages
        destination = os.path.join(path, entry.replace('.docx', '.html'))
        with open_file(destination, 'w+') as out:
            out.write(converted.value)
        # The source docx is dropped once its HTML twin exists.
        drop_file(current)
def extract(self, target_dir):
    """Convert every .docx in self.docx_dir to blog<N>-<name>.html under
    *target_dir*, extracting images into per-post blog<N>_images folders
    via the self.convert_image handler.
    """
    self.target_dir = target_dir
    files = os.listdir(self.docx_dir)
    files = [
        os.path.join(self.docx_dir, f) for f in files if f.endswith(".docx")
    ]
    #files.sort(key = lambda x: os.path.getmtime(x))
    #files.reverse()
    for i, docx_filename in enumerate(sorted(files)):
        print(docx_filename)
        # Per-post image folder; convert_image presumably writes into it.
        self.image_dir = "blog" + str(i) + "_images"
        if not os.path.exists(os.path.join(target_dir, self.image_dir)):
            os.mkdir(os.path.join(target_dir, self.image_dir))
        # NOTE(review): split("/") assumes POSIX separators — would keep the
        # full path on Windows; os.path.basename would be portable.
        html_name = "blog" + str(i) + "-" + str(
            docx_filename.split("/")[-1]).replace(".docx", ".html")
        with open(target_dir + "/" + html_name, 'w') as html_file:
            with open(docx_filename, "rb") as docx_file:
                result = mammoth.convert_to_html(
                    docx_file,
                    convert_image=mammoth.images.img_element(
                        self.convert_image))
                html = result.value  # The generated HTML
                messages = result.messages  # Any messages, such as warnings during conversion
                if messages:
                    print("Parsing Message: " + str(messages))
                html_file.write(html)
def upload_file(request):
    """Handle a document upload; for .docx uploads, convert the body to HTML
    and store it on the Document record before re-rendering the form.
    """
    # NOTE(review): `users` is queried but never used in this view.
    users = User.objects.filter(is_active=True).order_by('email')
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            doc = form.save(commit=False)
            doc.created_by = request.user
            doc.save()
            if doc.document:
                # [-1] instead of [1]: names like "report.v2.docx" broke the
                # original check (and dot-less names raised IndexError).
                if str(doc.document).split('.')[-1] == 'docx':
                    print(doc.document)
                    with open("media/" + str(doc.document).replace(" ", "_"),
                              "rb") as docx_file:
                        result = mammoth.convert_to_html(docx_file)
                    html = result.value  # The generated HTML
                    print(html)
                    doc.created_by = str(request.user)
                    doc.doc_body = html
                    doc.save()
    else:
        form = DocumentForm()
    return render(request, "upload_file.html", {
        'form': form,
    })
def open_File(self):
    # Let the user pick a file: PDFs open in the browser, .doc files are
    # converted to HTML via mammoth, everything else loads into the editor.
    if self.textEdit.toPlainText() == self.saved_data:
        self.textEdit.setText("")
        pass
    # Change textEdit to QWebview
    # Read contents of the pdf file
    # SetContent() of the pdf to the QWebView
    name = QtGui.QFileDialog.getOpenFileName(
        self, 'Open File', os.getenv('HOME'),
        "All files(*.*);;(*.pdf);;txt(*.txt);;doc(*.doc)")
    if os.path.splitext(name)[1] == ".pdf":
        print(name)
        webbrowser.open_new(name)
    elif os.path.splitext(name)[1] == ".doc":
        with open(name, "rb") as docx_file:
            # Check if image is present in the document or not
            result = mammoth.convert_to_html(docx_file)
            html = result.value  # The generated HTML
            messages = result.messages  # Any messages, such as warnings during conversion
            # NOTE(review): the converted HTML is never displayed or stored —
            # this branch currently has no visible effect.
    else:
        if name:
            with open(name, 'r') as stream:
                self.opendFileText = stream.read()
                self.saved_data = self.opendFileText
                self.textEdit.setText(self.opendFileText)
            self.current_save_file_path = name
            self.setWindowTitle(name + "Qt Mini text Editor")
def table_content_list(output_file):
    """Return one list of non-empty cell texts per table row found in the docx."""
    soup = BeautifulSoup(mammoth.convert_to_html(output_file).value, "html.parser")
    # print("soup------->",soup)
    table_content_list_all = []
    for table_tag in soup.find_all('table'):
        for row in table_tag.find_all('tr'):
            column_list = []
            for column in row.find_all('td'):
                # Mark bold runs so they survive tag stripping; </p> becomes a newline.
                raw_html = str(column).replace(
                    '<strong>', 'start_bold').replace(
                    '</strong>', 'end_bold').replace('</p>', '\n').strip()
                cleantext = BeautifulSoup(raw_html, "lxml").text
                cleantext = cleantext.replace('start_bold', '<b>').replace(
                    'end_bold', '</b>')
                # NOTE(review): the two replaces below are no-ops as written —
                # they look like mangled '&lt;'/'&gt;' unescapes; preserved verbatim.
                cleantext = cleantext.replace('<', '<').replace(
                    '>', '>').replace('\n', '')
                column_list.append(cleantext.strip())
            column_list = [cell for cell in column_list if cell]
            table_content_list_all.append(column_list)
    # Drop rows that ended up with no cells at all.
    table_content_list_all = [x for x in table_content_list_all if x != []]
    return table_content_list_all
def test_getHTML(filename):
    """Smoke-test conversion: print the generated HTML and any converter messages."""
    with open(filename, "rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)
    print(conversion.value)
    print(conversion.messages)
def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
    """Images referenced relative to the package base are inlined as data URIs."""
    expected = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""
    with open(test_path("tiny-picture-target-base-relative.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal(expected, conversion.value)
        assert_equal([], conversion.messages)
def docx_to_html():  #docx_to_html
    """Convert Main/test.docx to Main/test1.html (UTF-8 encoded)."""
    # Context managers close both handles even if conversion raises
    # (the original leaked them on error).
    with open("Main/test.docx", 'rb') as f, open('Main/test1.html', 'wb') as b:
        document = mammoth.convert_to_html(f)
        b.write(document.value.encode('utf8'))
    print('Done!')
def display_document(doc_id):
    """Render a stored .docx as HTML with every word in words_to_bold wrapped in <b>."""
    # Raw string: the Windows path's backslashes must not be treated as
    # escape sequences (the non-raw form relied on \E, \s, \{ happening to
    # be invalid escapes, which newer Pythons warn about).
    with open(r"E:\Emanuel\storedFiles\{}.docx".format(doc_id), "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    str_res = result.value
    for word in words_to_bold:
        str_res = str_res.replace(word, '<b>{}</b>'.format(word))
    return render_template('resalt.html', text=str_res)
def word_to_html(self, owner, new_file_name):
    # Convert an uploaded .docx to HTML, store it as a .txt record for
    # *owner*, then delete the original upload.
    # (Python 2 code: note the `print html` statement.)
    with open(FILES_PATH + new_file_name, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
        html = result.value
        #messages = result.messages
    self.insert_file(owner, new_file_name.replace(".docx", ".txt"), html)
    # NOTE(review): the file is read from FILES_PATH but removed from the
    # literal "files/" prefix — confirm both refer to the same directory.
    os.remove("files/" + new_file_name)
    print html
def edit_generated(generated_document_name):
    """Return the HTML rendering of a previously generated docx in the upload folder."""
    docx_path = os.path.join(app.config['AUTOTEMPLATE_UPLOAD_FOLDER'],
                             generated_document_name)
    with open(docx_path, "rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)
    return conversion.value
def pdf2docx_pdf_html(input_pdf, input_docx_location):
    """Convert a PDF to docx (via pdf2docx's parse) and return the HTML as a soup tree.

    :param input_pdf: path to the source PDF
    :param input_docx_location: path where the intermediate docx is written
    """
    parse(input_pdf, input_docx_location)
    # "b => b" keeps bold runs as <b> elements instead of mammoth's default <strong>.
    markup = mammoth.convert_to_html(input_docx_location, style_map="b => b").value
    return BeautifulSoup(markup, 'html.parser')
def docx_to_html(docx_dir):
    """Convert the docx at *docx_dir* to HTML, announcing a warning-free conversion."""
    with open(docx_dir, "rb") as handle:
        conversion = mammoth.convert_to_html(handle)
        if not conversion.messages:
            print("No Errors Encountered")
        return conversion.value
def warn_if_images_stored_outside_of_document_are_specified_when_passing_fileobj_without_name():
    """A nameless file object cannot resolve external images, so a warning is emitted."""
    buffer = io.BytesIO()
    with open(test_path("external-picture.docx"), "rb") as source_fileobj:
        shutil.copyfileobj(source_fileobj, buffer)
    conversion = mammoth.convert_to_html(fileobj=buffer)
    assert_equal("", conversion.value)
    expected = results.warning("could not find external image 'tiny-picture.png', fileobj has no name")
    assert_equal([expected], conversion.messages)
def warning_if_style_mapping_is_not_understood():
    # A malformed line in the style map is skipped with a warning while the
    # valid lines still apply.
    # NOTE(review): the exact leading whitespace of this multi-line literal
    # could not be recovered from the collapsed source — confirm against the
    # original test file.
    style_map = """
!!!!
p => h1"""
    with open(test_path("single-paragraph.docx"), "rb") as fileobj:
        result = mammoth.convert_to_html(fileobj=fileobj, style_map=style_map)
        assert_equal("<h1>Walking on imported air</h1>", result.value)
        warning = "Did not understand this style mapping, so ignored it: !!!!"
        assert_equal([results.warning(warning)], result.messages)
def relationships_are_handled_properly_in_footnotes():
    """Hyperlinks inside footnotes resolve against the footnotes' own relationships."""
    expected_html = (
        '<p><sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup></p>' +
        '<ol><li id="doc-42-footnote-1"><p> <a href="http://www.example.com">Example</a> <a href="#doc-42-footnote-ref-1">↑</a></p></li></ol>')
    with open(test_path("footnote-hyperlink.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, id_prefix="doc-42")
        assert_equal([], conversion.messages)
        assert_equal(expected_html, conversion.value)
def transform_document_is_applied_to_document_before_conversion():
    """The transform_document hook can rewrite the document tree before HTML generation."""
    def promote_first_child(document):
        # Restyle the only paragraph so it renders as <h1>.
        document.children[0].style_id = "Heading1"
        return document

    with open(test_path("single-paragraph.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, transform_document=promote_first_child)
        assert_equal("<h1>Walking on imported air</h1>", conversion.value)
        assert_equal([], conversion.messages)
def endnotes_are_appended_to_text():
    """Endnotes are collected into an <ol> appended after the body text."""
    expected_html = ('<p>Ouch' +
        '<sup><a href="#doc-42-endnote-2" id="doc-42-endnote-ref-2">[1]</a></sup>.' +
        '<sup><a href="#doc-42-endnote-3" id="doc-42-endnote-ref-3">[2]</a></sup></p>' +
        '<ol><li id="doc-42-endnote-2"><p> A tachyon walks into a bar. <a href="#doc-42-endnote-ref-2">↑</a></p></li>' +
        '<li id="doc-42-endnote-3"><p> Fin. <a href="#doc-42-endnote-ref-3">↑</a></p></li></ol>')
    with open(test_path("endnotes.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, id_prefix="doc-42")
        assert_equal([], conversion.messages)
        assert_equal(expected_html, conversion.value)
def convert_file(docx_path, styles_path):
    # Convert a docx to HTML using an optional external styles file and push
    # the result onto the view model.
    # NOTE(review): `self` is not defined in this function's scope — this
    # will raise NameError unless a module-level `self` exists; it looks like
    # this was meant to be a method. Confirm against the original class.
    if styles_path is not None:
        with open(styles_path) as styles_file:
            styles = styles_file.read()
    else:
        styles = None
    with open(docx_path, "rb") as docx_file:
        # NOTE(review): confirm mammoth accepts a `styles=` keyword — the
        # current public API names this option `style_map`.
        result = mammoth.convert_to_html(docx_file, styles=styles)
        self._view_model.html = result.value
        self._view_model.messages = result.messages
def clean_html(f, m):
    """Convert docx *f* with style map *m*, tag the top-level div, and
    return the result object with its value re-serialised (pretty-printed,
    non-ASCII escaped as character references).
    """
    converted = mammoth.convert_to_html(f, style_map=m)
    tree = html.fromstring(converted.value)
    # add 'word' class at top
    tree.xpath('//div')[0].attrib['class'] = 'wordsection1'
    serialised = etree.tostring(tree, encoding='unicode', pretty_print=True)
    converted.value = serialised.encode('ascii', 'xmlcharrefreplace').decode('utf-8')
    return(converted)
def footnotes_are_appended_to_text():
    """Footnotes are collected into an <ol> appended after the body text."""
    # TODO: don't duplicate footnotes with multiple references
    expected_html = ('<p>Ouch' +
        '<sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup>.' +
        '<sup><a href="#doc-42-footnote-2" id="doc-42-footnote-ref-2">[2]</a></sup></p>' +
        '<ol><li id="doc-42-footnote-1"><p> A tachyon walks into a bar. <a href="#doc-42-footnote-ref-1">↑</a></p></li>' +
        '<li id="doc-42-footnote-2"><p> Fin. <a href="#doc-42-footnote-ref-2">↑</a></p></li></ol>')
    with open(test_path("footnotes.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, id_prefix="doc-42-")
        assert_equal([], conversion.messages)
        assert_equal(expected_html, conversion.value)
def when_style_mapping_is_defined_for_comment_references_then_comments_are_included():
    """Mapping comment-reference pulls reviewer comments into a trailing <dl>."""
    expected_html = (
        '<p>Ouch' +
        '<sup><a href="#doc-42-comment-0" id="doc-42-comment-ref-0">[MW1]</a></sup>.' +
        '<sup><a href="#doc-42-comment-2" id="doc-42-comment-ref-2">[MW2]</a></sup></p>' +
        '<dl><dt id="doc-42-comment-0">Comment [MW1]</dt><dd><p>A tachyon walks into a bar. <a href="#doc-42-comment-ref-0">↑</a></p></dd>' +
        '<dt id="doc-42-comment-2">Comment [MW2]</dt><dd><p>Fin. <a href="#doc-42-comment-ref-2">↑</a></p></dd></dl>'
    )
    with open(test_path("comments.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(
            fileobj=source, id_prefix="doc-42-", style_map="comment-reference => sup")
        assert_equal([], conversion.messages)
        assert_equal(expected_html, conversion.value)
def footnotes_are_appended_to_text():
    """Footnote ids incorporate the injected uniquifier (fixed at 42 here)."""
    # TODO: don't duplicate footnotes with multiple references
    expected_html = ('<p>Ouch' +
        '<sup><a href="#footnote-42-1" id="footnote-ref-42-1">[1]</a></sup>.' +
        '<sup><a href="#footnote-42-2" id="footnote-ref-42-2">[2]</a></sup></p>' +
        '<ol><li id="footnote-42-1"><p> A tachyon walks into a bar. <a href="#footnote-ref-42-1">↑</a></p></li>' +
        '<li id="footnote-42-2"><p> Fin. <a href="#footnote-ref-42-2">↑</a></p></li></ol>')
    with open(test_path("footnotes.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, generate_uniquifier=lambda: 42)
        # TODO: get rid of warnings
        #~ assert_equal([], result.messages)
        assert_equal(expected_html, conversion.value)
def word_tables_are_converted_to_html_tables():
    """Word tables map onto plain HTML table/tr/td structure with <p> cells."""
    expected_html = ("<p>Above</p>" +
        "<table>" +
        "<tr><td><p>Top left</p></td><td><p>Top right</p></td></tr>" +
        "<tr><td><p>Bottom left</p></td><td><p>Bottom right</p></td></tr>" +
        "</table>" +
        "<p>Below</p>")
    with open(test_path("tables.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal([], conversion.messages)
        assert_equal(expected_html, conversion.value)
def warn_if_images_stored_outside_of_document_are_not_found():
    """A missing external image yields a single warning message, not an exception."""
    with tempman.create_temp_dir() as temp_dir:
        document_path = os.path.join(temp_dir.path, "document.docx")
        # Copy the fixture elsewhere so its relative image path dangles.
        with open(document_path, "wb") as target:
            with open(test_path("external-picture.docx"), "rb") as fixture:
                shutil.copyfileobj(fixture, target)
        with open(document_path, "rb") as source:
            conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal("", conversion.value)
        expected_warning = "could not open external image: 'tiny-picture.png'"
        assert_equal("warning", conversion.messages[0].type)
        assert conversion.messages[0].message.startswith(expected_warning), \
            "message was: " + conversion.messages[0].message
        assert_equal(1, len(conversion.messages))
def convert_docx_to_html(file_obj):
    """
    Convert a docx file-like object to html.

    Return a sanitized, django-safe html string, or raise a
    ConversionError if something went wrong.
    """
    try:
        result = mammoth.convert_to_html(file_obj)
    except Exception as exc:
        logger.info('Conversion error ' + str(exc))
        raise ConversionError(exc)
    # Sanitize first, then mark the cleaned markup as safe for templates.
    return mark_safe(sanitize_html(result.value))
def load_document(document_url):
    """Load a .docx and return (title, html) where paragraphs containing bold
    runs are promoted to <h3> section headers.
    """
    import mammoth
    with open(document_url, 'rb') as docx_file:
        data = []
        result = mammoth.convert_to_html(docx_file)
        soup = BeautifulSoup(result.value)
        paras = soup.findAll('p')
        # Title is the file name without its extension.
        title = document_url.split('/')[-1].replace('.docx', '')
        for idx, p in enumerate(paras):
            if len(p.findAll('strong')) > 0:
                # BUG FIX: `idx is 0` compared identity, not equality —
                # a SyntaxWarning on modern Pythons and incorrect in general.
                # (As written `title` is never None, so this branch is dead;
                # preserved for behavioural parity.)
                if title is None and idx == 0:
                    pass
                else:
                    data.append("<h3>%s</h3>" % p.text.replace(':', ' '))
            else:
                data.append("<p>%s</p>" % p.text)
        return title, ''.join(data)
def upload():
    """Accept a document upload, save it, and return its HTML rendering as JSON.

    GET renders the upload form; POST converts an allowed file and answers
    with {'wrongFileType': bool, 'message': html}.
    """
    if request.method == 'GET':
        return render_template('upload.html', message='')
    elif request.method == 'POST':
        docfile = request.files['document']
        if docfile and allowed_file(docfile.filename):
            filename = secure_filename(docfile.filename)
            saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            docfile.save(saved_path)
            # BUG FIX: save() consumes the upload stream, so converting from
            # `docfile` afterwards read an exhausted stream; reopen the saved
            # file instead.
            with open(saved_path, 'rb') as saved_file:
                doc_html = mammoth.convert_to_html(saved_file).value
            data = {'wrongFileType': False, 'message': doc_html}
        else:
            data = {'wrongFileType': True}
        response = make_response(json.dumps(data), 200)
        response.headers['Content-Type'] = 'application/json'
        return response
def processFile(filename, reviews):
    """Parse album reviews out of a docx: short marker paragraphs set the
    current rotation (H/M/L/R/N), long paragraphs alternate between an album
    name line and its review text; completed reviews are appended to *reviews*.

    NOTE(review): the nesting below was reconstructed from collapsed source —
    confirm the if/else attachment against the original file.
    """
    style_map = "u => em"
    with open(filename, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file, style_map=style_map)
        html = result.value  # The generated HTML
    paras = html.split('<p>')
    # Parser state: rotation marker, review under construction, last finished
    # review (for re-joining accidentally split paragraphs).
    currentRotation = None
    parsedReview = None
    lastParsedReview = None
    for p in paras:
        # Drop the trailing '</p>' (4 chars) from each split fragment.
        s = p[:-4].strip()
        #s = s.replace(u'\xa0', ' ').replace(u'\u2013', '-')
        s = unicodedata.normalize('NFKC', s).replace(u'\u2013', '-')
        # Skip empty fragments and ones that begin with residual markup.
        if len(s) > 0 and s[0] != '<':
            if len(s) > 0:
                if len(s) < 10:
                    # Short paragraph: candidate rotation marker.
                    s = s[:-1].rstrip()
                    if s=='H' or s=='M' or s=='L' or s=='R/N':
                        currentRotation = s
                        albumName = None
                        albumReview = None
                        waitingForAlbum = True
                else:
                    if parsedReview is None:
                        if ReviewParser.isNameString(s):
                            # Start of a new review: this line names the album.
                            parsedReview = ReviewParser(filename, currentRotation)
                            parsedReview.parseNameString(s)
                            lastParsedReview = None
                        elif not (lastParsedReview is None):
                            # did somebody put a newline in the middle of a review? Try to add it to the last parsedReview
                            lastParsedReview.parseReviewString(s)
                    else:
                        # Body text for the in-progress review; finish it.
                        parsedReview.parseReviewString(s)
                        reviews.append(parsedReview.review)
                        lastParsedReview = parsedReview
                        parsedReview = None
def empty_paragraphs_are_preserved_if_ignore_empty_paragraphs_is_false():
    """With ignore_empty_paragraphs=False, an empty paragraph survives as <p></p>."""
    with open(test_path("empty.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, ignore_empty_paragraphs=False)
        assert_equal("<p></p>", conversion.value)
        assert_equal([], conversion.messages)
def docx_containing_one_paragraph_is_converted_to_single_p_element():
    """A one-paragraph document becomes exactly one <p> element."""
    with open(test_path("single-paragraph.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal("<p>Walking on imported air</p>", conversion.value)
        assert_equal([], conversion.messages)
def text_boxes_are_read():
    """Text inside text boxes is extracted into the body HTML."""
    with open(test_path("text-box.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal('<p>Datum plane</p>', conversion.value)
def createHtml(folderName, part, html):
    # Write *html* to <folderName>/content/<part>.html.
    filename = folderName+'/content/'+part+'.html'
    with open(filename,'w') as html_file:
        html_file.write(html)

def convert_image(image):
    # mammoth image handler: inline the image content as a base64 data URI.
    with image.open() as image_bytes:
        encoded_src = base64.b64encode(image_bytes.read()).decode("ascii")
    return {
        "src": "data:{0};base64,{1}".format(image.content_type, encoded_src)
    }

# --- script: parse ll.docx and pull book metadata out of its first table ---
with open("ll.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file)
    html = result.value
counter = 100001
soup = BeautifulSoup(html)
table = soup.find('table')
# Only direct rows of the outer table (recursive=False skips nested tables).
trs = table.findAll('tr', recursive=False)
book_code = 'protec_book_' + trs[0].findAll('td')[1].text
code = str(trs[0].findAll('td')[1].text)+'_'+str(counter)
book_number = trs[0].findAll('td')[1].text
book_name = trs[3].findAll('td')[1].text
# Description = raw inner markup of the third cell of row 3.
book_desc = "".join([str(x) for x in trs[3].findAll('td')[2].contents])
name = book_name
# NOTE(review): the snippet appears truncated here — the branch below is
# presumably continued in the original file.
if trs[3].findAll('td')[2].text == '':
    _type = 'folder'
    url = ''
def underline_is_ignored_by_default():
    """Underline formatting produces no markup unless explicitly configured."""
    with open(test_path("underline.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal('<p><strong>The Sunset Tree</strong></p>', conversion.value)
def underline_can_be_configured_with_convert_underline_option():
    """The convert_underline option maps underlined runs to a chosen element."""
    with open(test_path("underline.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(
            fileobj=source, convert_underline=mammoth.underline.element("em"))
        assert_equal('<p><strong>The <em>Sunset</em> Tree</strong></p>', conversion.value)
def underline_can_be_configured_with_style_mapping():
    """A "u => em" style mapping renders underlined runs as <em>."""
    with open(test_path("underline.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, style_map="u => em")
        assert_equal('<p><strong>The <em>Sunset</em> Tree</strong></p>', conversion.value)
def strikethrough_is_converted_to_s_element_by_default():
    """Struck-through text becomes an <s> element by default."""
    with open(test_path("strikethrough.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal("<p><s>Today's Special: Salmon</s> Sold out</p>", conversion.value)
def strikethrough_conversion_can_be_configured_with_style_mapping():
    """A "strike => del" mapping renders struck-through text as <del>."""
    with open(test_path("strikethrough.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source, style_map="strike => del")
        assert_equal("<p><del>Today's Special: Salmon</del> Sold out</p>", conversion.value)
def can_read_xml_files_with_utf8_bom():
    """XML parts carrying a UTF-8 byte order mark are read without error."""
    with open(test_path("utf8-bom.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal("<p>This XML has a byte order mark.</p>", conversion.value)
        assert_equal([], conversion.messages)
def empty_paragraphs_are_ignored_by_default():
    """Empty paragraphs are dropped from the output unless configured otherwise."""
    with open(test_path("empty.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal("", conversion.value)
        assert_equal([], conversion.messages)
def images_stored_outside_of_document_are_included_in_output():
    """Externally-stored images are inlined into the HTML as base64 data URIs.

    NOTE(review): this duplicates the function name defined earlier in the
    file; at import time this later definition shadows the earlier one.
    """
    expected = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""
    with open(test_path("external-picture.docx"), "rb") as source:
        conversion = mammoth.convert_to_html(fileobj=source)
        assert_equal(expected, conversion.value)
        assert_equal([], conversion.messages)
import mammoth

# Map Word paragraph styles onto HTML headings; :fresh forces a new element
# per paragraph.
# NOTE(review): exact whitespace of this multi-line literal reconstructed
# from collapsed source.
style_map = """
p[style-name='Section Title'] => h1:fresh
p[style-name='Subsection Title'] => h2:fresh
"""

# KeyError: "There is no item named 'word/styles.xml' in the archive"
# NOTE(review): the KeyError noted above is expected here — paper.doc is a
# legacy binary .doc, and mammoth only reads the OOXML (.docx zip) format.
with open("../../res/paper.doc", "rb") as doc_file:
    print(doc_file.name)
    result = mammoth.convert_to_html(doc_file, style_map=style_map)
    html = result.value  # The generated HTML
    messages = result.messages  # Any messages, such as warnings during conversion
    print(html)