示例#1
0
def convert(output_type, docx_path, output_path):
    """Convert a docx file to HTML or Markdown and write it as UTF-8.

    ``output_type`` must be ``'--html'`` or ``'--markdown'``. Returns 0 on
    success; for any other value a message is printed and 2 is returned
    without touching ``output_path``.
    """
    if output_type not in ('--html', '--markdown'):
        print('Only valid output formats are --html and --markdown')
        return 2

    if output_type == '--html':
        rendered = PyDocX.to_html(docx_path)
    else:
        rendered = PyDocX.to_markdown(docx_path)

    with open(output_path, 'wb') as sink:
        sink.write(rendered.encode('utf-8'))
    return 0
示例#2
0
def wordtohtml(request):
    """Django view: convert an uploaded Word file to HTML and redirect to it.

    For a POST that carries a ``file`` upload, the upload is stored through
    the ``wordhtml`` model, converted with PyDocX, written to
    ``<BASE_DIR>/upload/word/<uuid>.html`` and the client is redirected to
    the URL of that page. Every other request renders ``wordtohtml.html``.
    """
    media_root = os.path.join(settings.BASE_DIR, 'upload/')
    upload = request.FILES.get('file') if request.method == "POST" else None
    if upload is None:
        # Nothing to convert (GET, or POST without a file): show the form.
        me = Me.objects.all()
        return render(request, "wordtohtml.html", {"msg": me[0]})

    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    word = wordhtml(word=upload, time=stamp, uuid=shortuuid.uuid())
    word.save()

    file_path = media_root + str(word.word)
    html = PyDocX.to_html(file_path)
    html_name = str(word.uuid) + ".html"
    txt_name = media_root + "word/" + html_name
    # Context manager guarantees the handle is closed even if the write fails.
    with open(txt_name, 'w', encoding="utf-8") as out:
        out.write(html)

    if settings.DEBUG:
        url = "http://127.0.0.1:8000/upload/word/" + html_name
    else:
        url = "https://www.manjiexiang.cn/upload/word/" + html_name
    return HttpResponseRedirect(url)
示例#3
0
def get_img(file):
    """Placeholder image extractor: returns a list with one dummy marker.

    The document is still converted with PyDocX and parsed with
    BeautifulSoup, although the parsed tree is not used.
    """
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')  # parsed but currently unused
    # NOTE: the real implementation is disabled. It decoded each <img> data
    # URI, wrote it under /tmp, converted WMF images to PNG with `convert`,
    # uploaded the result through put() and stored the returned URL in `src`.
    return ["aaaaaaaaaaaaaa"]
示例#4
0
def read_word(file):
    """Read a Word document and return its paragraphs as HTML.

    MathML extraction and image extraction run on two threads; both are
    handed the same dict and the key names under which results are read back.
    """
    # Raw XML of every paragraph in the docx document.
    proxy = []
    doc = docx.Document(file)
    proxy.extend(para._element.xml for para in doc.paragraphs)

    # Shared result container, read back by key after both workers finish.
    results = {}
    mathml_worker = threading.Thread(
        target=getMathml, args=(proxy, 'mmls_in_para', results))
    html = PyDocX.to_html(file)
    image_worker = threading.Thread(
        target=get_img,
        args=(html, 'images_in_para', 'images_in_table', results))

    workers = (mathml_worker, image_worker)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    mmls_in_para = results.get('mmls_in_para')
    images_in_para = results.get('images_in_para')
    images_in_table = results.get('images_in_table')

    # Table data.
    table_html, table_para = get_table(doc, images_in_table)

    # Paragraph information of the document, returned as HTML tags.
    return get_para_html(proxy, mmls_in_para, images_in_para, table_html, table_para)
示例#5
0
def docxtract(docxfile, **kwargs):
    '''
    Get a docx file and extract chapters.

    Returns a list of ``[label, html]`` pairs: one pair per h1-h4 heading
    (label is the heading text wrapped in ``<b>``) and one pair per run of
    non-empty elements between headings (label is the first 80 characters of
    the run's first element followed by ``...``).
    '''
    heading_tags = (u'h1', u'h2', u'h3', u'h4')
    soup = cleanhtml(PyDocX.to_html(docxfile), **kwargs)

    tree = []
    body = False  # accumulated HTML of the current run, False when empty
    for node in soup.find_all(True):
        if node.name in heading_tags:
            # A heading closes the run collected so far.
            if body:
                tree.append([summary, body])
                body = False
            tree.append(["<b>%s</b>" % node.get_text(), unicode(node)])
            continue
        if not node.get_text().strip():
            continue
        if body:
            body = ''.join([body, unicode(node)])
        else:
            body = unicode(node)
            summary = "%s..." % node.get_text()[:80]
    if body:
        tree.append([summary, body])
    return tree
示例#6
0
def docx2html(file_path):
    '''
    Convert a docx file to HTML.

    :param file_path: value handed straight to ``PyDocX.to_html``
    :return: the result of ``PyDocX.to_html``
    '''
    markup = PyDocX.to_html(file_path)
    return markup
def store_document():
    """Convert ``cmpe273-greensheet.docx`` to HTML and insert it into ``db.docCollection``.

    Returns the literal status string ``"sucess"``.
    """
    # NOTE(review): sys.setdefaultencoding is a Python 2-only hook that is
    # normally unavailable after interpreter start-up - confirm this call works
    # in the target environment.
    sys.setdefaultencoding('utf-8')
    # Close the source document as soon as the conversion is done.
    with open('cmpe273-greensheet.docx', 'rb') as docx_file:
        html = PyDocX.to_html(docx_file)
    db.docCollection.insert({"HTML": html, "FileName": 'cmpe273-greensheet'})
    return "sucess"
示例#8
0
def _main_(tournament):
    """Collect the ``ANSWER:`` lines of every packet of a tournament into a CSV.

    Changes into the tournament's packet directory, converts every entry found
    there with PyDocX, extracts each answer and its underlined portion, writes
    them to ``<tournament> answers.csv`` and finally moves that file to the
    stats directory.

    NOTE(review): the ``decode``/``encode`` chains assume Python 2 byte strings.
    """
    # Backslashes are escaped explicitly; the path values are unchanged.
    os.chdir('C:\\School\\Quiz Bowl\\Quizbowl Packets\\' + tournament)
    files = os.listdir(os.curdir)
    csv_name = tournament + ' answers.csv'
    # The context manager closes the CSV even if a packet fails to convert.
    with open(csv_name, 'w') as q:
        writer = csv.DictWriter(q, fieldnames=['Full Answer', 'Underlined Answer', 'Tournament', 'Packet'],
                                lineterminator='\n')
        writer.writeheader()
        for f in files:
            html = PyDocX.to_html(f).decode('Windows-1252').encode('utf-8')
            html = clean_file(html)
            answers = re.findall(r'ANSWER: (?P<id>.*?)(?:</li>|<br />|</p>)', html)
            for raw_answer in answers:
                underlined = re.findall(r'<span class=\"pydocx-underline\">(?P<id>.*?)</span>', raw_answer)
                und = ' '.join(u.strip() for u in underlined).strip()
                # Drop any remaining markup from the full answer text.
                answer = re.sub(r'<[^>]*>', '', raw_answer).strip()
                writer.writerow({'Full Answer': answer.encode('utf-8'), 'Underlined Answer': und.encode('utf-8'),
                                 'Tournament': tournament, 'Packet': f})
    shutil.copy(csv_name, 'C:\\School\\Quiz Bowl\\Stats\\Answers')
    os.remove(csv_name)
示例#9
0
def docx_to_html(docx_path):
    """Convert a docx file to an HTML fragment wrapped in a ``<div>``.

    The first ``<style>...</style>`` element is removed, including its closing
    tag, and every underscore in the remaining markup is replaced by
    ``&ensp;``. When no complete style element is found the markup is left
    as is.
    """
    docx_html = PyDocX.to_html(docx_path)
    closing_tag = "</style>"
    style_start = docx_html.find("<style>")
    if style_start != -1:
        style_end = docx_html.find(closing_tag, style_start)
        if style_end != -1:
            # Resume after the closing tag so no stray </style> is left behind.
            docx_html = docx_html[:style_start] + docx_html[style_end + len(closing_tag):]
    return '<div>' + docx_html.replace("_", "&ensp;") + '</div>'
示例#10
0
def main():
    """Round-trip every ``.docx`` in ``config.filepath`` through HTML.

    For each document: convert it to HTML with PyDocX, save the embedded
    images through ``savephoto``, write the prettified HTML into a
    per-document folder under ``temppath``, then rebuild a docx from the HTML
    body using the default template and save it into the same folder.
    """
    # Process entries while walking the directory listing.
    for entry in os.listdir(config.filepath):
        if not entry.endswith(".docx"):
            continue
        print("start to solve ",entry)
        filepath = config.filepath + "/" + entry
        name = os.path.basename(filepath)
        temp_photo_path = temppath + os.path.sep + name.replace(".docx", "") + os.path.sep
        if not os.path.exists(temp_photo_path):
            os.makedirs(temp_photo_path)

        # Convert to HTML with pydocx.
        html = PyDocX.to_html(filepath)
        bsoup = bs4.BeautifulSoup(html, "lxml")
        for img in bsoup.find_all("img"):
            # Save each image found in the HTML and point the tag at the
            # value returned by savephoto.
            img["src"], path = savephoto(img["src"], temp_photo_path)

        html_target = temp_photo_path + name.replace(".docx", ".html")
        with open(html_target, "w", encoding="utf-8") as html_out:
            html_out.write(bsoup.prettify())

        # Convert the HTML back to docx, one top-level body child at a time.
        dstpath = temp_photo_path + name
        newdoc = docx.Document(os.getcwd() + os.path.sep + "docx/templates/default.docx")
        for child in bsoup.find("body").contents:
            handle_tag(child, newdoc, temp_photo_path)
        newdoc.save(dstpath)
示例#11
0
def insert_document():
    """Handle a document upload: save it, convert it to HTML and store the content.

    On POST the uploaded file is saved under its sanitized name, converted
    with PyDocX and passed to ``parse_store_content``. The same status string
    is returned for every request method.
    """
    if request.method == 'POST':
        uploaded_file = request.files['file']
        # Read back exactly the path that was written: the sanitized name,
        # not the raw client-supplied filename.
        saved_name = secure_filename(uploaded_file.filename)
        uploaded_file.save(saved_name)
        with open(saved_name, 'rb') as docx_file:
            html = PyDocX.to_html(docx_file)
        parse_store_content(html)
    return "File successfully parsed and uploaded"
示例#12
0
def trans_to_html():
    """Convert every ``*.docx`` in the current directory to a sibling ``.html`` file."""
    for one_file in glob.glob('*.docx'):
        name = one_file.replace('.docx', '')
        html = PyDocX.to_html(one_file)
        # The context manager closes the output even if the write fails.
        with open(name + '.html', 'w', encoding='utf-8') as f:
            f.write(html)
示例#13
0
def merge_docx(docx_list=None, out_htmlpath=None):
    """
    Merge several DOC/DOCX files into the ``<body>`` of one HTML document.

    docx_list is a list of strings which contains the (absolute) path of
    DOC/DOCX files to be merged; they are appended in list order. A path that
    PyDocX cannot convert is retried as ``<path>.html``; if that fails too the
    path is skipped.

    Returns the serialized body produced by ``lxml.etree.tostring``, or None
    when docx_list is None or nothing could be converted. If out_htmlpath is
    given, the result is written there as well.
    """
    if docx_list is None:
        return None

    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')

    def _to_tree(markup):
        # Parse the markup and run it through the cleaner.
        return cleaner.clean_html(lxml.html.fromstring(markup, parser=parser))

    html_list = []
    for path in docx_list:
        try:
            tree = _to_tree(PyDocX.to_html(path))
        except Exception:
            # e.g. MalformedDocxException: pretend it is an HTML file.
            try:
                with open('{}.html'.format(path), 'rb') as tmp:
                    tree = _to_tree(tmp.read().decode('utf-8'))
            except Exception:
                # Cannot convert; skip this entry.
                continue
        html_list.append(tree)

    if not html_list:
        # No body content. Most likely just an image/appendix.
        return None

    # Append the body children of every later document to the first body.
    main_body = html_list[0].xpath('./body')[0]
    for tree in html_list[1:]:
        for elem in tree.xpath('./body/*'):
            main_body.append(elem)

    # Only the body is serialized, so 'style' info from the first document's
    # header is lost.
    html_str = lxml.etree.tostring(main_body)

    if out_htmlpath is not None:
        # tostring() may already return bytes; only encode text.
        payload = html_str if isinstance(html_str, bytes) else html_str.encode('utf-8')
        with open(out_htmlpath, 'wb') as tmp:
            tmp.write(payload)

    return html_str
        
示例#14
0
def file_change(input, output='out.html'):
    """Convert a docx to HTML, retag centered paragraphs and write the result.

    Every ``pydocx-center`` class gets ``subtitle`` appended; the first two of
    those are then switched to ``title``.
    """
    markup = PyDocX.to_html(input)
    # First mark every centered element as a subtitle ...
    markup = markup.replace('pydocx-center', 'pydocx-center subtitle')
    # ... then promote the first two occurrences to titles.
    markup = markup.replace('pydocx-center subtitle', 'pydocx-center title', 2)
    with open(output, 'w', encoding='utf-8') as out:
        out.write(markup)
示例#15
0
def docx2html(docx_filepath, html_filename=None):
    """Convert a docx file to pretty-printed markup on disk.

    The PyDocX output is parsed with ``etree.fromstring`` and serialized with
    ``pretty_print=True``. When ``html_filename`` is None the output goes next
    to the source file, with the extension replaced by ``.html``.
    """
    with open(docx_filepath, 'rb') as source:
        markup = PyDocX.to_html(source)
        pretty = etree.tostring(etree.fromstring(markup), pretty_print=True)
        target = html_filename
        if target is None:
            target = os.path.splitext(docx_filepath)[0] + '.html'
        with open(target, 'w+b') as sink:
            sink.write(pretty)
示例#16
0
def docx2pdf(filepath, dirpath, filename):
    """Convert a docx file to HTML and write it to ``filename``.

    Despite the name, the output is HTML, not PDF. ``<dirpath>/Webpages`` is
    created through ``create_dir``.

    NOTE(review): the HTML is written to ``filename`` as given, not inside the
    created ``Webpages`` directory - confirm this is intended.
    """
    out_path = dirpath + '/Webpages'
    create_dir(out_path)
    html = PyDocX.to_html(filepath)
    # The context manager closes the output even if the write fails.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(html)
示例#17
0
def insert_document():
    """Handle a document upload: save it, convert it and store the HTML.

    On POST the uploaded file is saved under its sanitized name, converted
    with PyDocX, inserted into the collection named after the original
    filename, and that filename is passed to ``parse_store_content``. The same
    status string is returned for every request method.
    """
    if request.method == 'POST':
        uploaded_file = request.files['file']
        # Read back exactly the path that was written: the sanitized name,
        # not the raw client-supplied filename.
        saved_name = secure_filename(uploaded_file.filename)
        uploaded_file.save(saved_name)
        with open(saved_name, 'rb') as docx_file:
            html = PyDocX.to_html(docx_file)
        file_name = str(uploaded_file.filename)
        db[file_name].insert({"filename": file_name, "HTML": html})
        parse_store_content(file_name)
    return "File successfully uploaded"
def insert_document():
    """Handle a document upload and store its HTML in ``db.greensheetdocs``.

    On POST the uploaded file is saved under its sanitized name, converted
    with PyDocX and inserted with the fixed filename ``CMPE273``. Any other
    method is rejected instead of failing on an unbound variable.
    """
    if request.method != 'POST':
        # NOTE(review): assumes the web framework accepts a (body, status)
        # tuple as a response - confirm.
        return "Only POST uploads are accepted", 405

    f = request.files['file']
    # Read back exactly the path that was written: the sanitized name.
    saved_name = secure_filename(f.filename)
    f.save(saved_name)
    with open(saved_name, 'rb') as docx_file:
        html = PyDocX.to_html(docx_file)
    db.greensheetdocs.insert({
        "filename": "CMPE273",
        "HTML": html
    })
    return "Successfully Uploaded"
示例#19
0
def convert_docx_html(srcfile):
    """
    Convert a docx file to an HTML file next to it.

    The output name is the source path with its extension replaced by
    ``.html``; the name is GBK-encoded and the content UTF-8-encoded.

    NOTE(review): mixing encoded bytes with ``str`` this way assumes
    Python 2 - confirm before running on Python 3.

    :param srcfile: path of the docx file
    :return: None
    """
    import os

    html = PyDocX.to_html(srcfile)
    # splitext only strips a real extension of the last path component, so a
    # dot-less name or a dot in a directory no longer truncates the path.
    name = os.path.splitext(srcfile)[0]
    with open(name.encode("gbk") + ".html", 'w') as f:
        f.write(html.encode("utf-8"))
示例#20
0
def docx2html(_path_docx):
    """Convert a docx file to HTML in the current directory.

    The output name is the last backslash-separated component of
    ``_path_docx``, cut at its first dot, plus ``.html``.

    :return: the name of the HTML file that was written
    """
    html = PyDocX.to_html(_path_docx)
    path_html = _path_docx.split("\\")[-1].split(".")[0] + ".html"
    # The context manager closes the output even if the write fails.
    with open(path_html, 'w', encoding="utf-8") as f:
        f.write(html)
    return path_html
示例#21
0
def get_img(file):
    """Return the document's images as two lists of serialized ``<img>`` tags.

    :param file: file object handed to ``PyDocX.to_html``
    :return: ``(images_in_para, images_in_table)``; each entry is ``str(img)``
        after its ``src`` was replaced by a fixed placeholder
    """
    html = PyDocX.to_html(file)
    print('html{}'.format(html))
    soup = BeautifulSoup(html, 'lxml')
    images_in_para, images_in_table = [], []

    for img in soup.find_all('img'):
        # Images nested anywhere inside a <table> are reported separately.
        if img.find_parents('table'):
            img['src'] = "$$$$$$$$$$$$$$$$"
            images_in_table.append(str(img))
            continue
        img['src'] = "aaaaaaaaaaaa"
        images_in_para.append(str(img))
    # NOTE: the real implementation is disabled. It decoded each data URI,
    # wrote it under /tmp, converted WMF images to PNG with `convert`, uploaded
    # the file through put() and stored the returned URL in `src`.
    return images_in_para, images_in_table
示例#22
0
def to_html():
    """Convert every ``*.docx`` in the current directory to ``<employee>.html``.

    The employee name is the part of the file name before ``员工履历``. A file
    name that does not contain that marker raises IndexError.
    """
    # Compile once instead of on every iteration.
    name_pattern = re.compile(r'(.*?)员工履历')
    for one_file in glob.glob('*.docx'):
        # Extract the name of the employee from the file name.
        name = name_pattern.findall(one_file)[0]
        html = PyDocX.to_html(one_file)
        # The context manager closes the output even if the write fails.
        with open(name + '.html', 'w', encoding='utf-8') as f:
            f.write(html)
示例#23
0
def docx_ol(request, file_id):
    """Django view: render a stored docx file as an HTML page.

    Looks up the ``FileInfo`` record, converts its file with PyDocX, writes
    the HTML next to the source, copies it into the ``fileserver/ol`` template
    directory and renders it as a template.
    """
    file_info = FileInfo.objects.get(id=file_id)
    init_path = file_info.file_path
    # Everything from the first dot on is replaced by ".html".
    html_path = file_info.file_path.split('.')[0] + '.html'
    html_name = file_info.file_name.split('.')[0] + '.html'
    html = PyDocX.to_html(init_path)
    # The context manager closes the output before it is copied, even on error.
    with open(html_path, 'w', encoding="utf-8") as f:
        f.write(html)
    shutil.copy(html_path,
                'D:\\python\\jzyy\\fileserver\\templates\\fileserver\\ol')
    return render(request, 'fileserver/ol/%s' % html_name)
示例#24
0
def upload(request):
    """Django view: convert an uploaded Word or Excel file to HTML.

    The upload's name is expected to be base64-encoded; the decoded name
    decides the conversion (``.docx``/``.doc`` through PyDocX, ``.xls``/
    ``.xlsx`` through pandas). The response body is the HTML, or empty when
    nothing was converted (non-POST request, missing upload, other extension).
    """
    # Defined up front so every path reaches the return with a bound name.
    html = None
    if request.method == 'POST':
        file = request.FILES.get("file", None)  # the uploaded file object
        if file is not None:
            print(file.name)
            bs = base64.b64decode(file.name)
            filename = str(bs, 'ISO-8859-1')
            print(filename)
            extension = fileExtension(filename)
            print(extension)
            if extension == '.docx' or extension == '.doc':
                html = PyDocX.to_html(file)
            if extension == '.xls' or extension == '.xlsx':
                xd = pd.ExcelFile(file)
                df = xd.parse()
                html = df.to_html(header=True, index=False)
    return HttpResponse(html)
示例#25
0
def docx_to_html(filepath, overwrite=False):
    """
    Return the HTML for a docx file, cached on disk as ``<filepath>.html``.

    :param filepath: full path to the file to convert
    :param overwrite: convert again even when the cached HTML exists
    :return: unicode string
    """
    cache_path = '{}.html'.format(filepath)
    if os.path.exists(cache_path) and not overwrite:
        # Cache hit: reuse the stored conversion.
        with open(cache_path, 'rb') as cached:
            return cached.read().decode('utf-8')

    markup = PyDocX.to_html(filepath)
    with open(cache_path, 'wb') as cached:
        cached.write(markup.encode('utf-8'))
    return markup
示例#26
0
def index(request):
    """Django view: open a paper for checking.

    On GET the paper named by the ``filename`` query parameter is recorded as
    the one the session user is checking. A docx paper is converted with
    PyDocX; anything else renders ``All/index.html`` with the paper's file
    URL. Non-GET requests render ``All/index.html`` without context.
    """
    if request.method != "GET":
        return render(request, 'All/index.html')

    papername = request.GET.get('filename', None)
    paper = PaperGrade.objects.get(PaperName=papername)
    username = request.session.get('username', None)
    user = UserInfo.objects.get(Name=username)
    user.PaperChecking = papername
    user.save()

    # Compare by value: `is` on a string literal tests object identity.
    if paper.file_type() == 'docx':
        html = PyDocX.to_html(paper)
        # NOTE(review): the original author was unsure about this call; the
        # generated HTML is passed where render() takes a template name.
        return render(request, html)
    # NOTE(review): the original author was unsure about this URL form.
    return render(
        request, 'All/index.html',
        {'filename': r'/File/' + paper.PaperName})
示例#27
0
    def _create_pdf(self, file):
        """Build a base64 preview payload for a stored file.

        The file is downloaded from the URL given by ``self._get_aws_url``.
        Spreadsheets and PDFs are embedded unchanged; ``.docx`` and ``.rtf``
        are converted to HTML and rendered to PDF with pisa first. Any other
        extension raises ``ValidationError``.
        """
        _, extension = os.path.splitext(file.file.name)
        aws_response = requests.get(self._get_aws_url(file.file, 5))
        content = ContentFile(aws_response.content)

        def _preview_response(mime, raw):
            # Wrap raw bytes into the data-URI style preview payload.
            encoded = base64.b64encode(raw).decode("utf-8")
            preview_file = f'preview;name;data:{mime};base64,{encoded}'
            return Response(
                {
                    'preview': preview_file,
                    'extension': extension,
                    'title': file.name
                },
                status=status.HTTP_200_OK)

        if extension in ['.xlsx', '.xls', '.xlb']:
            return _preview_response(
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                content.read())
        if extension == '.pdf':
            return _preview_response('application/pdf', content.read())

        if extension == '.docx':
            html = PyDocX.to_html(content)
        elif extension == '.rtf':
            html = rtf.getHtml(content.read().decode('UTF-8'))
        else:
            raise ValidationError(
                "File is required, expected format: pdf, doc, docx, xls, xlsx, rtf"
            )

        # Render the HTML into an in-memory response and embed its bytes.
        response = HttpResponse()
        response['Content-Disposition'] = 'attachment; filename="report.pdf"'
        pisa.CreatePDF(html, dest=response)
        return _preview_response('application/pdf', response.content)
示例#28
0
def get_img(file):
    """Upload every embedded image of a document and return the rewritten tags.

    Each ``<img>`` data URI is decoded to a temporary file under ``/tmp``
    (WMF images are first converted to PNG with the ``convert`` command),
    uploaded through ``put`` and its ``src`` replaced by the returned URL.

    :return: ``(images_in_para, images_in_table)`` lists of ``<img>`` tags,
        split by whether the image sits inside a ``<table>``
    """
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')
    images_in_para = []
    images_in_table = []
    # Captures the image subtype of a data URI; compiled once for all images.
    subtype_re = re.compile('data.*?/(.*?);', re.S)

    for img in soup.find_all('img'):
        style_img = subtype_re.findall(img['src'])[0]
        # Strip the data-URI header for any image type, not just wmf/jpeg.
        strg = img['src'].split('base64,', 1)[-1]
        byte = base64.urlsafe_b64decode(strg)
        t0 = int(round(time.time() * 1000))
        tmp_path = '/tmp/%d.%s' % (t0, str(style_img))
        created = [tmp_path]
        try:
            with open(tmp_path, 'wb') as tmp_file:
                tmp_file.write(byte)
            upload_path = tmp_path
            if style_img == 'wmf':
                t1 = int(round(time.time() * 1000))
                png_path = '/tmp/%d.png' % t1
                created.append(png_path)
                os.system('convert %s %s' % (tmp_path, png_path))
                upload_path = png_path
            with open(upload_path, 'rb') as f:
                url = put(f)
        finally:
            # Remove temporary files even when conversion or upload fails.
            for path in created:
                if os.path.exists(path):
                    os.remove(path)
        img['src'] = url
        if img.find_parents('table'):
            images_in_table.append(img)
        else:
            images_in_para.append(img)
    return images_in_para, images_in_table
示例#29
0
def get_img(file):
    """Upload every embedded image of a document and return the rewritten tags.

    Each ``<img>`` data URI is decoded to a uniquely named temporary file
    under ``/tmp`` (WMF images are first converted to PNG with the ``convert``
    command), uploaded through ``put`` and its ``src`` replaced by the
    returned URL.

    :return: ``(images_in_para, images_in_table)`` lists of ``<img>`` tags,
        split by whether the image sits inside a ``<table>``
    """
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')
    images_in_para = []
    images_in_table = []
    # Both patterns are loop-invariant; raw string avoids the invalid "\w" escape.
    subtype_re = re.compile('data.*?/(.*?);', re.S)
    prefix_re = re.compile(r"data:[/\w]*;base64,", re.S | re.I)

    for img in soup.find_all('img'):
        style_img = subtype_re.findall(img['src'])[0]
        strg = prefix_re.sub('', img['src'])
        byte = base64.urlsafe_b64decode(strg)
        tmp_path = '/tmp/{}.{}'.format(uuid.uuid4(), str(style_img))
        created = [tmp_path]
        try:
            with open(tmp_path, 'wb') as tmp_file:
                tmp_file.write(byte)
            upload_path = tmp_path
            if style_img == 'wmf':
                png_path = '/tmp/{}.png'.format(uuid.uuid4())
                created.append(png_path)
                os.system('convert %s %s' % (tmp_path, png_path))
                upload_path = png_path
            with open(upload_path, 'rb') as f:
                url = put(f)
        finally:
            # Remove temporary files even when conversion or upload fails.
            for path in created:
                if os.path.exists(path):
                    os.remove(path)
        img['src'] = url
        if img.find_parents('table'):
            images_in_table.append(img)
        else:
            images_in_para.append(img)
    return images_in_para, images_in_table
示例#30
0
def word2html(htmlfile, wordfile):
    """Convert ``wordfile`` to HTML, write it to ``htmlfile`` and print ``data end``."""
    html = PyDocX.to_html(wordfile)
    # The context manager closes the output even if the write fails.
    with open(htmlfile, 'w', encoding="utf-8") as f:
        f.write(html)
    print("data end")
示例#31
0
def gui_parse():
    """Parse a ``.txt`` or ``.docx`` input file and open the result in an editor.

    The input comes from the command line or, when omitted, from a Tk file
    dialog. Plain text is read as UTF-8 and handed to ``chgk_parse``. A docx
    is converted to HTML with PyDocX, simplified with BeautifulSoup, turned
    into text with html2text and then handed to ``chgk_parse``. The composed
    result is written to ``make_filename(args.filename)`` in the script's
    directory and opened with ``TEXTEDITOR``. Any other extension prints an
    error to stderr and exits.
    """

    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2

    global debug

    # Create the Tk root only to host the file dialog; keep it hidden.
    root = Tk()
    root.withdraw()

    # Wrap stderr in a UTF-8 writer.
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', nargs='?')
    parser.add_argument('--debug', '-d', action='store_true')
    args = parser.parse_args()

    if args.debug:
        debug = True

    # No file on the command line: ask for one interactively.
    if args.filename is None:
        args.filename = tkFileDialog.askopenfilename(
            filetypes=[
            ('Word 2007+','*.docx'),
            ('Plain text','*.txt'),
            ])

    # Work inside the input file's directory; extracted images and debug
    # dumps below are written with relative names.
    os.chdir(os.path.dirname(os.path.abspath(args.filename)))

    if os.path.splitext(args.filename)[1] == '.txt':

        with codecs.open(args.filename, 'r', 'utf8') as input_file:
                input_text = input_file.read()

        # Drop carriage returns before parsing.
        input_text = input_text.replace('\r','')

        final_structure = chgk_parse(input_text)


    elif os.path.splitext(args.filename)[1] == '.docx':
        # These dependencies are imported only on the docx path.
        from pydocx import PyDocX
        from bs4 import BeautifulSoup
        from parse import parse
        import base64
        import html2text
        input_docx = PyDocX.to_html(args.filename)
        bsoup = BeautifulSoup(input_docx)

        if args.debug:
            with codecs.open('debug.pydocx', 'w', 'utf8') as dbg:
                dbg.write(input_docx)
        
        def generate_imgname(ext):
            # Return the first 'NNN.ext' name that is not an existing file.
            imgcounter = 1
            while os.path.isfile('{:03}.{}'
                .format(imgcounter, ext)):
                imgcounter += 1
            return '{:03}.{}'.format(imgcounter, ext)

        # Simplify the markup before it is converted to text.
        for tag in bsoup.find_all('style'):
            tag.extract()
        for tag in bsoup.find_all('p'):
            if tag.string:
                tag.string = tag.string + SEP
        for tag in bsoup.find_all('b'):
            tag.unwrap()
        for tag in bsoup.find_all('strong'):
            tag.unwrap()
        # Italic/emphasis text is wrapped in underscores.
        # NOTE(review): unlike the <p> and <li> loops there is no guard on
        # tag.string here - confirm it cannot be None for these tags.
        for tag in bsoup.find_all('i'):
            tag.string = '_' + tag.string + '_'
            tag.unwrap()
        for tag in bsoup.find_all('em'):
            tag.string = '_' + tag.string + '_'
            tag.unwrap()
        for tag in bsoup.find_all('li'):
            if tag.string:
                tag.string = '- ' + tag.string
        # Replace each inline image by an '(img NAME)' marker; outside debug
        # mode the decoded image is also written to NAME.
        for tag in bsoup.find_all('img'):
            imgparse = parse('data:image/{ext};base64,{b64}', tag['src'])
            imgname = generate_imgname(imgparse['ext'])
            tag.insert_before('(img {})'.format(imgname))
            if not args.debug:
                with open(imgname, 'wb') as f:
                    f.write(base64.b64decode(imgparse['b64']))
            tag.extract()
        # Links: removed when rew() of their text is empty, otherwise
        # replaced by their href.
        for tag in bsoup.find_all('a'):
            if rew(tag.string) == '':
                tag.extract()
            else:
                tag.string = tag['href']
                tag.unwrap()

        h = html2text.HTML2Text()
        h.body_width = 0
        # Post-process the html2text output: remove escaped dashes, unescape
        # dots and tighten spacing around brackets and colons.
        txt = (h.handle(bsoup.prettify())
            .replace('\\-','')
            .replace('\\.','.')
            .replace('( ', '(')
            .replace('[ ', '[')
            .replace(' )', ')')
            .replace(' ]', ']')
            .replace(' :', ':')
            )

        if args.debug:
            with codecs.open('debug.debug', 'w', 'utf8') as dbg:
                dbg.write(txt)

        final_structure = chgk_parse(txt)

    else:
        sys.stderr.write('Error: unsupported file format.' + SEP)
        sys.exit()

    # Switch to the script's own directory before writing the result.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    with codecs.open(
        make_filename(args.filename), 'w', 'utf8') as output_file:
        output_file.write(
            compose_4s(final_structure))

    print('Please review the resulting file {}:'.format(
        make_filename(args.filename)))
    # Open the result in the configured text editor.
    subprocess.call(shlex.split('{} "{}"'
        .format(
            TEXTEDITOR,
            make_filename(args.filename)).encode('cp1251',errors='replace')))
示例#32
0
from pydocx import PyDocX

# Convert 111.docx to HTML and save it as test.html.
html = PyDocX.to_html("111.docx")
# The context manager closes the output even if the write fails.
with open("test.html", 'w', encoding="utf-8") as f:
    f.write(html)
示例#33
0
 def convert(self):
   """Convert ``self.path`` to HTML; keep the raw markup and its parsed ``<body>``."""
   self._raw = PyDocX.to_html(self.path)
   # Only the <body> element of the parsed document is kept as data.
   self.data = BeautifulSoup(self._raw, 'html.parser').body
示例#34
0
def papertest():
    """Convert the bundled bitcoin paper to HTML, print it and return it."""
    source = './paperPDF/' + 'bitcoin.docx'
    markup = PyDocX.to_html(source)
    print(markup)
    return markup
示例#35
0
文件: 1.py 项目: yf1291/nlp4
import os
import sys

from pydocx import PyDocX

# Convert 2.docx to HTML and print the markup.
html = PyDocX.to_html("2.docx")

print(html)
示例#36
0
from pydocx import PyDocX

# Old package: docx2html, which requires an older Python version.
# The new package lives at:
# https://github.com/CenterForOpenScience/pydocx

docx_file = r'D:\十二刻度-个人信息及隐私政策.docx'
html_fle = r'D:\result.html'

# Convert the document and store the markup as UTF-8.
html = PyDocX.to_html(docx_file)
with open(html_fle, mode='w', encoding='UTF-8') as f:
    f.write(html)