forked from akkana/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word2html.py
executable file
·145 lines (120 loc) · 4.09 KB
/
word2html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
# Convert Word files, either doc or docx, to HTML
# using python-mammoth for .docx to HTML
# and unoconv (if available) or wvHtml (otherwise) for .doc.
from __future__ import print_function
import sys
import os.path
import subprocess
import mammoth
from bs4 import BeautifulSoup
import prettysoup
def docx2html(infile):
    '''Convert the .docx file at path infile to an HTML string.

    The file is converted with mammoth.convert_to_html. Every message
    mammoth reports is printed to stderr as "Mammoth <type>: <message>",
    so that the messages cannot end up mixed into HTML that a caller
    prints to stdout.

    Returns whatever prettyprint_html returns for the converted HTML;
    infile is passed along so it can serve as the title of last resort.
    '''
    # The input handle is only needed for the conversion itself.
    with open(infile, 'rb') as fp:
        mammout = mammoth.convert_to_html(fp)

    for m in mammout.messages:
        print("Mammoth %s: %s" % (m.type, m.message), file=sys.stderr)

    return prettyprint_html(mammout.value, infile)
def docx2htmlfile(infile, outfile):
    '''Convert the .docx file infile to HTML and store it in outfile.

    If outfile is the string "--", the HTML is printed to standard
    output instead of being written to a file.
    '''
    html = docx2html(infile)

    if outfile == "--":
        print(html)
        return

    # The page header produced by prettyprint_html declares
    # charset=utf-8, so the bytes on disk have to be UTF-8 regardless
    # of the locale's default encoding.
    with open(outfile, "w", encoding="utf-8") as fp:
        print(html, file=fp)
def prettyprint_html(inhtml, filename=''):
    '''Prettyprint some HTML, which is assumed not to
    have <html>, <head>, <body>.

    inhtml is parsed with BeautifulSoup using the lxml parser.
    The page title is taken from, in order of preference:
    the parsed document's <head><title>, its first <h1>,
    or filename with its extension removed.

    Returns a string made of a generated doctype/<html>/<head> section
    (declaring UTF-8 and a mobile viewport, plus the title if there
    is one), followed by the output of prettysoup.prettyprint for the
    parsed <body>, followed by the closing </html>.
    '''
    # Imported here so the function does not depend on a new
    # module-level import.
    from html import escape

    soup = BeautifulSoup(inhtml, "lxml")

    # Try to get a title:
    if soup.head and soup.head.title:
        title = soup.head.title.text
    elif soup.h1:
        title = soup.h1.text
    else:
        # Diagnostic only: keep it off stdout, which may be carrying
        # the generated HTML.
        print("filename is", filename, file=sys.stderr)
        title = os.path.splitext(filename)[0]

    if title:
        # .text yields decoded text, so characters like & and < must be
        # escaped again before being pasted into markup.
        title = '<title>' + escape(title, quote=False) + '</title>'

    # The head section is written out as a literal string:
    # BS adds <body> but not <head>, and earlier attempts to create the
    # head and insert a meta charset tag through BeautifulSoup failed.
    header = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1">
%s
</head>
''' % title

    footer = '</html>'

    return header + prettysoup.prettyprint(soup.body) + footer
def html2html(infile, outfile):
    '''Prettyprint the HTML file infile and write the result to outfile.

    The input is read as text using the default encoding, run through
    prettyprint_html (with infile as the fallback title source), and
    the returned string is written to outfile.
    '''
    with open(infile) as infp:
        pphtml = prettyprint_html(infp.read(), infile)

    # The header generated by prettyprint_html declares charset=utf-8,
    # so write UTF-8 explicitly rather than the locale's default.
    with open(outfile, "w", encoding="utf-8") as outfp:
        outfp.write(pphtml)
def doc2html(infile, outfile):
    '''Convert the .doc file infile to HTML in outfile.

    First tries unoconv to produce an intermediate .docx next to
    infile (same name, .docx extension), converts that with
    docx2htmlfile, and removes the intermediate file again.
    If unoconv cannot be run or returns nonzero, falls back to
    running wvHtml directly on infile.

    Exits the program with status 1 if neither tool succeeds.
    '''
    base = os.path.splitext(infile)[0]
    # NOTE(review): a pre-existing file with this name would be
    # overwritten by unoconv and then removed below.
    docxfile = base + ".docx"

    try:
        # Unoconv always fails the first time with default args:
        # it takes too long for its server to start up,
        # so it needs a longer timeout. See:
        # https://github.com/dagwieers/unoconv/issues/415
        rv = subprocess.call(["unoconv", "-f", "docx", "-T", "10",
                              "-o", docxfile, infile])
    except OSError:
        # unoconv could not be executed at all; treat it as a failure.
        rv = 1

    if not rv:
        try:
            docx2htmlfile(docxfile, outfile)
        finally:
            # Clean up the intermediate file even if the HTML
            # conversion raised, so failures don't leave it behind.
            if os.path.exists(docxfile):
                print("removing", docxfile)
                os.unlink(docxfile)
        return

    # unoconv failed. Try wvHtml.
    print("Warning: %s: unoconv returned %d. Trying wvHtml" % (infile, rv))
    try:
        rv = subprocess.call(["wvHtml", infile, outfile])
    except OSError:
        rv = 1

    if not rv:
        return

    print("Couldn't convert %s with either unoconv or wvHtml. Aborting."
          % infile)
    sys.exit(1)
if __name__ == "__main__":
    # Exactly one or two arguments are accepted;
    # -h / --help (or a wrong argument count) prints usage and exits.
    if len(sys.argv) not in (2, 3) or sys.argv[1] in ("-h", "--help"):
        print("Usage: %s infile.docx [outfile.html]"
              % os.path.basename(sys.argv[0]))
        sys.exit(1)

    infile = sys.argv[1]
    base, ext = os.path.splitext(infile)

    # Normalise the extension once, so that choosing the output name and
    # choosing the converter agree. Otherwise an input such as FOO.HTML
    # would get the output name FOO.html, which is the input file itself
    # on case-insensitive filesystems.
    ext = ext.lower()

    if len(sys.argv) == 3:
        outfile = sys.argv[2]
    elif ext == ".html":
        outfile = base + "-pp.html"
    else:
        outfile = base + ".html"

    if ext == ".doc":
        doc2html(infile, outfile)
    elif ext == ".html":
        # If it's already HTML, just prettyprint it.
        # It's a handy way of doing so since otherwise there's no
        # command-line prettyprinter.
        html2html(infile, outfile)
    else:
        docx2htmlfile(infile, outfile)

    # "--" asks docx2htmlfile to print the HTML on stdout; keep the
    # status line out of that stream in that case.
    status_stream = sys.stderr if outfile == "--" else sys.stdout
    print("Wrote to %s" % outfile, file=status_stream)