forked from ptsefton/omeka-python-utils
/
convert_word_docs_to_html.py
95 lines (75 loc) · 3.45 KB
/
convert_word_docs_to_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Omeka API script to find Word/office documents and add HTML renditions of them to items.
Requires word2html to be in a directory parallel to this one or on the Python path.
"""
from omekaclient import OmekaClient
from omekautils import get_omeka_config
from omekautils import create_stream_logger
from sys import stdout
import argparse
import json
import os
import tempfile
#Hacky stuff as this is a one off
import sys
sys.path
#Change this to point to where you downloaded word2html from the WordDown project
sys.path.append('../jischtml5/tools/commandline')
import word2html
# --- Script setup: logging, configuration, CLI options, API client, state ---

# Progress and errors are reported through a stream logger bound to stdout.
logger = create_stream_logger('converting', stdout)
# Values from get_omeka_config() act as fallbacks for the CLI options below.
config = get_omeka_config()

parser = argparse.ArgumentParser()
parser.add_argument('-k', '--key', default=None, help='Omeka API Key')
parser.add_argument('-u', '--api_url', default=None, help='Omeka API Endpoint URL (hint, ends in /api)')
parser.add_argument('-d', '--delete_html', action='store_true', help='Delete html docs')
parser.add_argument('-n', '--do_not_convert', action='store_true', help='Do not convert')
args = vars(parser.parse_args())

# Resolve each connection setting independently: a value given on the command
# line wins, otherwise the configured value is used. (The key used to be
# selected based on whether --api_url was given, so passing only one of the
# two options picked the wrong key.)
endpoint = args['api_url'] if args['api_url'] is not None else config['api_url']
apikey = args['key'] if args['key'] is not None else config['key']
omeka_client = OmekaClient(endpoint.encode("utf-8"), logger, apikey)

# The response body of GET "items" is parsed as JSON and iterated below.
# NOTE(review): this is a single request - confirm whether the API paginates.
resp, cont = omeka_client.get("items")
items = json.loads(cont)

# Scratch directory for downloaded documents and their HTML renditions.
temp_dir = tempfile.mkdtemp()
os.chmod(temp_dir, 0o2770)  # rwx for owner and group, plus the setgid bit

# Counters reported in the end-of-run summary.
num_docs_found = 0
num_html_uploaded = 0
num_html_deleted = 0
# Extensions (lower-case, with the dot) of the documents handed to word2html.
OFFICE_EXTENSIONS = (".docx", ".doc", ".odt", ".rtf")

# Walk every item: optionally delete its existing .html files, then optionally
# convert each office document attached to it and upload the HTML rendition.
for item in items:
    logger.info('Looking at %s', item['id'])

    # First pass - delete HTML if required
    if args['delete_html']:
        for f in omeka_client.get_files_for_item(item['id']):
            ext = os.path.splitext(f['original_filename'])[1]
            if ext.lower() == ".html":
                logger.info("Deleting html file: %s", f['id'])
                num_html_deleted += 1
                omeka_client.delete('files', f['id'])

    # Second pass do the conversion if required
    if not args['do_not_convert']:
        # The file list is fetched again because the first pass may have
        # deleted some of the item's files.
        for f in omeka_client.get_files_for_item(item['id']):
            # Keep only the final path component so a filename containing
            # directory parts cannot be written outside temp_dir.
            fname = os.path.basename(f['original_filename'])
            name, ext = os.path.splitext(fname)
            if ext.lower() not in OFFICE_EXTENSIONS:
                continue

            num_docs_found += 1
            _, data = omeka_client.get_file(f['file_urls']['original'])
            download_file = os.path.join(temp_dir, fname)
            # The context manager closes the file even if the write fails.
            with open(download_file, 'wb') as out:
                out.write(data)

            logger.info("Converting office doc file %s to HTML", f['id'])
            html_file = os.path.join(temp_dir, name + ".html")
            # The three boolean flags are passed exactly as before; their
            # meaning is defined by word2html.convert.
            word2html.convert(download_file, html_file, True, True, False)

            # A truthy return value is counted as a successful upload.
            if omeka_client.post_file_from_filename(html_file, item['id']):
                num_html_uploaded += 1
                logger.info("Uploaded %s successfully", f['id'])
            else:
                # Previously silent; the summary only showed a count mismatch.
                logger.error("Failed to upload HTML rendition of file %s", f['id'])
# End-of-run report: print the three counters, then compare the number of
# documents found with the number of HTML files uploaded.
logger.info("********************")
logger.info("SUMMARY:")

summary_rows = (
    ("Deleted %s HTML", num_html_deleted),
    ("Docs found: %s", num_docs_found),
    ("HTML files converted and added: %s", num_html_uploaded),
)
for template, count in summary_rows:
    logger.info(template, count)

# Any difference between the two counts is reported as an error.
if num_docs_found != num_html_uploaded:
    logger.error("Number of docs does not match number of HTML files uploaded")
else:
    logger.info("No errors detected")