/
wanding.py
53 lines (39 loc) · 1.31 KB
/
wanding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from wand.image import Image as IM
import sys
import os
import tempfile
import subprocess
'''
Convert a PDF to a single PNG image, save that image to a temporary file,
and run the file through the Tesseract OCR engine as a subprocess.
'''
def _tesseract_to_text(image_path):
    """Run the ``tesseract`` command on *image_path* and return its text output.

    Tesseract is given an output base name and writes its result to
    ``<base>.txt``.  Both the placeholder file that reserves the base name and
    the ``.txt`` result are removed before returning, whether or not the run
    succeeded.

    Raises:
        OSError / IOError: if ``tesseract`` cannot be started or the expected
            ``.txt`` result file cannot be opened.
    """
    # Only a unique path is needed, so release the handle immediately.
    placeholder = tempfile.NamedTemporaryFile(delete=False)
    out_base = placeholder.name
    placeholder.close()
    try:
        process = subprocess.Popen(
            ['tesseract', image_path, out_base],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # Drain the pipe and wait for tesseract to finish.
        process.communicate()
        with open(out_base + '.txt', 'r') as handle:
            return handle.read()
    finally:
        # Best-effort cleanup: either file may be missing if tesseract failed.
        for leftover in (out_base, out_base + '.txt'):
            try:
                os.remove(leftover)
            except OSError:
                pass


def pdf_to_text(filename):
    """OCR a PDF file and print the recognised text.

    Every page of the PDF is rendered at resolution 300 and stacked
    vertically into one tall image, which is saved as PNG to a temporary file
    and passed to the ``tesseract`` command.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The recognised text, which is also printed to stdout.  If the
        Tesseract step fails, ``ERROR`` is printed to stdout, the reason is
        written to stderr, and ``None`` is returned.

    Errors raised while reading or rendering the PDF are not caught here.
    """
    with IM(filename=filename, resolution=300) as pdf:
        pages = len(pdf.sequence)
        # One canvas tall enough to hold all pages, one below the other.
        with IM(width=pdf.width, height=pdf.height * pages) as image:
            for i in range(pages):
                image.composite(
                    pdf.sequence[i],
                    top=pdf.height * i,
                    left=0
                )
            with image.convert('png') as img:
                with tempfile.NamedTemporaryFile(prefix="tess_") as temp_file:
                    img.save(filename=temp_file.name)
                    try:
                        contents = _tesseract_to_text(temp_file.name)
                    except Exception as exc:
                        # Keep the best-effort contract (print ERROR, do not
                        # raise) but let Ctrl-C / SystemExit through and say
                        # what actually went wrong.
                        print("ERROR")
                        sys.stderr.write("pdf_to_text: %s\n" % (exc,))
                        return None
    print(contents)
    return contents
def main(argv=None):
    """Command-line entry point: OCR the PDF named by the first argument.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv`` when omitted.

    Returns:
        0 after the PDF has been handed to ``pdf_to_text``, or 2 when no
        filename argument was supplied (a usage line is written to stderr).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # Avoid a bare IndexError traceback when the script is run without args.
        program = argv[0] if argv else "wanding.py"
        sys.stderr.write("usage: %s FILE.pdf\n" % program)
        return 2
    pdf_to_text(argv[1])
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status; a None return
    # exits with status 0.
    sys.exit(main())