/
ocr_pdf.py
48 lines (33 loc) · 1 KB
/
ocr_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
import sys
def pdf2text(pdf_filename, lang=None, resolution=300):
    """Run OCR over every page of a PDF and return the recognised text.

    The PDF is rasterised with Wand at ``resolution``, converted to JPEG,
    and each page is passed to the first OCR tool reported by pyocr.

    Args:
        pdf_filename: Path of the PDF file to read.
        lang: Language identifier handed to the OCR tool. When ``None``
            the second language reported by the tool is used (the
            historical behaviour of this function), falling back to the
            first one when only a single language is available.
        resolution: Resolution passed to Wand when loading the PDF;
            defaults to the previously hard-coded 300.

    Returns:
        A list with one text string per page, in page order.

    Raises:
        RuntimeError: If pyocr reports no OCR tool, or ``lang`` is not
            given and the tool reports no languages.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        raise RuntimeError("pyocr found no OCR tool (e.g. tesseract)")
    tool = tools[0]

    if lang is None:
        languages = tool.get_available_languages()
        if not languages:
            raise RuntimeError("the OCR tool reports no available languages")
        # Keep the original choice (index 1) when it exists instead of
        # failing with IndexError on single-language installations.
        lang = languages[1] if len(languages) > 1 else languages[0]

    final_text = []
    # Wand images wrap native ImageMagick resources; the ``with`` blocks
    # release them deterministically, including when OCR raises.
    with Image(filename=pdf_filename, resolution=resolution) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    blob = img_page.make_blob('jpeg')
                # OCR one page at a time so only a single JPEG blob is
                # held in memory instead of one per page.
                txt = tool.image_to_string(
                    PI.open(io.BytesIO(blob)),
                    lang=lang,
                    builder=pyocr.builders.TextBuilder()
                )
                final_text.append(txt)
    return final_text
def main(argv):
    """Command-line entry point: OCR a PDF and write its text to a file.

    Args:
        argv: Argument list laid out like ``sys.argv``: program name,
            input PDF path and, optionally, the output file path.

    Returns:
        The process exit status: 0 on success, 2 when the input PDF
        path is missing.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "ocr_pdf.py"
        sys.stderr.write("usage: %s input.pdf [output.txt]\n" % prog)
        return 2

    pages = pdf2text(argv[1])

    # As before, the output name is taken from the command line only when
    # exactly two arguments are given; otherwise the default name is used.
    if len(argv) == 3:
        ofilename = argv[2]
    else:
        ofilename = "ocrpdf_output.txt"

    # Pages are joined with a newline so the end of one page does not run
    # into the start of the next, and written as UTF-8 so non-ASCII
    # characters recognised by the OCR tool are kept instead of dropped.
    with open(ofilename, 'wb') as ofile:
        ofile.write("\n".join(pages).encode('utf-8'))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))