-
Notifications
You must be signed in to change notification settings - Fork 0
/
textExtractor.py
83 lines (77 loc) · 3.03 KB
/
textExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pdfquery
import unicodedata
from pyPdf import PdfFileReader, PdfFileWriter
from wand.image import Image
from PIL import Image as tim
from pytesseract import image_to_string
import os
class txtExtractor:
    """Extract text from a PDF, either per page or from a rectangular box.

    Text is read through ``pdfquery``; the optional picture mode crops the
    page with ``pyPdf``, renders the crop with ``wand`` and runs it through
    ``pytesseract``.
    """

    # Type of unicode text: ``unicode`` on Python 2, ``str`` on Python 3.
    _TEXT_TYPE = type(u"")

    def __init__(self, fileName):
        """Open *fileName* with both PDF back-ends and record the page offset.

        The offset is the lower-left corner of the first page's trim box; it
        is added to every box coordinate passed in later.
        """
        self.pdfQuery = pdfquery.PDFQuery(fileName)
        # open() replaces the Python-2-only file() builtin. The handle is
        # handed to the reader and deliberately not closed here.
        self.pdf = PdfFileReader(open(fileName, "rb"))
        first_trim_box = self.pdf.pages[0].trimBox
        self.x_off = float(first_trim_box[0])
        self.y_off = float(first_trim_box[1])
        # Page currently loaded into self.pdfQuery (-1: none yet).
        self.lastPage = -1

    def _load_page(self, pageNum):
        """Load *pageNum* into pdfQuery unless it is already the loaded page."""
        if self.lastPage != pageNum:
            self.pdfQuery.load(pageNum)
            self.lastPage = pageNum

    def _clean_text(self, raw):
        """Fold *raw* to ASCII and strip two extraction artefacts.

        Unicode input is NFKD-normalised and characters that have no ASCII
        form are dropped; any other input is used unchanged. Afterwards every
        "- " is removed and every "fi " is collapsed to "fi".
        """
        if isinstance(raw, self._TEXT_TYPE):
            folded = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore')
            # encode() gives bytes; on Python 3 turn them back into a native
            # str so the str-based replacements below keep working.
            text = folded if isinstance(folded, str) else folded.decode('ascii')
        else:
            text = raw
        text = text.replace("- ", "")
        text = text.replace("fi ", "fi")
        return text

    def _box_corners(self, box):
        """Turn an (x, y, width, height) box into offset-corrected corners.

        Returns (left, bottom, right, top) with the trim-box offset applied.
        """
        left = box[0] + self.x_off
        bottom = box[1] + self.y_off
        return left, bottom, box[2] + left, box[3] + bottom

    def pdf_to_text(self):
        """Return a list holding the cleaned text of every page, in order."""
        num_of_pages = self.pdfQuery.doc.catalog['Pages'].resolve()['Count']
        return [self.get_text_from_page(i) for i in range(num_of_pages)]

    def get_text_from_page(self, pageNum):
        """Return the cleaned text of all horizontal text lines on *pageNum*."""
        # Going through _load_page keeps self.lastPage in sync; otherwise a
        # later box query for the previously cached page would skip its
        # reload and read this page instead.
        self._load_page(pageNum)
        resDict = self.pdfQuery.extract([
            ('with_formatter', 'text'),
            ('text', 'LTTextLineHorizontal')])
        return self._clean_text(resDict['text'])

    def get_text_from_box(self, pageNum, coordinatesList, picture=False):
        """Return the text found inside a box on *pageNum*.

        With ``picture=False`` *coordinatesList* is one (x, y, width, height)
        box and the cleaned text is returned. With ``picture=True`` it is a
        list of such boxes and a list with one OCR result per box is
        returned.
        """
        if not picture:
            return self.__text_pdf__(pageNum, coordinatesList)
        # Previously this branch fell through and returned None.
        return self.__picture_pdf__(pageNum, coordinatesList)

    def __text_pdf__(self, pageNum, coordinates):
        """Return the cleaned text of the lines overlapping *coordinates*.

        *coordinates* is (x, y, width, height); the corners are rounded
        before being placed in the ``overlaps_bbox`` selector.
        """
        self._load_page(pageNum)
        x, y, w, h = self._box_corners(coordinates)
        resDict = self.pdfQuery.extract([
            ('with_formatter', 'text'),
            ('text', 'LTTextLineHorizontal:overlaps_bbox("%s,%s,%s,%s")' % (round(x), round(y), round(w), round(h)))])
        return self._clean_text(resDict['text'])

    def __picture_pdf__(self, pageNum, coordinatesList):
        """OCR every box in *coordinatesList* and return the list of results.

        Each box is (x, y, width, height). The page's media box is set to the
        box, the page is written to ``tmp.pdf``, rendered to ``tmp.jpg`` at
        resolution 300 and passed to ``image_to_string``. Both temporary
        files live in the current working directory and are removed before
        returning, also when a step fails.
        """
        # NOTE(review): pages are indexed with pageNum - 1 here, while the
        # text path passes pageNum to pdfQuery.load unchanged - confirm which
        # numbering callers use.
        page = self.pdf.pages[pageNum - 1]
        res = []
        try:
            for cord in coordinatesList:
                left, bottom, right, top = self._box_corners(cord)
                # This edits the reader's own page object; each box
                # overwrites the media box set for the previous one.
                page.mediaBox.lowerLeft = (left, bottom)
                page.mediaBox.upperRight = (right, top)
                out = PdfFileWriter()
                out.addPage(page)
                with open("tmp.pdf", "wb") as outFile:
                    out.write(outFile)
                with Image(filename='tmp.pdf', resolution=300) as im:
                    im.save(filename="tmp.jpg")
                img = tim.open("tmp.jpg")
                res.append(image_to_string(img))
        finally:
            # The image is written as tmp.jpg, so that is the file to delete
            # (the old code tried to remove "tmpImage.jpg" and raised).
            for leftover in ("tmp.jpg", "tmp.pdf"):
                if os.path.exists(leftover):
                    os.remove(leftover)
        return res