/
indexMaker.py
114 lines (97 loc) · 3.67 KB
/
indexMaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import string
import time
import argparse
from sys import argv
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
class IndexMaker(object):
    """Command-line driver: index the words of a PDF and save the result.

    Instantiating the class runs the whole job immediately (see ``run``).
    """

    def __init__(self, args):
        """Run the indexing job described by ``args`` (see ``run``)."""
        self.run(args)

    def run(self, args):
        """Read the word list, build the index and write it out.

        Args:
            args: Parsed command-line namespace. The attributes read here
                are ``w`` (path of the word list, one word per line),
                ``f`` (path of the PDF), ``o`` (path of the output file)
                and ``p`` (truthy to also echo every entry to the console).

        Raises:
            SystemExit: With status 1 if the word list cannot be read; the
                I/O error is printed first.
        """
        try:
            # The context manager guarantees the word list is closed again.
            with open(args.w) as word_file:
                stripped = (line.strip() for line in word_file)
                # Blank lines would otherwise become empty "words".
                words_list = [word for word in stripped if word]
        except IOError as e:
            print(e)
            # exit() is only a helper injected by the site module, and it
            # reported success; signal the failure with a non-zero status.
            raise SystemExit(1)

        # Start the timer. time.time() is wall-clock on every platform,
        # unlike time.clock(), which is also gone from newer Pythons.
        start = time.time()

        # Build the index.
        index = create_index(args.f, words_list)

        # Write the index to a file, alphabetically so that the output is
        # deterministic; the file is closed even if a write fails.
        with open(args.o, 'w') as out:
            for word in sorted(index):
                entry = "%s: %s \n" % (word, index[word])
                out.write(entry)
                if args.p:
                    print(entry)

        elapsed = time.time() - start
        print("Finished in %f seconds" % elapsed)
def get_pdf_text(path):
    """Read a PDF file and return its text, collected page by page.

    Adapted from http://stackoverflow.com/a/20905381

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The ``pages`` dict that was handed to ``TextConverter``. According
        to the original author's note it holds the text of the document
        with the key representing the page number.

    NOTE(review): ``pages=`` does not appear to be a keyword of the stock
    pdfminer ``TextConverter``; presumably this relies on the patched
    converter described in the linked answer -- confirm before upgrading
    pdfminer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # Change to 'utf-8' if the text comes out garbled.
    codec = 'ascii'
    laparams = LAParams()
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           showpageno=True, pages=pages)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        # Release the converter and its buffer on success and failure alike.
        device.close()
        retstr.close()
    return pages
def find_whole_word(needle, haystack, case_sensitive=False):
    """Report whether ``needle`` occurs in ``haystack`` as a whole word.

    An occurrence counts as a whole word when the characters directly
    before and after it are not alphanumeric (or it touches the start or
    end of the text), so whitespace and punctuation act as separators.

    Example: find_whole_word('test', 'This is a test; This is also a tester')
    matches 'test;' but not 'tester'.

    Based on http://stackoverflow.com/a/4155029

    Args:
        needle: The word (or phrase) to look for.
        haystack: The text to search.
        case_sensitive: Compare with exact case when true; by default both
            strings are lower-cased first.

    Returns:
        True if at least one whole-word occurrence exists, otherwise False.
        An empty ``needle`` never matches.
    """
    if not needle:
        return False
    if not case_sensitive:
        # Search and boundary-check the same lower-cased strings so the
        # offsets always refer to the text that was actually searched.
        needle = needle.lower()
        haystack = haystack.lower()

    text_len = len(haystack)
    start = haystack.find(needle)
    # Examine every occurrence: an early partial hit such as 'tester'
    # must not hide a later stand-alone 'test'.
    while start != -1:
        end = start + len(needle)
        clear_before = start == 0 or not haystack[start - 1].isalnum()
        clear_after = end >= text_len or not haystack[end].isalnum()
        if clear_before and clear_after:
            return True
        start = haystack.find(needle, start + 1)
    return False
def create_index(pdf_path, words_list):
    """Create a word index from a PDF file.

    Args:
        pdf_path: Path of the PDF; passed straight to ``get_pdf_text``.
        words_list: Sequence of words to look for. It is walked once per
            page, so it must be re-iterable.

    Returns:
        A dict mapping every word that was found to the list of page keys,
        in ascending order, whose text contains it as a case-insensitive
        whole word. Words that never occur are absent from the dict.
    """
    text_data = get_pdf_text(pdf_path)
    word_index = {}
    # Walk the pages in sorted order so each word's page list comes out
    # ascending regardless of the dict's own iteration order.
    for page in sorted(text_data):
        page_text = text_data[page]
        for word in words_list:
            if find_whole_word(word, page_text):
                word_index.setdefault(word, []).append(page)
    return word_index
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the indexer.
    parser = argparse.ArgumentParser(description='Creates an index of words from a PDF file.')
    parser.add_argument('--version', action='version', version='0.02')
    parser.add_argument('-o', default='index.txt', help='The file to output the index to')
    # nargs='?' with const=True lets a bare "-p" work as the on/off switch
    # the help text describes, while still accepting the old "-p VALUE" form.
    parser.add_argument('-p', nargs='?', const=True, default=False, help='Print output to console')
    parser.add_argument('-w', required=True, help='A text file of new line delimited words')
    parser.add_argument('-f', required=True, help='The pdf file to create the index from')
    args = parser.parse_args()
    # Constructing IndexMaker runs the whole job.
    IndexMaker(args)