/
extraction_module.py
322 lines (268 loc) · 12 KB
/
extraction_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# -*- coding: cp1252 -*-
## Don't forget to clip off leading and trailing white space.
## Imports----------
from marbles import glass as jc
import ast
import re
import os
# Direction codes compared against library_entry['direction'] in get_raw_chars:
# RIGHT collects the characters after the matched flag, LEFT the ones before it.
RIGHT, LEFT = 1, -1
## My workaround for encoding issues
#unicode_chars={"—":"-","‘":"'","’":"'","é":"e",'“':'"',"‘":"'"}
## The functions below look useless, but I want it to be easy to flesh them out
## Returns the location of text file containing all utility information (regexes etc.)
def get_utility_library_directory():
    """Return the location of the utility library (regexes etc.).

    The location is hard-coded to one developer machine; the function exists
    so that the lookup can be fleshed out later without touching callers.

    Returns:
        The Windows path as a string.
    """
    # Raw string: the path is full of backslashes followed by letters. As a
    # normal literal that only works by accident on Python 2 and is a syntax
    # error on Python 3 (backslash-U starts a unicode escape there). The
    # resulting value is unchanged.
    utility_library_directory = r"C:\Users\James McGlynn\My Programs\Python Programs\pdf2txt\WorkRelated\Utility_Libraries"
    return utility_library_directory
## Asks the user what library they want to use. It only asks once per time the program runs ATM
def get_current_utility(list_of_utils):
    """Ask the user which entry of *list_of_utils* to use and return it.

    The prompting itself is delegated to jc.getIntegerInput, which is defined
    outside this file. It is handed the lowest and highest accepted number,
    the prompt text, a default of 1 and the list itself.
    NOTE(review): presumably it returns a 1-based integer within that range;
    confirm against the helper.

    Args:
        list_of_utils: sequence of selectable utilities.

    Returns:
        The element of *list_of_utils* at position (choice - 1).
    """
    print("CHOOSE FROM THE LIST BELOW: ")
    lowest_choice = 1
    highest_choice = len(list_of_utils)
    selection = jc.getIntegerInput(
        lowest_choice,
        highest_choice,
        "MAKE YOUR SELECTION: ",
        1,
        list_of_utils,
    )
    # The helper's answer is shifted down by one to index the list.
    return list_of_utils[selection - 1]
## Takes path and returns every other line from the file at that path
## This should be part of a data base.
def get_document_list(text_file):
    """Return every other line (the 2nd, 4th, 6th, ...) of a text file.

    Args:
        text_file: path of the file to read.

    Returns:
        A list with the even-numbered lines of the file, in order, each with
        its trailing newline characters removed. A file with an odd number
        of lines simply contributes no entry for its last, unpaired line
        (this used to raise IndexError). An empty file gives an empty list.
    """
    with open(text_file, 'r') as fh:
        lines = fh.readlines()
    # Slicing from index 1 in steps of 2 picks the second line of each pair
    # and cannot run past the end of the list.
    return [line.rstrip('\n') for line in lines[1::2]]
## Retrieves the info on a desired utility. Returns a dictionary.
def get_utility_library(utility, utility_dictionary_location):
    """Retrieve the entry for one utility from the utility dictionary file.

    The file must contain a single Python literal that evaluates to a
    dictionary keyed by utility. It is parsed with ast.literal_eval, so only
    literals are accepted and no code in the file is executed.

    Args:
        utility: key of the wanted utility.
        utility_dictionary_location: path of the dictionary file.

    Returns:
        The value stored under *utility*.

    Raises:
        KeyError: if *utility* is not a key of the dictionary.
        ValueError, SyntaxError: if the file content is not a valid literal.
    """
    # The with-block closes the file even when parsing or the lookup fails.
    with open(utility_dictionary_location, 'r') as fh:
        whole_doc = fh.read()
    utility_dictionary = ast.literal_eval(whole_doc)
    return utility_dictionary[utility]
## Returns all available utilities (top level dictionary keys)
def get_utility_list(utility_dictionary_location):
    """Return all available utilities (the top-level dictionary keys).

    The file must contain a single Python literal that evaluates to a
    dictionary. It is parsed with ast.literal_eval, so only literals are
    accepted and no code in the file is executed.

    Args:
        utility_dictionary_location: path of the dictionary file.

    Returns:
        A list of the dictionary's keys, in the dictionary's iteration order.

    Raises:
        ValueError, SyntaxError: if the file content is not a valid literal.
    """
    # The with-block closes the file even when parsing fails.
    with open(utility_dictionary_location, 'r') as fh:
        whole_doc = fh.read()
    utility_dictionary = ast.literal_eval(whole_doc)
    return list(utility_dictionary)
## This uses get_index_and_match to find a match for a given
## Regex and then collects some characters in the vicinity.
def get_raw_chars(page_text, library_entry, unicode_chars):
    """Find a flag on a page and collect some characters next to it.

    The flag is located with jc.get_index_and_match (defined outside this
    file), whose result is read through its 'index' and 'match' keys.
    NOTE(review): presumably 'index' is a non-numeric sentinel such as
    "NO MATCHES" when nothing is found, which is what makes the arithmetic
    below fail and the fallback kick in; confirm against the helper.

    Args:
        page_text: text of one page.
        library_entry: dictionary providing 'data_flag' (the regex),
            'data_flag_inst' (which instance of the flag to use),
            'direction' (RIGHT or LEFT) and 'raw_char_collect' (how many
            characters to collect).
        unicode_chars: dictionary mapping problem characters to the plainer
            replacement that should be written instead.

    Returns:
        A dictionary with 'raw_text' (the collected, stripped and cleaned
        characters, or whatever the helper put in 'index' when the range
        could not be computed) and 'match' (the helper's 'match' value).

    Raises:
        ValueError: if the direction is neither RIGHT nor LEFT. Previously
            this surfaced as an unbound-variable error further down.
    """
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    if direction not in (RIGHT, LEFT):
        raise ValueError(
            "direction must be RIGHT (1) or LEFT (-1), got %r" % (direction,))

    index_and_match = jc.get_index_and_match(page_text, data_flag, flag_inst)
    try:
        if direction == RIGHT:
            # Start right after the flag and never read past the page end.
            start_index = (index_and_match['index']
                           + len(index_and_match['match']))
            end_index = min(start_index + num_chars, len(page_text))
        else:
            # Stop one character short of the flag, as before. Both ends are
            # clamped at 0 so that a flag at the very start of the page does
            # not turn into a negative index that wraps around to the end.
            end_index = max(index_and_match['index'] - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except Exception:
        # Best effort: pass the helper's 'index' value through as the text.
        raw_text = index_and_match['index']

    # This was added later to try and prevent encoding errors from popping up.
    # It swaps problem characters for more regular ones; the only affected
    # part of the program is printing the raw chars to the xlsx.
    cleaned = []
    for char in raw_text:
        if char in unicode_chars:
            print("%s equals %s" % (char, char))
            cleaned.append(unicode_chars[char])
            # Trace of the text built so far. The original line referenced an
            # undefined name here and crashed on the first replacement.
            print("".join(cleaned))
        else:
            cleaned.append(char)
    raw_text = "".join(cleaned)
    return {'raw_text': raw_text, 'match': index_and_match['match']}
def get_ref_text(raw_text_string, library_entry):
    """Refine one piece of raw text into the wanted data.

    Two collection methods are supported. With 'bounds' the data is whatever
    lies between a left-bound regex and a right-bound regex, where the
    literal string "null" means "start of the text" or "end of the text"
    respectively. With 'data' the data is the match of the data regex itself.
    Matching is delegated to jc.get_index_and_match (defined outside this
    file), whose result is read through its 'index' and 'match' keys.

    Args:
        raw_text_string: the raw text collected for one library entry.
        library_entry: dictionary providing 'collection_method',
            'left_bound_regex', 'right_bound_regex', 'data_regex',
            'character_list' and 'character_trans'.

    Returns:
        "No Raw Text" if the raw text is one of the sentinels "NO MATCHES" or
        "INSTANCE NOT FOUND"; "Bad Raw Text" if the collection method is
        unknown or the text cannot be sliced with the indexes found;
        otherwise the selected text, passed through jc.character_selection
        and jc.character_transform unless the respective library value is
        the string "none".
    """
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            found = jc.get_index_and_match(raw_text_string, left_bound_regex, 1)
            try:
                # The data starts right after the left bound.
                start_index = found['index'] + len(found['match'])
            except Exception:
                # Left bound not usable: keep the helper's 'index' value; the
                # slice below then decides whether it is usable.
                start_index = found['index']
        if right_bound_regex == "null":
            end_index = len(raw_text_string)
        else:
            found = jc.get_index_and_match(raw_text_string, right_bound_regex, 1)
            # No fallback is needed here: an unusable value makes the slice
            # below fail, which is reported as "Bad Raw Text".
            end_index = found['index']
    elif collection_method == 'data':
        found = jc.get_index_and_match(raw_text_string, data_regex, 1)
        start_index = found['index']
        try:
            end_index = len(found['match']) + start_index
        except Exception:
            end_index = len(raw_text_string)
    else:
        # Unknown method. This used to reach the same result only because a
        # bare except swallowed the unbound-variable error.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except Exception:
        return "Bad Raw Text"

    # Text that happens to equal one of the status strings is passed through
    # untouched, exactly as before.
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = jc.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = jc.character_transform(ref_text, character_trans)
    return ref_text
#### this is taking a string(denoting the util_library), a dictionary (of what is to be printed)
#### a file path to make the xlsx file from, and a flag so you know what type of data you have?
##book_name=print_to_workbook(utility_library,raw_text_dict,ocr_text_path,"raw_text")
def _start_tab_with_header(wb, tab_name, column_keys):
    """Create the sheet *tab_name* in *wb*, write *column_keys* across its first row and return it."""
    wb.create_sheet(-1, tab_name)
    ws = wb.get_sheet_by_name(tab_name)
    for column, key in enumerate(column_keys):
        # NOTE(review): row=0 / column=0 is zero-based addressing, kept
        # exactly as it was; confirm it matches the installed openpyxl.
        ws.cell(row=0, column=column).value = key
    return ws


def print_to_workbook(data_order_list, dict_to_print, source_path, tab_name):
    """Append one row of values to a tab of the xlsx file next to *source_path*.

    The workbook name is *source_path* with its extension replaced by
    '.xlsx'. If that workbook can be loaded and already has the tab, the row
    is added to it. If the tab is missing it is created with a header row.
    If the workbook cannot be loaded, or the tab cannot be created in it, a
    new workbook is started in memory and saved under that name.

    Args:
        data_order_list: utility library; the column order is read from
            data_order_list['library_info']['collection_order'].
        dict_to_print: dictionary holding one value per column key.
        source_path: path of the text file the data came from; it must
            contain a '.'.
        tab_name: name of the worksheet to write to.

    Returns:
        The path of the workbook that was saved.

    Raises:
        ValueError: if *source_path* contains no '.'.
        KeyError: if the collection order, or a value for one of its keys,
            is missing.
    """
    ## Openpyxl library imports
    from openpyxl import Workbook
    from openpyxl import load_workbook

    ## The excel filename comes from the name of the text file
    book_name = source_path[:source_path.rindex('.')] + '.xlsx'
    column_keys = data_order_list['library_info']['collection_order']

    try:
        wb = load_workbook(book_name)
    except Exception:
        # Didn't find (or couldn't read) the workbook
        wb = None

    ws = None
    if wb is not None:
        try:
            # Depending on the library version this either returns None or
            # raises when the tab is missing; both mean "not found".
            ws = wb.get_sheet_by_name(tab_name)
        except Exception:
            ws = None
        if ws is None:
            # Found workbook, but didn't find tab
            try:
                ws = _start_tab_with_header(wb, tab_name, column_keys)
            except Exception:
                # Same fallback as before: give up on the loaded workbook.
                wb = None

    if wb is None:
        ## Start a new workbook in memory
        wb = Workbook()
        ws = _start_tab_with_header(wb, tab_name, column_keys)

    # The .row of the last occupied row is used as the row argument for the
    # new cells. NOTE(review): this lands on the next free row only if
    # cell() is zero-based while .row is one-based; confirm.
    last_occ_row = ws.rows[-1][0].row
    for column, key in enumerate(column_keys):
        cell = ws.cell(row=last_occ_row, column=column)
        try:
            cell.value = dict_to_print[key].strip()
        except Exception:
            # Using this area to keep track of encoding errors
            print("This text caused an error")
            print(dict_to_print[key])
            cell.value = "Raw text contained bad chars, see intepreter."
    wb.save(book_name)
    return book_name
def txt2xlsx(ocr_text_path, utility_library, unicode_chars):
    """Extract the data of every page of an OCR text file into an xlsx file.

    For each page returned by get_document_list, and for each key of the
    library's collection order, the raw characters are collected with
    get_raw_chars and refined with get_ref_text. Each page is then written
    as one row to the "raw_text" tab and one row to the "ref_text" tab of
    the workbook by print_to_workbook.

    Args:
        ocr_text_path: path of the text file containing the OCR'ed text.
        utility_library: dictionary providing
            ['library_info']['collection_order'] and, per key of that order,
            an entry in ['extraction_parameters'].
        unicode_chars: character replacement dictionary, handed through to
            get_raw_chars.

    Returns:
        The workbook path returned by print_to_workbook, or None if the
        document has no pages (this used to fail on an unbound variable).
    """
    document_list = get_document_list(ocr_text_path)
    collection_order = utility_library['library_info']['collection_order']
    extraction_parameters = utility_library['extraction_parameters']

    book_name = None
    for page_number, page_text in enumerate(document_list, 1):
        print("Extracting Raw Text for Page: " + str(page_number))
        # The dictionaries hold one page of data, not the whole file.
        raw_text_dict = {}
        ref_text_dict = {}
        for key in collection_order:
            library_entry = extraction_parameters[key]
            raw_text = get_raw_chars(page_text, library_entry, unicode_chars)['raw_text']
            raw_text_dict[key] = raw_text
            ref_text_dict[key] = get_ref_text(raw_text, library_entry)
        book_name = print_to_workbook(utility_library, raw_text_dict, ocr_text_path, "raw_text")
        book_name = print_to_workbook(utility_library, ref_text_dict, ocr_text_path, "ref_text")
    return book_name
#book_name=txt2xlsx(0)