-
Notifications
You must be signed in to change notification settings - Fork 0
/
mailMerge.py
291 lines (236 loc) · 10.6 KB
/
mailMerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import pandas as pd
import numpy as np
import glob
import os
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import probablepeople
import usaddress
from argparse import ArgumentParser
from gooey import Gooey, GooeyParser
@Gooey(program_name="Mail Merge",
       progress_regex=r"^progress: (\d+)%$",
       disable_stop_button=True)
def parse_args():
    """Build the GUI argument parser and persist the chosen values.

    Previously-chosen arguments are stored in a JSON file next to the
    script (``<script>-args.json``) and offered as defaults on the next
    run.

    Returns:
        argparse.Namespace with ``data_directory`` and ``output_directory``.
    """
    # BUG FIX: progress_regex / disable_stop_button used to be dead
    # function parameters; they are Gooey decorator options and have been
    # moved onto @Gooey where they actually take effect.
    stored_args = {}
    # Derive "<script>-args.json" from this file's own name.
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    args_file = "{}-args.json".format(script_name)
    # Load the prior run's arguments (if any) to use as defaults.
    if os.path.isfile(args_file):
        with open(args_file) as data_file:
            stored_args = json.load(data_file)
    parser = GooeyParser(description='Mail Merge')
    parser.add_argument('data_directory',
                        action='store',
                        default=stored_args.get('data_directory'),
                        widget='DirChooser',
                        help="Source directory that contains Excel files")
    parser.add_argument('output_directory',
                        action='store',
                        widget='DirChooser',
                        default=stored_args.get('output_directory'),
                        help="Output directory to save merged files")
    args = parser.parse_args()
    # Remember these values so they become the defaults next time.
    with open(args_file, 'w') as data_file:
        json.dump(vars(args), data_file)
    return args
def combine_files(src_directory):
    """Read every Excel file in *src_directory* and combine all sheets.

    Each sheet's columns are normalized via rename_columns() before the
    rows are concatenated.

    Args:
        src_directory (str): directory scanned (non-recursively) for
            .xls / .xlsx / .xlsm / .XLSX files.

    Returns:
        pandas.DataFrame: all rows from all sheets with a fresh 0..n index.
    """
    # BUG FIX: the old pattern '*XLSX' was missing its dot, so it matched
    # names merely *ending* in "XLSX" instead of the upper-case extension.
    file_types = ['*.xls', '*.xlsx', '*.xlsm', '*.XLSX']
    filelist = []
    for ftype in file_types:
        filelist.extend(glob.glob(os.path.join(src_directory, ftype)))
    frames = []
    for f in filelist:
        for sheet in pd.ExcelFile(f).sheet_names:
            df = pd.read_excel(f, sheet)
            frames.append(rename_columns(df))
    # Collect frames and concatenate once instead of repeated append().
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    # BUG FIX: reset_index() returns a new frame; the old call discarded it.
    all_data = all_data.reset_index(drop=True)
    print('alldata %s' % all_data[10:100])
    return all_data
def dedup(df):
    """Return *df* with rows dropped that duplicate a full mailing address.

    BUG FIX: drop_duplicates() is not in-place by default; the old version
    computed the de-duplicated frame, threw it away, and returned None.
    """
    return df.drop_duplicates(
        subset=['Address 1', 'Address 2', 'City', 'State', 'Zipcode'])
def save_results(output, dataframes=None):
    """Write each result DataFrame out as its own Excel workbook.

    Args:
        output (str): directory the workbooks are written into.
        dataframes (dict, optional): mapping of file name -> DataFrame.
            Defaults to the module-level ``output_df`` dict, preserving the
            original global-based behavior for existing callers.
    """
    # BUG FIX: the old body read the global ``output_df`` even though the
    # signature never said so; make the dependency explicit but keep the
    # global as the default so save_results(output) still works.
    if dataframes is None:
        dataframes = output_df
    # items() instead of py2-only iteritems(); works on both interpreters.
    for filename, frame in dataframes.items():
        output_file = os.path.join(output, filename)
        writer = pd.ExcelWriter(output_file, engine='xlsxwriter')
        frame.to_excel(writer)
        writer.save()
#-------------------------------------------------------------------------------------#
# Check address update 4 dataframes
def addresscheck(df, usadd, wrongaddress):
    """Classify each row of *df* as a parseable US address or a bad one.

    Rows usaddress can tag unambiguously are appended to *usadd*; rows
    that raise RepeatedLabelError or tag as 'Ambiguous' go to
    *wrongaddress*.  Rows containing campus markers (zips 14260/14261,
    'Clement', 'Goodyear') are excluded from the tagged buckets.  Both
    target frames are mutated in place; nothing is returned.
    """
    campus_markers = ('14261', '14260', 'Clement', 'Goodyear')

    for index, row in df.iterrows():
        addressstring = (unicode(row['Address 1']) + ' ' +
                         unicode(row['Address 2']) + ' ' +
                         unicode(row['City']) + ' ' +
                         unicode(row['State']) + ' ' +
                         unicode(row['Zipcode']))
        try:
            tagged = usaddress.tag(addressstring)
        except usaddress.RepeatedLabelError:
            wrongaddress.loc[len(wrongaddress)] = df.iloc[index]
            # BUG FIX: the old code fell through after the except and
            # re-tested a stale result from a *previous* row (NameError on
            # the very first row), double-counting the bad address.
            continue
        # On-campus addresses are skipped entirely.
        if any(marker in addressstring for marker in campus_markers):
            continue
        if tagged[1] != 'Ambiguous':
            usadd.loc[len(usadd)] = df.iloc[index]
        else:
            wrongaddress.loc[len(wrongaddress)] = df.iloc[index]
#-------------------------------------------------------------------------------------#
# Mail Merge methods
def guess_column_names(columnname):
    """Standardize a column header by fuzzy-matching against known names.

    Args:
        columnname (str): header as it appears in the source spreadsheet.

    Returns:
        str: the best-matching canonical header, or *columnname* unchanged
        when it matches exactly or when no candidate scores >= 80.
    """
    # Canonical headers we try to map every spreadsheet column onto.
    # (FIX: the list previously contained 'Work Street 2' twice.)
    correct_headers = ['First Name', 'Last Name', 'Fullname', 'Student Name', 'Job Title', 'Title', 'ID',
                       'Institution', 'School', 'Company', 'Company Name1', 'Company Name2', 'Organization Name', 'Department', 'Division',
                       'Email Address', 'Street Address', 'Street 1', 'Dorm Address 1', 'Dorm Address 2', 'Dorm Address 3',
                       'Dorm Address 4', 'Address 1', 'Street 2', 'Address 2', 'Address', 'Street 3', 'Address 3', 'Street 4', 'Address 4',
                       'Work Street 1', 'Work Street 2', 'Work Street 3', 'Work Street 4',
                       'Zipcode', 'Home Zipcode', 'Work_City', 'Dorm Postalplus4', 'HOME_FOREIGN_CITYZIP', 'WORK_FOREIGN_CITYZIP', 'Work_State', 'Work_Country',
                       'Postal', 'City', 'County', 'State', 'Country']
    # Exact matches need no fuzzy work.
    if columnname in correct_headers:
        return columnname
    # Very long headers tend to carry the meaningful part at the end (e.g.
    # address part numbers), so match on the last 15 characters only.
    if len(columnname) > 20:
        new_name, score = process.extractOne(columnname[-15:], correct_headers)
        # NOTE(review): no score threshold on this branch, mirroring the
        # original — a long header is always remapped to its best guess.
        return new_name
    new_name, score = process.extractOne(columnname, correct_headers)
    # Below 80 the match is considered unreliable; keep the original name.
    if score < 80:
        return columnname
    return new_name
def unique_columns(df_columns):
    """Yield column names made unique by suffixing repeats with a counter.

    The first occurrence of a name passes through unchanged; later
    occurrences become "<name> 2", "<name> 3", ... so pandas never sees
    two identical column labels.
    """
    used = set()
    for original in df_columns:
        candidate = original
        counter = 1
        while candidate in used:
            counter += 1
            candidate = "{} {}".format(original, counter)
        used.add(candidate)
        yield candidate
def drop_columns(df):
    """Drop (in place) every column that is not a canonical mail-merge
    header, then return the same DataFrame."""
    keep = {'First Name', 'Last Name', 'Fullname', 'Title',
            'Company', 'Department',
            'Address 1', 'Address 2',
            'City', 'State', 'Zipcode', 'Country'}
    unwanted = [name for name in df.columns if name not in keep]
    df.drop(unwanted, axis=1, inplace=True)
    return df
def reorder_columns(dataframe, seq):
    """Return *dataframe* with the columns of *seq* first, in that order.

    Columns named in *seq* but missing from the frame are created and
    filled with NaN; all remaining columns follow in their original order.
    """
    ordered = seq[:]
    # Append the frame's remaining columns after the requested prefix.
    for col in dataframe.columns:
        if col not in ordered:
            ordered.append(col)
    # Materialize any requested columns the frame lacks.
    for col in ordered:
        if col not in dataframe.columns:
            dataframe[col] = np.nan
    return dataframe[ordered]
def nameMerge(df):
    """Build a 'Fullname' column as "Last, First<Fullname>" for every row.

    Missing (NaN) name parts are treated as empty strings.  The frame is
    modified in place; nothing is returned.

    NOTE(review): when both name parts and an existing Fullname are present
    the result concatenates them with no separator — preserved from the
    original implementation; confirm this is the intended format.
    """
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; the
    # .values accessor is the drop-in equivalent.
    name_parts = df[['First Name', 'Last Name', 'Fullname']].values
    merged = []
    for first, last, fullname in name_parts:
        first = str(first)
        last = str(last)
        fullname = str(fullname)
        # str(np.nan) == 'nan'; blank out missing parts.
        if first == "nan":
            first = ""
        if last == "nan":
            last = ""
        if fullname == "nan":
            fullname = ""
        merged.append(last + ", " + first + fullname)
    df["Fullname"] = merged
def rename_columns(df):
    """Normalize a raw sheet's columns into the canonical mail-merge layout.

    Pipeline: fuzzy-correct each header, apply explicit renames, make the
    labels unique, drop non-canonical columns, then reorder.

    Returns:
        pandas.DataFrame: the transformed frame.
    """
    # Fuzzy-correct each header toward the known canonical names.
    df.columns = [guess_column_names(col) for col in df.columns]
    # Explicit renames.  NOTE: pandas rename() is a single-pass mapping and
    # does NOT chain, so 'Street Address' must map straight to 'Address 1'.
    # (BUG FIX: it previously mapped to 'Street 1', which drop_columns then
    # removed because the 'Street 1'->'Address 1' entry never re-applied.)
    df.rename(columns={'School': 'Department',
                       'Institution': 'Company',
                       'Organization': 'Company',
                       'Organization Name': 'Company',
                       'Street Address': 'Address 1',
                       'Division': 'Department',
                       'Postal': 'Zipcode',
                       'Zip': 'Zipcode',
                       'Home Zipcode': 'Zipcode',
                       'Street 1': 'Address 1',
                       'Street 2': 'Address 2',
                       'Street 3': 'Address 3',
                       'PERSON_NAME': 'Fullname',
                       'Student Name': 'Fullname'
                       }, inplace=True)
    # De-duplicate labels so pandas indexing stays unambiguous.
    df.columns = list(unique_columns(df.columns))
    # Keep only the canonical columns...
    df = drop_columns(df)
    # ...and put them in the canonical order (missing ones become NaN).
    df = reorder_columns(df, ['First Name', 'Last Name', 'Fullname', 'Title',
                              'Company', 'Department', 'Address 1', 'Address 2',
                              'City', 'State', 'Zipcode', 'Country'])
    return df
# -------------------------------------------------------------------------------#
# Main
if __name__ == '__main__':
    # Parse (and persist) the source/output directories via the Gooey GUI.
    conf = parse_args()
    ## --- setup dataframes
    # Canonical column layout shared by every result frame.
    ls = ('First Name','Last Name','Fullname','Title','Company','Department','Address 1','Address 2','City','State','Zipcode','Country')
    usadd = pd.DataFrame(columns=ls)
    wrongaddress = pd.DataFrame(columns=ls)
    # NOTE: save_results() reads this module-level dict directly; the two
    # frames are filled in place by addresscheck() below.
    output_df = {'usAddress.xlsx':usadd,'wrongAddress.xlsx':wrongaddress}
    print("Reading files and combining")
    all_df = combine_files(conf.data_directory)
    print("dropping duplicate address")
    # Rows count as duplicates only when the entire mailing address matches.
    all_df = all_df.drop_duplicates(subset=['Address 1', 'Address 2', 'City', 'State', 'Zipcode'])
    print("checking address")
    # Partition rows into parseable US addresses vs. problem addresses.
    addresscheck(all_df,usadd,wrongaddress)
    print("Saving data")
    save_results(conf.output_directory)
    print("Done")