def merger(dollars_divided, geo):
    '''
    Merges the dollars_divided and geo dataframes, coalescing the values
    across matching columns. Drops unwanted columns. Returns a dataframe.
    '''
    # Define the arguments to merge_coalesce
    keys = ['Address', 'City', 'State', 'ZipCode']
    sfx = '_R'
    how = 'left'
    # Merge dollars_divided and geo together, filling in coordinates
    df = u.merge_coalesce(dollars_divided, geo, keys, sfx, how)
    # Drop these columns
    df = df.drop([
        'ClusterID', 'VendorName_LINK1', 'VendorName_LINK2', 'Name',
        'CSDS_Vendor_ID_LINK2'
    ], axis=1)
    # Drop duplicates based only on this subset
    subset = ['CSDS_Vendor_ID', 'Address', 'City', 'State', 'ZipCode']
    return df.drop_duplicates(subset=subset).reset_index(drop=True)
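# NOTE: merge_coalesce() lives in the project's utils module (imported as
# `u`) and its implementation is not shown in this file. The sketch below is
# a hypothetical stand-in, inferred from the call sites here: merge two
# dataframes on `keys`, then, for each column present on both sides (the
# right side's copy carrying `sfx`), fill gaps in the left column from the
# right and drop the suffixed duplicate. It is for orientation only, not the
# project's actual implementation.
def _merge_coalesce_sketch(left, right, keys, sfx='_R', how='left'):
    '''Illustrative approximation of u.merge_coalesce().'''
    if isinstance(keys, str):
        keys = [keys]
    merged = left.merge(right, on=keys, how=how, suffixes=('', sfx))
    for col in [c for c in merged.columns if c.endswith(sfx)]:
        base = col[:-len(sfx)]
        # Keep the left-hand value where present; otherwise take the right
        merged[base] = merged[base].combine_first(merged[col])
        merged = merged.drop(col, axis=1)
    return merged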
def try_fill(df):
    '''
    Fills in missing zip codes and coordinates as best as possible. Copies in
    values from elsewhere in the dataset and from the geocoded HQ addresses.
    Returns a dataframe.
    '''
    # Print progress report
    print('\nFilling in missing zip codes and coordinates as best as possible')
    # Fill in missing zip codes as best as possible
    targetsZ = ['ZipCode']
    keys1Z = ['Address', 'City', 'State']
    keys2Z = ['Name', 'Longitude', 'Latitude']
    df = filler(df, targetsZ, str, keys1Z, keys2Z)
    # Fill in missing longitude and latitude coordinates as best as possible
    targetsL = ['Longitude', 'Latitude']
    keys1L = ['Address', 'City', 'State', 'ZipCode']
    keys2L = ['Name']
    df = filler(df, targetsL, float, keys1L, keys2L)
    # Read in the geocoded HQ addresses and fill in zip codes and coordinates
    # as best as possible
    geo = read_geo()
    subset = ['Address', 'City', 'State']
    df = u.merge_coalesce(df.reset_index(drop=True), geo, subset)
    return df
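# NOTE: filler() is defined elsewhere in this module. A minimal sketch of the
# assumed behavior, judging from how try_fill() calls it: for each target
# column, copy a known value into rows where it is missing from other rows
# that share the same key values, casting the result to the given type. The
# real implementation may differ.
def _filler_sketch(df, targets, astype, keys1, keys2):
    '''Illustrative approximation of filler().'''
    import pandas as pd
    for keys in (keys1, keys2):
        for target in targets:
            # Within each key group, propagate any known value into the rows
            # where the target is missing
            df[target] = df.groupby(keys)[target].transform(
                lambda s: s.ffill().bfill())
    # Cast filled values to the expected type, leaving missing values alone
    for target in targets:
        df[target] = df[target].map(
            lambda v: astype(v) if pd.notna(v) else v)
    return df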
def preprocess_contracts():
    '''
    Reads in the contract records. Preprocesses them to clean the amounts and
    keep only those over the minimum amount specified in the MIN_DOLLARS
    constant. Imports hand-collected addresses for Cook and IL contracts and
    merges in addresses from IRS990 forms to fill in as many blanks as
    possible. Returns a dataframe.
    '''
    # Read in the contracts and clean the dollar amounts
    contracts = read_contracts()
    contracts = clean_amounts(contracts)
    # Read in the COOK addresses dataset
    cook = import_addresses('cook')
    # Fill in addresses from the COOK dataset, matching on VendorName; then,
    # standardize VendorName
    print('Coalescing COOK address matches')
    merged = u.merge_coalesce(contracts, cook, 'VendorName')
    merged['VendorName'] = merged['VendorName'].apply(stdname)
    # Read in the IRS dataset
    irs = import_addresses('irs')
    # Get a dataframe of JW similarity matches >= JWSIM_THRESH between the
    # merged and irs dataframes
    sfx = '_IRS'
    jwsim = jwsim_contracts_irs(merged, irs, sfx)
    # Print progress report
    print('Coalescing IRS matches')
    # Fill in addresses from the IRS dataset
    coalesced = coalesce_matches(merged, jwsim, sfx)
    # Read in the IL addresses dataset
    il = import_addresses('il')
    # Print progress report
    print('Coalescing IL matches')
    # Fill in addresses from the IL dataset, matching on VendorName
    df = u.merge_coalesce(coalesced, il, 'VendorName')
    return df
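# NOTE: jwsim_contracts_irs() is defined elsewhere, and JWSIM_THRESH is a
# module constant not shown here. As a rough sketch of the assumed matching
# step, using the jellyfish library's Jaro-Winkler implementation: score
# candidate vendor-name pairs and keep those at or above the threshold. The
# real function presumably prunes the candidate pairs rather than comparing
# every row against every other, as this naive version does.
def _jwsim_sketch(merged, irs, sfx, thresh=0.9):  # thresh stands in for JWSIM_THRESH
    '''Illustrative approximation of jwsim_contracts_irs().'''
    import jellyfish
    import pandas as pd
    rows = []
    for _, c in merged.iterrows():
        for _, i in irs.iterrows():
            score = jellyfish.jaro_winkler_similarity(
                c['VendorName'], i['VendorName'])
            if score >= thresh:
                # Keep the contract fields plus the suffixed IRS fields
                rows.append({**c.to_dict(),
                             **{k + sfx: v for k, v in i.to_dict().items()},
                             'JWSimilarity': score})
    return pd.DataFrame(rows)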
def coalesce_matches(contracts, jwsim, suffix):
    '''
    Pulls in the addresses from IRS records previously deemed to match the IL
    agencies. Returns a dataframe.
    '''
    jwsim = trim_jwsim(jwsim, suffix)
    # Define the key on which to coalesce
    keys = ['CSDS_Contract_ID']
    # Fill in missing values in contracts from matches in jwsim, matching on
    # keys
    df = u.merge_coalesce(contracts, jwsim, keys, suffix)
    return df
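# NOTE: trim_jwsim() is defined elsewhere. The assumed intent, sketched
# below with guessed column names: narrow the similarity matches down to the
# contract ID plus the suffixed address fields, so that the merge_coalesce()
# call above pulls in only addresses and nothing else from the IRS records.
def _trim_jwsim_sketch(jwsim, suffix):
    '''Illustrative approximation of trim_jwsim(); column names are guesses.'''
    address_cols = [c + suffix for c in
                    ('Address', 'City', 'State', 'ZipCode')]
    keep = ['CSDS_Contract_ID'] + [c for c in address_cols
                                   if c in jwsim.columns]
    return jwsim[keep]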
def fix_duplicate_addresses(df, key='ClusterID', target='Address_SVC'):
    '''
    Takes in a dataframe. Attempts to fix duplicate addresses (by default, in
    the 'Address_SVC' field) if they have the same key (by default, the
    'ClusterID' field). Returns a dataframe.
    '''
    print('\nFixing duplicate addresses')
    # Sort the target field by length, longest to shortest
    sorter = df[target].str.len().sort_values(ascending=False).index
    df = df.reindex(sorter)
    # Make a mini version of the dataframe with two fields, the key & the
    # target (which has been renamed to indicate it's the original field)
    minimized_df = df[[key, target]].drop_duplicates().dropna()
    minimized_df[target + '_Original'] = minimized_df[target]
    # Make a list of the unique values in the key field
    unique_keys = list(minimized_df[key].unique())
    # Set a flag to FALSE
    new_df_exists = False
    # OVERVIEW: Call the iter_df() function on subsets of the dataframe (one
    # subset per key) to compare and fix the address strings assigned to that
    # key.
    # For each value in the list of unique keys:
    #   Make a mini dataframe that is just the rows corresponding to that key
    #   If there is more than 1 row:
    #     Call iter_df() on the mini df & assign the result to local_df2
    #     If the new_df_exists flag is set to TRUE:
    #       Create new_df by concatenating the existing new_df and local_df2
    #     Else:
    #       Assign the name new_df to local_df2 and set new_df_exists to TRUE
    for uKey in unique_keys:
        local_df = minimized_df[minimized_df[key] == uKey]
        if len(local_df) > 1:
            local_df2 = iter_df(
                local_df.copy().drop_duplicates().reset_index(drop=True),
                target)
            if new_df_exists:
                new_df = pd.concat([new_df, local_df2])
            else:
                new_df = local_df2
                new_df_exists = True
    print('Coalescing fixed addresses')
    # Rename the columns in preparation for calling merge_coalesce()
    new_cols = {target: target + '_COAL', target + '_Original': target}
    new_df = new_df.rename(columns=new_cols, index=str)
    # Rename the columns in preparation for calling merge_coalesce()
    min_cols = {target + '_Original': target + '_COAL'}
    minimized_df = minimized_df.rename(columns=min_cols, index=str)
    # Coalesce with the dfs in this order so that we keep the new values
    merged = u.merge_coalesce(new_df, minimized_df, [key, target], how='right')
    # Merge the new address strings in, drop the original field, and rename
    # the new one
    df = df.merge(merged, how='left').drop(target, axis=1)
    df = df.rename(columns={target + '_COAL': target}, index=str)
    return df
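# NOTE: iter_df() is defined elsewhere in this module. A minimal sketch of
# the assumed behavior: within one key group (already sorted longest address
# first, and reindexed from 0), treat a shorter address that the longest one
# starts with as the same address truncated, and rewrite it to the longest
# variant. The *_Original column added above keeps the pre-fix string so the
# result can be merged back. The real similarity test may well be fuzzier
# than this prefix check.
def _iter_df_sketch(local_df, target):
    '''Illustrative approximation of iter_df().'''
    canonical = local_df.loc[0, target]  # longest string, given the sort
    for idx in local_df.index[1:]:
        candidate = local_df.loc[idx, target]
        # ASSUMPTION: prefix match means same address, truncated
        if canonical.upper().startswith(candidate.upper()):
            local_df.loc[idx, target] = canonical
    return local_df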