def read_svc():
    '''
    Loads the service agency file (SVC) and prepares it for linking.

    Every column except the service ID gets an '_SVC' suffix (via
    u.rename_cols), and the service ID column is renamed to
    'CSDS_Vendor_ID_LINK2'. The COMPARE_ADDRESSES module is then used to
    reconcile multiple spellings of the same address, and rows that repeat the
    same (ID, address) pair are dropped.

    Returns a dataframe.
    '''

    # Progress report
    print('\nReading in service agencies')

    # Load the service agencies; keep zip codes as strings
    svc = pd.read_csv(SVC, converters={'ZipCode': str})

    # Every column other than the service ID gets the '_SVC' suffix
    cols_to_suffix = [col for col in svc.columns if col != 'CSDS_Svc_ID']
    svc = u.rename_cols(svc, cols_to_suffix, '_SVC')

    # Rename the ID column so it lines up with the linker file. Note that
    # index=str also maps the row labels through str().
    svc = svc.rename(columns={'CSDS_Svc_ID': 'CSDS_Vendor_ID_LINK2'},
                     index=str)

    # Fields identifying one address record: the ID and the address string
    key = 'CSDS_Vendor_ID_LINK2'
    target = 'Address_SVC'

    # Let COMPARE_ADDRESSES collapse the multiple strings that refer to a
    # single address record, then keep one row per (key, target) pair
    deduped = ca.fix_duplicate_addresses(svc, key, target)
    return deduped.drop_duplicates(subset=[key, target])
def jwsim_contracts_irs(contracts, irs, suffix):
    '''
    Takes the contracts and IRS dataframes and returns a dataframe of records
    with matching names where the JW similarity is >= JWSIM_THRESH.

    contracts: dataframe with (at least) the columns CSDS_Contract_ID and
               VendorName. Only rows whose CSDS_Contract_ID starts with 'IL'
               are used; rows with a missing ID are treated as non-IL.
    irs:       dataframe whose columns are all renamed with `suffix` (via
               u.rename_cols) before being paired with the contracts.
    suffix:    string appended to the IRS column names; the comparison is
               made between 'VendorName' and 'VendorName' + suffix.

    Returns the rows of the result of mn.parallelize(mn.jwsim, ...) whose
    JWSimilarity column is >= JWSIM_THRESH.
    '''

    # Rename the columns in IRS:
    irs = u.rename_cols(irs, irs.columns, suffix)

    # Restrict the contracts df to just those from IL. na=False makes a
    # missing contract ID count as "not IL" instead of producing a NaN in the
    # mask, which pandas refuses to use for boolean indexing.
    is_il = contracts.CSDS_Contract_ID.str.startswith('IL', na=False)
    contracts = contracts[is_il]

    # Take the cartesian product between the two; replace NaN with ''.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    prod = mn.cart_prod(contracts, irs)
    prod = prod.replace(np.nan, '')

    # Print progress report
    print('Calculating Jaro-Winkler similarity on vendor names')

    # Compute the Jaro-Winkler similarity on the VendorName cols
    col1 = 'VendorName'
    arg = (prod, col1, col1 + suffix)
    jwsim = mn.parallelize(mn.jwsim, arg)

    # Return only the rows where JW similarity >= JWSIM_THRESH
    return jwsim[jwsim.JWSimilarity >= JWSIM_THRESH]
def linker():
    '''
    Builds HQ-to-service-agency link pairs from the linker file.

    The linker dataframe (from read_linker) is joined to a renamed copy of
    itself, and any pair whose two vendor IDs are equal is discarded, so only
    matches between different vendor IDs remain. No explicit join key is
    given, so the merge uses whatever columns the two copies still share
    after renaming (presumably the cluster ID -- confirm against
    read_linker).

    Returns a dataframe with a fresh 0..n-1 index.
    '''

    # Load the linker file
    base = read_linker()

    # Left side: only the vendor name is renamed (index=str also maps the row
    # labels through str())
    left = base.rename(columns={'VendorName': 'VendorName_LINK1'}, index=str)

    # Right side: vendor name and vendor ID both get the '_LINK2' suffix
    right = u.rename_cols(base, ['VendorName', 'CSDS_Vendor_ID'], '_LINK2')

    # Left-join the two copies on their shared columns
    pairs = left.merge(right, how='left')

    # Keep only pairs whose vendor IDs differ, then renumber the rows
    ids_differ = pairs['CSDS_Vendor_ID'] != pairs['CSDS_Vendor_ID_LINK2']
    return pairs[ids_differ].reset_index(drop=True)