def preprocess(sfdf, repdf, keys):
    """Clean the contact columns of both frames and add composite key columns.

    Both frames are modified in place and are also returned.

    Args:
        sfdf: DataFrame providing the columns ``FirstName``, ``LastName``,
            ``Email``, ``MailingState``, ``MailingPostalCode``,
            ``MailingCity``, ``Phone`` and ``CRD__c``.
        repdf: DataFrame providing the same columns as ``sfdf``.
        keys: Iterable of key specifications. Every element except the last
            one of a specification is used as a column name; the last element
            is not used here (presumably metadata for a later step --
            confirm with the caller).

    Returns:
        Tuple ``(sfdf, repdf, key_list)``. ``key_list`` contains, in
        first-seen order and without duplicates, the name of every composite
        key column that was created and, for specifications with at most one
        column, the first element of the specification.
    """
    # Column -> normaliser, in the order the columns are processed.
    # `clean` and `phonenumbers` are defined outside this function
    # (presumably recordlinkage.preprocessing -- confirm against the imports).
    # NOTE(review): postal codes go through `phonenumbers` as well.
    cleaning_steps = (
        ('FirstName', clean),
        ('LastName', clean),
        ('Email', clean),
        ('MailingState', clean),
        ('MailingPostalCode', phonenumbers),
        ('MailingCity', clean),
        ('Phone', phonenumbers),
    )

    for df in (sfdf, repdf):
        for column, normalise in cleaning_steps:
            # DataFrame.update matches the returned Series to its column by
            # name and writes the values back in place.
            df.update(normalise(df[column]))
        # CRD__c is converted to text before it is cleaned.
        df.update(clean(df['CRD__c'].astype(str)))

    key_list = []
    for df in (sfdf, repdf):
        for key in keys:
            columns = key[:-1]
            if len(columns) > 1:
                # Column name = upper-case letters of every source column,
                # e.g. FirstName + LastName -> "FNLN".
                key_col = ''.join(
                    ch for name in columns for ch in name if ch.isupper()
                )
                if key_col not in key_list:
                    key_list.append(key_col)
                # Concatenate Series to Series so the result keeps the frame's
                # own index; wrapping a raw array in a new Series would get a
                # fresh 0..n-1 index and misalign on assignment.
                pieces = [df[name].astype(str) for name in columns]
                combined = pieces[0]
                for piece in pieces[1:]:
                    combined = combined + piece
                df[key_col] = combined
            elif key[0] not in key_list:
                key_list.append(key[0])

    return sfdf, repdf, key_list
def test_clean_phonenumbers(self):
    """Check ``phonenumbers`` against a handful of raw/expected pairs.

    The samples cover a missing value (stays NaN), a number that is already
    clean, and two numbers whose spaces and dashes must be removed while the
    leading ``+`` is kept.
    """
    cases = [
        (np.nan, np.nan),
        ('0033612345678', '0033612345678'),
        ('+1 201 123 4567', '+12011234567'),
        ('+336-123 45678', '+33612345678'),
    ]
    raw = pd.Series([given for given, _ in cases])
    wanted = pd.Series([want for _, want in cases])

    # The cleaned series must be identical to the expected one.
    pdt.assert_series_equal(phonenumbers(raw), wanted)
def clean_data(df):
    """Return a cleaned copy of ``df`` with extra ``*_clean`` columns.

    Steps:
        1. Cells that are empty or whitespace-only become NaN.
        2. Rows whose ``customer_id`` is missing are dropped.
        3. ``phone_clean`` is derived with ``phonenumbers``; the columns
           ``first_name``, ``last_name``, ``address``, ``city`` and ``state``
           each get a ``<name>_clean`` counterpart derived with ``clean``.
        4. ``zip_clean`` is the cleaned part of ``zip`` before the first
           ``-``.

    Args:
        df: DataFrame with the columns ``customer_id``, ``phone``,
            ``first_name``, ``last_name``, ``address``, ``city``, ``state``
            and ``zip`` (``zip`` must hold strings, since the ``.str``
            accessor is used on it).

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # Take an explicit copy of the filtered rows: columns are added below, and
    # assigning to the bare result of a boolean filter triggers pandas'
    # SettingWithCopyWarning.
    df = df[df['customer_id'].notna()].copy()

    # `clean` and `phonenumbers` are defined outside this function
    # (presumably recordlinkage.preprocessing -- confirm against the imports).
    df['phone_clean'] = phonenumbers(df['phone'])
    for column in ('first_name', 'last_name', 'address', 'city', 'state'):
        df[column + '_clean'] = clean(df[column])

    # Keep only the part of the zip code before the first dash.
    df['zip_clean'] = clean(df['zip'].str.split('-').str[0])
    return df
def preprocess(df1, df2):
    """Apply the same clean-up pipeline to two frames.

    For each frame: index by ``id``, turn empty strings into NaN, drop the
    ``country``, ``locality`` and ``region`` columns, convert ``phone`` and
    ``postal_code`` to numeric, clean ``street_address`` and ``website``, and
    finally replace NaN with 0 in the columns ``latitude``, ``longitude``,
    ``phone`` and ``postal_code``.

    Args:
        df1: DataFrame with the columns ``id``, ``country``, ``locality``,
            ``region``, ``phone``, ``postal_code``, ``street_address``,
            ``website``, ``latitude`` and ``longitude``.
        df2: DataFrame with the same columns as ``df1``.

    Returns:
        Tuple ``(df1, df2)`` of new, processed frames.

    Raises:
        ValueError: Propagated from ``pd.to_numeric`` when a phone or postal
            code value cannot be parsed as a number.
    """
    numeric_columns = ['latitude', 'longitude', 'phone', 'postal_code']

    def _prepare(frame):
        # set_index / replace / drop each return a new frame, so the frame
        # passed in by the caller is never modified.
        frame = (
            frame.set_index('id')
            .replace("", np.nan)
            .drop(['country', 'locality', 'region'], axis=1)
        )

        # Plain column assignment replaces the column, so the numeric dtype
        # from to_numeric is kept. `frame.loc[:, col] = ...` writes into the
        # existing column in place on recent pandas and can leave it as
        # object dtype.
        # `phonenumbers` and `clean` are defined outside this function
        # (presumably recordlinkage.preprocessing -- confirm).
        frame['phone'] = pd.to_numeric(phonenumbers(frame['phone']))
        frame['postal_code'] = pd.to_numeric(frame['postal_code'])

        frame['street_address'] = clean(frame['street_address'])
        frame['website'] = clean(frame['website'])

        # Missing numeric values become 0.
        frame[numeric_columns] = frame[numeric_columns].replace(np.nan, 0)
        return frame

    return _prepare(df1), _prepare(df2)
def preprocess(sfdf, repdf):
    """Clean the contact columns of both frames and add composite key columns.

    Both frames are modified in place; nothing is returned. The function
    reads the module-level ``keys`` and appends to the module-level
    ``key_list``.

    Args:
        sfdf: DataFrame providing the columns ``FirstName``, ``LastName``,
            ``Email``, ``State``, ``Zip``, ``City``, ``Phone`` and ``CRD``.
        repdf: DataFrame providing the same columns as ``sfdf``.

    Side effects:
        * Prints ``enter PREPROCESS`` / ``exit PREPROCESS`` to stdout.
        * For every entry of ``keys`` with more than one element, adds a
          column to both frames containing the concatenated string values of
          the listed columns, and records the column name in ``key_list``.
        * For every other entry, records its first element in ``key_list``.
          Names already present in ``key_list`` are not added again.
    """
    global key_list, keys
    print('enter PREPROCESS')

    # Column -> normaliser, in the order the columns are processed.
    # `clean` and `phonenumbers` are defined outside this function
    # (presumably recordlinkage.preprocessing -- confirm against the imports).
    # NOTE(review): zip codes go through `phonenumbers` as well.
    cleaning_steps = (
        ('FirstName', clean),
        ('LastName', clean),
        ('Email', clean),
        ('State', clean),
        ('Zip', phonenumbers),
        ('City', clean),
        ('Phone', phonenumbers),
    )

    for df in (sfdf, repdf):
        for column, normalise in cleaning_steps:
            # DataFrame.update matches the returned Series to its column by
            # name and writes the values back in place.
            df.update(normalise(df[column]))
        # CRD is converted to text before it is cleaned.
        df.update(clean(df['CRD'].astype(str)))

    for df in (sfdf, repdf):
        for key in keys:
            if len(key) > 1:
                # Column name = upper-case letters of every source column,
                # e.g. FirstName + LastName -> "FNLN".
                key_col = ''.join(
                    ch for name in key for ch in name if ch.isupper()
                )
                if key_col not in key_list:
                    key_list.append(key_col)
                # Concatenate Series to Series so the result keeps the frame's
                # own index; wrapping a raw array in a new Series would get a
                # fresh 0..n-1 index and misalign on assignment.
                pieces = [df[name].astype(str) for name in key]
                combined = pieces[0]
                for piece in pieces[1:]:
                    combined = combined + piece
                df[key_col] = combined
            elif key[0] not in key_list:
                key_list.append(key[0])

    print('exit PREPROCESS')