def split_by_missingvalues():
    """Count per-column NaNs around ``to_nan`` and split the demographics frame.

    Operates on the module-level ``cleaner``. The first frame returned by
    ``Cleaner.split_by_treshhold`` replaces ``cleaner.azdias``.

    Returns:
        tuple: ``(counts_before, counts_after, kept_shape, removed_shape)`` where
        the counts are whatever ``Cleaner.count_nan_columns`` returns and the
        shapes belong to the first and second frame of the split.
    """
    counts_before = Cleaner.count_nan_columns(cleaner.azdias)
    cleaner.to_nan()
    counts_after = Cleaner.count_nan_columns(cleaner.azdias)

    # The split yields four values; only the two frames are used here.
    kept, removed, _third, _fourth = Cleaner.split_by_treshhold(cleaner.azdias)
    cleaner.azdias = kept

    return counts_before, counts_after, kept.shape, removed.shape
def test_tonan(self):
    """``to_nan`` must take the AGER_TYP NaN count from 0 to 685843."""
    counts_before = Cleaner.count_nan_columns(self.cleaner.azdias)
    self.cleaner.to_nan()
    counts_after = Cleaner.count_nan_columns(self.cleaner.azdias)

    # Both results are turned into dicts only after ``to_nan`` has run.
    nan_by_column_before = dict(counts_before)
    nan_by_column_after = dict(counts_after)

    self.assertEqual(nan_by_column_before['AGER_TYP'], 0)
    self.assertEqual(nan_by_column_after['AGER_TYP'], 685843)
def test_nans(self):
    """Check known per-column NaN counts and the per-row NaN range."""
    column_counts = Cleaner.count_nan_columns(self.cleaner.azdias)
    row_counts = Cleaner.count_nan_rows(self.cleaner.azdias)
    nan_by_column = dict(column_counts)

    expected_column_nans = (
        ('CAMEO_DEUG_2015', 98979),
        ('PLZ8_ANTG1', 116515),
    )
    for column, expected in expected_column_nans:
        self.assertEqual(nan_by_column[column], expected)

    self.assertEqual(row_counts.min(), 0)
    self.assertEqual(row_counts.max(), 46)
def clean_data(df):
    """Run the full cleaning pipeline on ``df`` and return the result.

    Steps, in order: keep the first frame of
    ``Cleaner.split_by_treshhold(df, treshold=34)``, apply ``Cleaner.to_nan`` with
    the module-level ``cleaner.summary``, then ``recode``, ``extract_features``,
    ``drop`` and ``dummies``.

    Args:
        df: frame to clean.

    Returns:
        The frame returned by the last pipeline step.
    """
    # NOTE(review): the split happens before ``to_nan`` here, whereas
    # ``split_by_missingvalues`` calls ``to_nan`` first — confirm this is intended.
    kept, _removed, _third, _fourth = Cleaner.split_by_treshhold(df, treshold=34)
    frame = Cleaner.to_nan(kept, cleaner.summary)

    for step in (recode, extract_features, drop, dummies):
        frame = step(frame)
    return frame
class TestData(unittest.TestCase): def __init__(self, *args): super().__init__(*args) self.cleaner = Cleaner(data_paths=data) def test_load_data(self): self.assertEqual(self.cleaner.azdias.shape, (891221, 85), "Shape of demography data") self.assertEqual(self.cleaner.customers.shape, (191652, 85), "Shape of customers data") self.assertEqual(self.cleaner.summary.shape, (85, 4), "Shape of customers data") def test_nans(self): nans_col = Cleaner.count_nan_columns(self.cleaner.azdias) nans_row = Cleaner.count_nan_rows(self.cleaner.azdias) d = dict(nans_col) self.assertEqual(d['CAMEO_DEUG_2015'], 98979) self.assertEqual(d['PLZ8_ANTG1'], 116515) self.assertEqual(nans_row.min(), 0) self.assertEqual(nans_row.max(), 46) def test_tonan(self): before_cleaning = Cleaner.count_nan_columns(self.cleaner.azdias) self.cleaner.to_nan() after_cleaning = Cleaner.count_nan_columns(self.cleaner.azdias) db = dict(before_cleaning) da = dict(after_cleaning) self.assertEqual(db['AGER_TYP'], 0) self.assertEqual(da['AGER_TYP'], 685843) def test_split(self): l, g = Cleaner.split_by_treshhold(self.cleaner.azdias) self.assertEqual(l.shape, (798293, 85)) self.assertEqual(g.shape, (92928, 85)) self.cleaner.azdias = l def test_recode(self): Cleaner.recode(self.cleaner.azdias, 'ANREDE_KZ', 2, 0) Cleaner.recode(self.cleaner.azdias, 'OST_WEST_KZ', 'O', 0) Cleaner.recode(self.cleaner.azdias, 'OST_WEST_KZ', 'W', 1) t1 = self.cleaner.azdias[self.cleaner.azdias['ANREDE_KZ'] == 2] t2 = self.cleaner.azdias[self.cleaner.azdias['OST_WEST_KZ'] == 'O'] t3 = self.cleaner.azdias[self.cleaner.azdias['OST_WEST_KZ'] == 'W'] t4 = self.cleaner.azdias[self.cleaner.azdias['OST_WEST_KZ'] == 1] t5 = self.cleaner.azdias[self.cleaner.azdias['OST_WEST_KZ'] == 0] self.assertEqual(len(t1), 0) self.assertEqual(len(t2), 0) self.assertEqual(len(t3), 0) self.assertEqual(len(t4), 629528) self.assertEqual(len(t5), 168545) def test_remap(self): self.cleaner.azdias['PRAEGENDE_JUGENDJAHRE'].apply(Cleaner.remap_jugendjahre()) print('')
def extract_features(df):
    """Add ``decade``, ``movement``, ``wealth`` and ``life_stage`` columns to ``df``.

    ``decade`` and ``movement`` come from two lists handed to
    ``Cleaner.remap_jugendjahre``; the callable it returns is applied to every
    value of ``PRAEGENDE_JUGENDJAHRE`` (presumably appending one entry per row to
    each list — verify against ``Cleaner``). ``wealth`` and ``life_stage`` are the
    first and second character of ``CAMEO_INTL_2015``.

    Args:
        df: frame holding the ``PRAEGENDE_JUGENDJAHRE`` and ``CAMEO_INTL_2015``
            columns; it is modified in place.

    Returns:
        The same ``df`` with the four columns added.
    """
    decades, movements = [], []
    collector = Cleaner.remap_jugendjahre(new_dec=decades, new_mov=movements)

    # The result of ``apply`` is discarded; only the filled lists are used.
    df['PRAEGENDE_JUGENDJAHRE'].apply(collector)
    df['decade'] = np.array(decades)
    df['movement'] = np.array(movements)

    for position, feature in enumerate(('wealth', 'life_stage')):
        df[feature] = df['CAMEO_INTL_2015'].str[position]

    return df
def test_recode(self):
    """After recoding, the old codes are gone and the new ones have known counts."""
    replacements = (
        ('ANREDE_KZ', 2, 0),
        ('OST_WEST_KZ', 'O', 0),
        ('OST_WEST_KZ', 'W', 1),
    )
    for column, old_value, new_value in replacements:
        Cleaner.recode(self.cleaner.azdias, column, old_value, new_value)

    def rows_equal_to(column, value):
        """Return how many rows of the demographics frame hold ``value`` in ``column``."""
        frame = self.cleaner.azdias
        return len(frame[frame[column] == value])

    # (column, value to look for, expected number of matching rows)
    expectations = (
        ('ANREDE_KZ', 2, 0),
        ('OST_WEST_KZ', 'O', 0),
        ('OST_WEST_KZ', 'W', 0),
        ('OST_WEST_KZ', 1, 629528),
        ('OST_WEST_KZ', 0, 168545),
    )
    # All selections are evaluated before the first assertion runs.
    observed = [rows_equal_to(column, value) for column, value, _ in expectations]
    for (_, _, expected), count in zip(expectations, observed):
        self.assertEqual(count, expected)
def __init__(self, *args, **kwargs):
    """Initialise the test case and attach a ``Cleaner`` built from ``data``.

    Keyword arguments are forwarded as well, so the base constructor can also be
    reached with e.g. ``methodName=...`` instead of only positionally.

    Args:
        *args: positional arguments for the parent constructor.
        **kwargs: keyword arguments for the parent constructor.
    """
    super().__init__(*args, **kwargs)
    self.cleaner = Cleaner(data_paths=data)
def test_remap(self):
    """Smoke test: applying the remap callable to the column must not raise."""
    youth_years = self.cleaner.azdias['PRAEGENDE_JUGENDJAHRE']
    remapper = Cleaner.remap_jugendjahre()
    youth_years.apply(remapper)
    print('')
def test_split(self):
    """The split must produce frames of the known shapes.

    The first frame is stored back on ``self.cleaner.azdias`` afterwards.
    """
    # Callers elsewhere unpack four values from ``split_by_treshhold``; a plain
    # two-name unpack would fail with "too many values to unpack". The starred
    # target takes the two frames and ignores any trailing extras.
    kept, removed, *_ = Cleaner.split_by_treshhold(self.cleaner.azdias)
    self.assertEqual(kept.shape, (798293, 85))
    self.assertEqual(removed.shape, (92928, 85))
    self.cleaner.azdias = kept
def recode(df):
    """Apply the fixed set of value replacements to ``df`` via ``Cleaner.recode``.

    Args:
        df: frame containing the ``ANREDE_KZ`` and ``OST_WEST_KZ`` columns.

    Returns:
        Whatever the last ``Cleaner.recode`` call returns.
    """
    # (column, value passed third, value passed fourth) for each Cleaner.recode call
    substitutions = (
        ('ANREDE_KZ', 2, 0),
        ('OST_WEST_KZ', 'O', 0),
        ('OST_WEST_KZ', 'W', 1),
    )
    for column, old_value, new_value in substitutions:
        df = Cleaner.recode(df, column, old_value, new_value)
    return df
from config import data from data_preparation import Cleaner import pandas as pd import numpy as np from visuals import plot_bars from sklearn.preprocessing import Imputer if __name__ == '__main__': cleaner = Cleaner(data_paths=data) def split_by_missingvalues(): before_cleaning = Cleaner.count_nan_columns(cleaner.azdias) cleaner.to_nan() after_cleaning = Cleaner.count_nan_columns(cleaner.azdias) l, g, s1, s2 = Cleaner.split_by_treshhold(cleaner.azdias) cleaner.azdias = l return before_cleaning, after_cleaning, l.shape, g.shape def recode(df): df = Cleaner.recode(df, 'ANREDE_KZ', 2, 0) df = Cleaner.recode(df, 'OST_WEST_KZ', 'O', 0) df = Cleaner.recode(df, 'OST_WEST_KZ', 'W', 1) return df def extract_features(df): new_dec = [] new_mov = [] to_apply = Cleaner.remap_jugendjahre(new_dec=new_dec, new_mov=new_mov) df['PRAEGENDE_JUGENDJAHRE'].apply(to_apply) df['decade'] = np.array(new_dec)
# NOTE(review): the statements up to the first ``plt.show()`` are the tail of a
# definition whose header lies outside this view (presumably ``plot_nans``, which
# the script section below calls with ``before``/``after``). They are unchanged.
x, y = zip(*before)
x1, y1 = zip(*after)
sns.barplot(x=list(x), y=list(y), ax=ax1)
sns.barplot(x=list(x1), y=list(y1), ax=ax2)
plt.setp(ax1.get_xticklabels(), rotation='vertical', fontsize=5)
plt.setp(ax2.get_xticklabels(), rotation='vertical', fontsize=5)
plt.show()


def plot_bar(points):
    """Show a bar plot built from two sequences.

    Args:
        points: indexable object; ``points[0]`` supplies the x values and
            ``points[1]`` the bar heights.
    """
    fig, ax = plt.subplots(figsize=(20, 8))
    # ``ax`` is rebound to the Axes returned by seaborn.
    ax = sns.barplot(x=list(points[0]), y=list(points[1]))
    plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=10)
    plt.show()


def plot_bars(dc):
    """Show a bar plot with the keys of ``dc`` on the x axis and its values as heights.

    Args:
        dc: mapping providing ``keys()`` and ``values()``.
    """
    fig, ax = plt.subplots(figsize=(20, 8))
    # ``ax`` is rebound to the Axes returned by seaborn.
    ax = sns.barplot(x=list(dc.keys()), y=list(dc.values()))
    plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=5)
    plt.show()


if __name__ == '__main__':
    # Manual check: plot per-column NaN counts before and after ``to_nan``, then
    # plot the post-cleaning counts as ordered by ``Cleaner.sort_by_values``.
    cleaner = Cleaner(data_paths=data)
    before_cleaning = Cleaner.count_nan_columns(cleaner._azdias)
    cleaner.to_nan()
    after_cleaning = Cleaner.count_nan_columns(cleaner._azdias)
    plot_nans(before=before_cleaning, after=after_cleaning)
    # NOTE(review): ``after_cleaning`` is passed to ``plot_nans`` and then read again
    # by ``dict()``; this presumably relies on it being re-iterable — confirm.
    sorted_yielder = Cleaner.sort_by_values(dict(after_cleaning))
    plot_bar(list(sorted_yielder))