def load_complete_data(self):
    """Return the train and test data stacked into a single frame.

    The result is meant for building the estimators that fill missing
    values. The column named by ``self.actl_lbl`` is dropped from the
    training data before the two frames are concatenated.

    :return: the result of ``pd.concat`` over the label-free training
        frame followed by the test frame
    """
    train_df = self.load_train_data()
    test_df = self.load_test_data()
    train_without_label = DataFrameManipulation(train_df).drop_columns([self.actl_lbl])
    frames = [train_without_label, test_df]
    return pd.concat(frames)
def prep_data(df, common):
    """Prepare ``df`` for modelling.

    Applies the value mappings in ``common.map_dict``, drops the
    ``Name``/``Ticket``/``Cabin``/``PassengerId`` columns, fills missing
    values in every remaining column with that column's median, and casts
    ``Age`` to ``int``.

    :param df: frame to prepare
    :param common: object exposing ``map_dict``
    :return: the prepared frame
    """
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**common.map_dict)
    df = manipulator.drop_columns(["Name", "Ticket", "Cabin", "PassengerId"], df)

    # Snapshot the column labels first, then fill each column with its own median.
    for column in list(df.columns):
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)

    df["Age"] = df["Age"].astype(int)
    return df
def test_apply_those_functions2(self):
    """Pass a list of functions for the ``Target`` column.

    The numeric targets are first mapped to species names; after both
    helpers have been applied, ``"Setosa"`` is expected to appear as
    ``"SEtosa6"``.
    """

    def upper_second_char(s):
        return "{}{}{}".format(s[0], str(s[1]).upper(), s[2:])

    def concat_len(s):
        return str(s) + str(len(s))

    func_list = [upper_second_char, concat_len]
    colman = DataFrameManipulation(self.iris_df)
    _df = colman.map_columns_values(Target={0: "Setosa", 1: "Versicolour", 2: "Virginica"})
    # assertIn reports the actual unique values when the check fails.
    self.assertIn("Setosa", _df.Target.unique())

    _df = colman.apply_those_functions(Target=func_list)
    self.assertIn("SEtosa6", _df.Target.unique())
def test_apply_those_functions1(self):
    """Apply one function to ``Target`` and another to ``sepal_length_cm``.

    ``Target`` is expected to contain ``"SEtosa"`` afterwards, and
    ``sepal_length_cm`` is expected to hold exactly the values ``True``
    and ``False`` (the result of the ``n > 6`` predicate).
    """

    def upper_second_char(s):
        return "{}{}{}".format(s[0], str(s[1]).upper(), s[2:])

    def is_big(n):
        return n > 6

    colman = DataFrameManipulation(self.iris_df)
    _df = colman.map_columns_values(Target={0: "Setosa", 1: "Versicolour", 2: "Virginica"})
    self.assertIn("Setosa", _df.Target.unique())

    _df = colman.apply_those_functions(Target=upper_second_char, sepal_length_cm=is_big)
    self.assertIn("SEtosa", _df.Target.unique())
    # assertEqual on two sets shows the differing members on failure.
    self.assertEqual({True, False}, set(_df.sepal_length_cm.unique()))
def create_age_estimator(common):
    """Fit a linear model for ``Age`` and compare it with a median baseline.

    What the function does, in order:

    1. Loads the combined data via ``common.load_complete_data()``, applies
       ``common.map_dict`` and ``add_columns``, and writes the intermediate
       frame to ``create_age_estimator.csv``.
    2. Drops ``Name``/``Ticket``/``Cabin``/``PassengerId`` and fills every
       column except ``Age`` with its median.
    3. Trains a ``LinearRegression`` through ``LinearModelUtils`` on the
       rows that have an ``Age``, then plots and prints the RMSE of the
       model's predictions and of a constant median-age prediction.
    4. Stores the fitted model on ``common.age_estimator`` and terminates
       the process by raising ``SystemExit``.

    :param common: object exposing ``load_complete_data``, ``map_dict`` and
        a writable ``age_estimator`` attribute
    :raises SystemExit: always, once the plots have been shown
    """
    df = common.load_complete_data()
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**common.map_dict)
    df = add_columns(df, common, manipulator)
    df.to_csv("create_age_estimator.csv")

    drop_list = ["Name", "Ticket", "Cabin", "PassengerId"]
    df = manipulator.drop_columns(drop_list, df)

    # Age is the regression target, so it is the one column left unfilled.
    col_lst = list(df)
    col_lst.remove('Age')
    for col_name in col_lst:
        df[col_name] = df[col_name].fillna(df[col_name].median())

    # Train only on rows with a known Age and no other missing values.
    df_with_age = df[df.Age.notnull()]
    df_with_age = df_with_age.dropna(how='any', axis=0)

    lm = LinearRegression()
    mu = LinearModelUtils(df=df_with_age, lm=lm, predicted_lbl='PredAge', actual_lbl='Age')
    mu.split_and_train()
    results_df = mu.test_model()

    evp = EvaluationPlots(df=results_df, actual_lbl=mu.actual_lbl, predicted_lbl=mu.predicted_lbl)
    evp.plot_predicted_vs_actual(title="LinearRegression as fillna")
    print("LinearRegression rmse={}".format(
        mu.rmse(results_df.PredAge, results_df.Age)))
    plt.show()

    # Baseline: predict the overall median age for every row.
    results_df["med_age"] = df.Age.median()
    evp = EvaluationPlots(df=results_df, actual_lbl=mu.actual_lbl, predicted_lbl="med_age")
    evp.plot_predicted_vs_actual(title="median as fillna")
    print("med_age rmse={}".format(
        mu.rmse(results_df["med_age"], results_df.Age)))
    plt.show()

    common.age_estimator = mu.model
    # NOTE(review): the original ended with the site-provided ``exit()``,
    # which is intended for the interactive shell and is not guaranteed to
    # exist. Raising SystemExit keeps the same termination and exit status.
    # Ending the process right after storing the estimator looks like a
    # debugging leftover - confirm whether it should stay.
    raise SystemExit
def test_split_columns_into_columns(self):
    """Split ``Target`` on underscores into single-letter named columns.

    After the split, rows whose ``Target`` is ``"s_e_t_o_s_a"`` are
    expected to have ``b == "e"`` and rows whose ``Target`` is
    ``"v_e_r_s_i_colour"`` are expected to have ``f == "colour"``.
    """

    def under_split(s):
        return str(s).split('_')

    colman = DataFrameManipulation(self.iris_df)
    _df = colman.map_columns_values(Target={0: "s_e_t_o_s_a", 1: "v_e_r_s_i_colour", 2: "V_i_rginica"})
    self.assertIn("s_e_t_o_s_a", _df.Target.unique())

    col_names = list("abcdefghyjk")
    splited_df = colman.split_columns_into_columns(col_to_split="Target",
                                                   new_columns_names_lst=col_names,
                                                   split_func=under_split)
    # NOTE(review): equal shapes imply the split is reflected in ``_df`` as
    # well (presumably an in-place operation) - confirm.
    self.assertEqual(_df.shape, splited_df.shape)

    b_lst = splited_df[splited_df.Target == 's_e_t_o_s_a'].b.unique()
    f_lst = splited_df[splited_df.Target == 'v_e_r_s_i_colour'].f.unique()
    self.assertEqual(b_lst[0], 'e')
    self.assertEqual(f_lst[0], 'colour')
    # NOTE(review): nothing in this test transforms ``sepal_length_cm``;
    # this check only holds if the fixture column already contains booleans.
    # Confirm it is intended here.
    self.assertEqual({True, False}, set(_df.sepal_length_cm.unique()))
def prep_data(df):
    """Prepare ``df`` for modelling.

    Maps the ``Embarked`` and ``Survived`` values, fills missing values in
    every column with that column's mode, adds a ``number_of_cabins``
    column derived from ``Cabin``, converts ``Age`` (floored at 1) and
    ``Fare`` to integers, and returns the resulting frame.

    :param df: frame with at least ``Embarked``, ``Survived``, ``Age``,
        ``Fare`` and ``Cabin`` columns
    :return: the prepared frame
    """
    map_dict = {
        "Embarked": {
            'S': 'S',
            'C': 'C',
            'Q': 'Q',
            # ``np.nan`` replaces the ``np.NaN`` alias removed in NumPy 2.0.
            np.nan: 'Nan_str',
        },
        "Survived": {
            0: 'No',
            1: 'Yes',
            3: "Unknown",
        },
    }
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**map_dict)

    for col_name in list(df):
        df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

    def fix_age(age):
        # Truncate to int and treat anything below 2 as 1.
        return max(int(age), 1)

    def number_of_cabins(c):
        # A list-membership test only matches NaN by object identity, so
        # missing values are detected explicitly instead.
        if c is None or (isinstance(c, float) and np.isnan(c)):
            return 0
        return len(c.split())

    df["number_of_cabins"] = df.Cabin.apply(number_of_cabins)
    df = manipulator.apply_those_functions(Age=fix_age, Fare=int)

    # Second fill pass, covering the new column and the transformed ones.
    for col_name in list(df):
        df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

    df.Age = df.Age.astype(int)
    return df
def test_map_columns_values(self):
    """Map the numeric iris targets to species names and check the counts.

    Before mapping, the ``Target`` column is expected to hold 150 rows
    with 50 of class ``0``; after mapping, the unique values must equal
    the mapped names and ``Setosa`` must still account for 50 rows.
    """
    iris_target = self.iris_df.Target
    count = iris_target.value_counts()
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual(count.sum(), 150)
    self.assertEqual(count[0], 50)

    map_dict = {
        "Target": {
            0: "Setosa",
            1: "Versicolour",
            2: "Virginica",
        }
    }
    colman = DataFrameManipulation(self.iris_df)
    _df = colman.map_columns_values(**map_dict)

    target_uniq = sorted(_df.Target.unique())
    expected_values = sorted(map_dict["Target"].values())
    _target = _df.Target
    count = _target.value_counts()
    self.assertEqual(target_uniq, expected_values)
    self.assertEqual(count.sum(), 150)
    self.assertEqual(count.Setosa, 50)
def prep_data(df, common):
    """Prepare ``df`` for modelling, leaving ``Age`` unfilled.

    Applies ``common.map_dict``, adds the derived columns via
    ``add_columns``, drops ``Name``/``Ticket``/``Cabin``/``PassengerId``
    and fills every column except ``Age`` with its median.

    :param df: frame to prepare
    :param common: object exposing ``map_dict`` and ``age_estimator``
    :return: the prepared frame
    """
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**common.map_dict)
    df = add_columns(df, common, manipulator)
    df = manipulator.drop_columns(["Name", "Ticket", "Cabin", "PassengerId"], df)

    columns_to_fill = list(df)
    columns_to_fill.remove('Age')
    for column in columns_to_fill:
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)

    def age_huristic(value, df_row=None):
        # Currently unused: the calls that would apply it are disabled below.
        # NOTE(review): a float NaN is truthy, so a missing age would be
        # returned unchanged rather than predicted - confirm before enabling.
        return value if value else common.age_estimator.predict(df_row)

    # df= manipulator.fillna_by_heuristic(age_huristic,"Age",df)
    # df.Age = df.Age.astype(int)
    return df
def test_drop_columns(self):
    """Dropping ``CRIM`` and ``INDUS`` removes both from the column list."""
    drop_list = ["CRIM", "INDUS"]
    colman = DataFrameManipulation(self.boston_df)
    df = colman.drop_columns(drop_list)

    remaining_columns = list(df)
    # assertNotIn reports the remaining columns when the check fails.
    self.assertNotIn("CRIM", remaining_columns)
    self.assertNotIn("INDUS", remaining_columns)