def load_complete_data(self):
    """
    Combine the train and test frames into a single frame.

    Used to create the estimators that fill missing values. The train
    frame is passed through ``DataFrameManipulation.drop_columns`` with
    ``[self.actl_lbl]`` before the two frames are concatenated.

    :return: ``pd.concat`` of the processed train frame followed by the
        test frame
    """
    train_df = self.load_train_data()
    test_df = self.load_test_data()
    # The manipulator wraps only the train frame; the test frame is
    # concatenated as loaded.
    train_df = DataFrameManipulation(train_df).drop_columns([self.actl_lbl])
    frames = [train_df, test_df]
    return pd.concat(frames)
示例#2
0
def prep_data(df, common):
    """
    Map values, drop identifier-style columns and fill missing values.

    :param df: input DataFrame; an ``Age`` column is read and cast to int
    :param common: object whose ``map_dict`` attribute is expanded into
        keyword arguments for ``map_columns_values``
    :return: the processed DataFrame
    """
    frame_tool = DataFrameManipulation(df)
    df = frame_tool.map_columns_values(**common.map_dict)
    df = frame_tool.drop_columns(["Name", "Ticket", "Cabin", "PassengerId"], df)

    # Each remaining column has its missing values replaced by that
    # column's own median.
    for label in list(df):
        column = df[label]
        df[label] = column.fillna(column.median())

    df.Age = df.Age.astype(int)
    return df
    def test_apply_those_functions2(self):
        """Pass a list of functions for a single column to apply_those_functions."""

        def upper_second_char(s):
            return "{}{}{}".format(s[0], str(s[1]).upper(), s[2:])

        def concat_len(s):
            return str(s) + str(len(s))

        func_list = [upper_second_char, concat_len]
        colman = DataFrameManipulation(self.iris_df)
        _df = colman.map_columns_values(Target={0: "Setosa", 1: "Versicolour", 2: "Virginica"})
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn("Setosa", _df.Target.unique())

        # Upper-casing the second character and appending the length turns
        # "Setosa" into "SEtosa6".
        _df = colman.apply_those_functions(Target=func_list)
        self.assertIn("SEtosa6", _df.Target.unique())
    def test_apply_those_functions1(self):
        """Apply one function to the Target column and another to sepal_length_cm."""

        def upper_second_char(s):
            return "{}{}{}".format(s[0], str(s[1]).upper(), s[2:])

        def is_big(n):
            return n > 6

        colman = DataFrameManipulation(self.iris_df)
        _df = colman.map_columns_values(Target={0: "Setosa", 1: "Versicolour", 2: "Virginica"})
        # assertIn / assertEqual report the compared values on failure,
        # unlike assertTrue wrapped around a boolean expression.
        self.assertIn("Setosa", _df.Target.unique())

        _df = colman.apply_those_functions(Target=upper_second_char, sepal_length_cm=is_big)
        self.assertIn("SEtosa", _df.Target.unique())

        # is_big must have produced both outcomes across the column.
        self.assertEqual({True, False}, set(_df.sepal_length_cm.unique()))
示例#5
0
def create_age_estimator(common):
    """
    Train a LinearRegression for ``Age`` and compare it with a median fill.

    Steps, in order:

    1. Load the complete frame from ``common``, map its values with
       ``common.map_dict`` and pass it through ``add_columns``.
    2. Write that frame to ``create_age_estimator.csv``.
    3. Drop the Name/Ticket/Cabin/PassengerId columns and fill every
       column except ``Age`` with its own median.
    4. Hand the rows that have an ``Age`` to ``LinearModelUtils`` with a
       ``LinearRegression``, then plot and print the RMSE of the
       predictions and of a constant median-age baseline.
    5. Store ``mu.model`` on ``common.age_estimator`` and terminate the
       process.

    :param common: project object providing ``load_complete_data()``,
        ``map_dict`` and receiving the ``age_estimator`` attribute
    :raises SystemExit: always, once the estimator has been stored
    """
    df = common.load_complete_data()
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**common.map_dict)
    df = add_columns(df, common, manipulator)
    # Written before any column is dropped.
    df.to_csv("create_age_estimator.csv")

    drop_list = ["Name", "Ticket", "Cabin", "PassengerId"]
    df = manipulator.drop_columns(drop_list, df)
    # Age is the regression target, so it is excluded from the median fill.
    col_lst = list(df)
    col_lst.remove('Age')
    for col_name in col_lst:
        df[col_name] = df[col_name].fillna(df[col_name].median())

    # Only rows with a known Age (and no other missing value) are used.
    df_with_age = df[(df.Age.notnull())]
    # df_with_age = df_with_age[df_with_age.Age < 40]
    df_with_age = df_with_age.dropna(how='any', axis=0)

    lm = LinearRegression()
    mu = LinearModelUtils(df=df_with_age,
                          lm=lm,
                          predicted_lbl='PredAge',
                          actual_lbl='Age')
    mu.split_and_train()
    results_df = mu.test_model()
    evp = EvaluationPlots(df=results_df,
                          actual_lbl=mu.actual_lbl,
                          predicted_lbl=mu.predicted_lbl)
    evp.plot_predicted_vs_actual(title="LinearRegression as fillna")
    print("LinearRegression rmse={}".format(
        mu.rmse(results_df.PredAge, results_df.Age)))
    plt.show()

    # Baseline: every row is "predicted" as the median Age of the frame.
    results_df["med_age"] = df.Age.median()
    evp = EvaluationPlots(df=results_df,
                          actual_lbl=mu.actual_lbl,
                          predicted_lbl="med_age")
    evp.plot_predicted_vs_actual(title="median as fillna")
    print("med_age rmse={}".format(
        mu.rmse(results_df["med_age"], results_df.Age)))
    plt.show()
    common.age_estimator = mu.model
    # NOTE(review): the process is terminated right after the estimator is
    # stored, so no caller can ever use it. This looks like a leftover
    # debugging stop; confirm before removing. `exit()` is an interactive
    # helper installed by the site module, so SystemExit is raised directly.
    raise SystemExit
    def test_split_columns_into_columns(self):
        """Split the Target column on underscores into separately named columns."""

        def under_split(s):
            return str(s).split('_')

        colman = DataFrameManipulation(self.iris_df)
        _df = colman.map_columns_values(Target={0: "s_e_t_o_s_a", 1: "v_e_r_s_i_colour", 2: "V_i_rginica"})
        self.assertIn("s_e_t_o_s_a", _df.Target.unique())

        col_names = list("abcdefghyjk")
        splited_df = colman.split_columns_into_columns(col_to_split="Target", new_columns_names_lst=col_names,
                                                       split_func=under_split)
        # assertEqual replaces the assertEquals alias, which was removed in
        # Python 3.12.
        self.assertEqual(_df.shape, splited_df.shape)
        b_lst = splited_df[splited_df.Target == 's_e_t_o_s_a'].b.unique()
        f_lst = splited_df[splited_df.Target == 'v_e_r_s_i_colour'].f.unique()

        # Second piece of "s_e_t_o_s_a" and sixth piece of "v_e_r_s_i_colour".
        self.assertEqual(b_lst[0], 'e')
        self.assertEqual(f_lst[0], 'colour')

        # NOTE(review): nothing in this test visibly converts sepal_length_cm
        # to booleans; confirm this assertion is intended here.
        self.assertEqual({True, False}, set(_df.sepal_length_cm.unique()))
示例#7
0
def prep_data(df):
    """
    Map categorical values, fill missing values and normalise Age/Fare.

    Every column is filled with its most frequent value (``mode()[0]``),
    a ``number_of_cabins`` column is derived from ``Cabin``, ``Age`` is
    converted to an int of at least 1 and ``Fare`` is converted with
    ``int``.

    :param df: input DataFrame; the ``Cabin``, ``Age`` and ``Fare`` columns
        are read here, and ``Embarked`` / ``Survived`` are named in the
        mapping passed to ``map_columns_values``
    :return: the processed DataFrame
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    map_dict = {
        "Embarked": {
            'S': 'S',
            'C': 'C',
            'Q': 'Q',
            np.nan: 'Nan_str'
        },
        "Survived": {
            0: 'No',
            1: 'Yes',
            3: "Unknown"
        }
    }
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**map_dict)
    for col_name in list(df):
        df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

    def fix_age(age):
        # Truncate to int and clamp anything below 2 to 1.
        age = int(age)
        if age < 2:
            return 1
        return age

    def number_of_cabins(c):
        # A list-membership test only recognises NaN by object identity,
        # so missing values are detected explicitly instead.
        if c is None or (isinstance(c, float) and np.isnan(c)):
            return 0
        return len(c.split())

    df["number_of_cabins"] = df.Cabin.apply(number_of_cabins)

    df = manipulator.apply_those_functions(Age=fix_age, Fare=int)
    # Second mode fill, run after the column functions were applied.
    for col_name in list(df):
        df[col_name] = df[col_name].fillna(df[col_name].mode()[0])
    df.Age = df.Age.astype(int)

    return df
    def test_map_columns_values(self):
        """Mapping Target codes to names keeps the row counts per class."""
        iris_target = self.iris_df.Target
        count = iris_target.value_counts()
        # assertEqual replaces the assertEquals alias, which was removed in
        # Python 3.12.
        self.assertEqual(count.sum(), 150)
        self.assertEqual(count[0], 50)
        map_dict = {
            "Target": {
                0: "Setosa",
                1: "Versicolour",
                2: "Virginica"
            }
        }

        colman = DataFrameManipulation(self.iris_df)
        _df = colman.map_columns_values(**map_dict)
        target_uniq = sorted(_df.Target.unique())
        expected_values = sorted(map_dict["Target"].values())
        _target = _df.Target
        count = _target.value_counts()
        self.assertEqual(target_uniq, expected_values)
        self.assertEqual(count.sum(), 150)
        self.assertEqual(count.Setosa, 50)
示例#9
0
def prep_data(df, common):
    """
    Map values, add derived columns, drop identifier-style columns and
    median-fill every column except ``Age``.

    :param df: input DataFrame; it must contain an ``Age`` column
    :param common: object providing ``map_dict`` (expanded into keyword
        arguments for ``map_columns_values``); it is also passed on to
        ``add_columns``
    :return: the processed DataFrame, with ``Age`` left unfilled
    """
    manipulator = DataFrameManipulation(df)
    df = manipulator.map_columns_values(**common.map_dict)
    df = add_columns(df, common, manipulator)
    df = manipulator.drop_columns(["Name", "Ticket", "Cabin", "PassengerId"], df)

    # Age is excluded from the per-column median fill.
    fill_targets = list(df)
    fill_targets.remove('Age')
    for label in fill_targets:
        column = df[label]
        df[label] = column.fillna(column.median())

    def age_huristic(value, df_row=None):
        # NOTE(review): currently unused, since the only call is commented
        # out below. A truthy value is returned unchanged; anything falsy
        # is replaced by the stored estimator's prediction for the row.
        return value if value else common.age_estimator.predict(df_row)

    # df= manipulator.fillna_by_heuristic(age_huristic,"Age",df)
    # df.Age = df.Age.astype(int)

    return df
 def test_drop_columns(self):
     """Columns passed to drop_columns must be absent from the result."""
     drop_list = ["CRIM", "INDUS"]
     colman = DataFrameManipulation(self.boston_df)
     df = colman.drop_columns(drop_list)
     remaining = list(df)
     # assertNotIn reports the offending name and the remaining columns on
     # failure, unlike assertTrue(x not in y).
     for name in drop_list:
         self.assertNotIn(name, remaining)