def test_impute_nans_for_categorical_columns_replaces_na_with_most_frequent_mode(
        self):
    # 'A' is the most frequent non-missing value in the input column, so
    # both NaN entries are expected to come back as 'A'.
    column = 'some_categorical_column'
    raw = pd.DataFrame({column: ['A', 'A', 'B', np.nan, 'A', np.nan]})
    wanted = pd.DataFrame({column: ['A', 'A', 'B', 'A', 'A', 'A']})

    imputed = impute_nans(raw, categorical_columns=[column])

    assert_frame_equal(wanted, imputed)
def test_impute_nans_for_continuous_columns_replaces_na_with_median(self):
    # median value: 20 (the non-missing inputs are 10, 20 and 30), so both
    # NaN entries are expected to come back as 20.
    column = 'some_continuous_column'
    raw = pd.DataFrame({column: [10, 20, np.nan, np.nan, 30]})
    wanted = pd.DataFrame({column: [10, 20, 20, 20, 30]})

    imputed = impute_nans(raw, continuous_columns=[column])

    # check_dtype=False: only the values are compared, not the column dtypes
    # (the expected frame is built from plain ints).
    assert_frame_equal(wanted, imputed, check_dtype=False)
def prepare_data_and_train_model():
    """Load the training CSV, engineer features and fit a random forest.

    Reads ``./data/train.csv``, imputes missing values in ``Embarked``
    (categorical) and ``Fare``/``Age`` (continuous), applies the
    ``add_derived_title``, ``add_is_alone_column`` and
    ``add_categorical_columns`` steps in that order, drops the columns that
    are not used as features, and splits the result into train and test
    portions with ``train_test_split`` (no ``random_state`` is passed).

    Returns:
        A ``(rf_model, X_test, Y_test)`` tuple: the model returned by
        ``train_model`` for ``RandomForestClassifier`` with
        ``n_estimators=100``, plus the held-out features and ``Survived``
        labels.
    """
    frame = pd.read_csv("./data/train.csv")
    frame = impute_nans(
        frame,
        categorical_columns=['Embarked'],
        continuous_columns=['Fare', 'Age'],
    )

    # Feature-engineering steps; each takes the frame and returns a new one.
    feature_steps = (
        add_derived_title,
        add_is_alone_column,
        add_categorical_columns,
    )
    for step in feature_steps:
        frame = step(frame)

    dropped_columns = ['Parch', 'SibSp', 'Name', 'PassengerId', 'Ticket',
                       'Cabin']
    frame = frame.drop(dropped_columns, axis=1)

    labels = frame["Survived"]
    features = frame.drop("Survived", axis=1)
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels)

    # train_model returns a second value (presumably a training accuracy --
    # confirm against train_model); it is not used here.
    rf_model, _unused_score = train_model(
        RandomForestClassifier, X_train, Y_train, n_estimators=100)
    return rf_model, X_test, Y_test