예제 #1
0
    def test_impute_nans_for_categorical_columns_replaces_na_with_most_frequent_mode(
        self,
    ):
        """NaNs in a categorical column are filled with its most frequent value."""
        column = 'some_categorical_column'
        # 'A' appears three times and 'B' once, so 'A' is the expected filler.
        frame_with_gaps = pd.DataFrame(
            {column: ['A', 'A', 'B', np.nan, 'A', np.nan]})
        expected = pd.DataFrame(
            {column: ['A', 'A', 'B', 'A', 'A', 'A']})

        actual = impute_nans(frame_with_gaps, categorical_columns=[column])

        assert_frame_equal(expected, actual)
예제 #2
0
    def test_impute_nans_for_continuous_columns_replaces_na_with_median(self):
        """NaNs in a continuous column are filled with the column median."""
        column = 'some_continuous_column'
        # The non-missing values are 10, 20 and 30, so the median is 20.
        frame_with_gaps = pd.DataFrame({column: [10, 20, np.nan, np.nan, 30]})
        expected = pd.DataFrame({column: [10, 20, 20, 20, 30]})

        actual = impute_nans(frame_with_gaps, continuous_columns=[column])

        # dtype checking is switched off: only the values are compared here.
        assert_frame_equal(expected, actual, check_dtype=False)
예제 #3
0
def prepare_data_and_train_model():
    """Load the training CSV, build features and fit a random forest.

    Returns:
        A tuple ``(rf_model, X_test, Y_test)``: the first value returned by
        ``train_model`` plus the test-side features and ``Survived`` labels
        produced by ``train_test_split``.
    """
    passengers = pd.read_csv("./data/train.csv")

    passengers = impute_nans(
        passengers,
        categorical_columns=['Embarked'],
        continuous_columns=['Fare', 'Age'],
    )

    # Each step takes the frame and returns the frame to use from then on.
    feature_steps = (
        add_derived_title,
        add_is_alone_column,
        add_categorical_columns,
    )
    for add_features in feature_steps:
        passengers = add_features(passengers)

    columns_to_discard = [
        'Parch', 'SibSp', 'Name', 'PassengerId', 'Ticket', 'Cabin',
    ]
    passengers = passengers.drop(columns_to_discard, axis=1)

    Y = passengers["Survived"]
    X = passengers.drop("Survived", axis=1)

    # NOTE(review): no seed is passed to the split, so it is presumably not
    # reproducible between runs -- confirm whether that is intended.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

    # train_model hands back two values; the second one is not used here.
    rf_model, _accuracy = train_model(
        RandomForestClassifier, X_train, Y_train, n_estimators=100)

    return rf_model, X_test, Y_test