def test_imputing_all_missing_values_categorical(missing):
    """Check that no missing value survives imputation of categorical data.

    ``missing`` is the placeholder value inserted into the data set; it is
    supplied by the caller (presumably a pytest parametrization that is not
    visible in this part of the file).
    """
    features = [
        [missing, "a", "a"],
        ["b", "c", missing],
        ["b", "a", "c"],
        ["c", "a", "a"],
    ]
    target = [1, 1, 0, 0]

    transformed, _ = Imputer(strat_cat="most_frequent").fit_transform(
        features, target
    )

    # Total count of missing cells over the whole transformed data set.
    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_all_missing_values_numeric(missing):
    """Check that no missing value survives imputation of numeric data.

    ``missing`` is the placeholder value inserted into the data set; it is
    supplied by the caller (presumably a pytest parametrization that is not
    visible in this part of the file).
    """
    features = [
        [missing, 1, 1],
        [2, 5, 2],
        [4, missing, 1],
        [2, 1, 1],
    ]
    target = [1, 1, 0, 0]

    mean_imputer = Imputer(strat_num="mean")
    # Register 99 as an extra missing-value marker before fitting.
    mean_imputer.missing.append(99)

    transformed, _ = mean_imputer.fit_transform(features, target)

    # Total count of missing cells over the whole transformed data set.
    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_cols_too_many_nans():
    """Check that columns holding too many missing values are removed.

    Five all-NaN columns are appended to a copy of ``X_bin``; after the
    transformation only the 30 expected columns should remain and no
    missing value should be left.
    """
    data = X_bin.copy()
    n_rows = data.shape[0]

    # Add 5 cols with all NaN values
    for i in range(5):
        column_name = "col " + str(i)
        data[column_name] = [np.nan] * n_rows

    imputer = Imputer(
        strat_num="mean",
        strat_cat="most_frequent",
        min_frac_cols=0.5,
    )
    transformed, _ = imputer.fit_transform(data, y_bin)

    assert len(transformed.columns) == 30  # Original number of columns
    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_rows_too_many_nans():
    """Check that rows holding too many missing values are removed.

    Five all-NaN rows are appended to a copy of ``X_bin``; after the
    transformation only the 569 expected rows should remain and no missing
    value should be left.
    """
    data = X_bin.copy()
    n_cols = data.shape[1]

    # Add 5 rows with all NaN values
    for _ in range(5):
        # len(data) grows by one per iteration, so each write appends a row.
        data.loc[len(data)] = [np.nan] * n_cols

    # Random binary target with one entry per (extended) row.
    target = [np.random.randint(2) for _ in range(len(data))]

    imputer = Imputer(
        strat_num="mean",
        strat_cat="most_frequent",
        min_frac_rows=0.5,
    )
    transformed, _ = imputer.fit_transform(data, target)

    assert len(transformed) == 569  # Original size
    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_non_numeric_most_frequent():
    """Check the most_frequent strategy on non-numerical columns.

    After transforming ``X10_sn`` the cell at row 0, column 2 must hold
    ``"d"`` and no missing value may remain.
    """
    transformed, _ = Imputer(strat_cat="most_frequent").fit_transform(X10_sn, y10)

    filled_cell = transformed.iloc[0, 2]
    assert filled_cell == "d"

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_non_numeric_drop():
    """Check the drop strategy on non-numerical columns.

    After transforming ``X10_sn`` exactly 9 rows must remain and none of
    them may contain a missing value.
    """
    transformed, _ = Imputer(strat_cat="drop").fit_transform(X10_sn, y10)

    n_rows = len(transformed)
    assert n_rows == 9

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_non_numeric_string():
    """Check that a fixed string can be imputed in non-numerical columns.

    After transforming ``X10_sn`` the cell at row 0, column 2 must hold the
    string passed as ``strat_cat`` and no missing value may remain.
    """
    transformed, _ = Imputer(strat_cat="missing").fit_transform(X10_sn, y10)

    filled_cell = transformed.iloc[0, 2]
    assert filled_cell == "missing"

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_numeric_most_frequent():
    """Check the most_frequent strategy on numerical columns.

    After transforming ``X10_nan`` the cell at row 0, column 0 must equal 3
    and no missing value may remain.
    """
    transformed, _ = Imputer(strat_num="most_frequent").fit_transform(X10_nan, y10)

    filled_cell = transformed.iloc[0, 0]
    assert filled_cell == 3

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_numeric_mean():
    """Check the mean strategy on numerical columns.

    After transforming ``X10_nan`` the cell at row 0, column 0 must be
    approximately 2.577778 and no missing value may remain.
    """
    transformed, _ = Imputer(strat_num="mean").fit_transform(X10_nan, y10)

    # Floating-point result, so compare with a tolerance.
    expected_mean = pytest.approx(2.577778, rel=1e-6, abs=1e-12)
    filled_cell = transformed.iloc[0, 0]
    assert filled_cell == expected_mean

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_numeric_number():
    """Check that a fixed number can be imputed in numerical columns.

    After transforming ``X10_nan`` the cell at row 0, column 0 must equal
    the number passed as ``strat_num`` and no missing value may remain.
    """
    transformed, _ = Imputer(strat_num=3.2).fit_transform(X10_nan, y10)

    filled_cell = transformed.iloc[0, 0]
    assert filled_cell == 3.2

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputing_numeric_drop():
    """Check the drop strategy on numerical columns.

    After transforming ``X10_nan`` exactly 8 rows must remain and none of
    them may contain a missing value.
    """
    transformed, _ = Imputer(strat_num="drop").fit_transform(X10_nan, y10)

    n_rows = len(transformed)
    assert n_rows == 8

    n_missing = transformed.isna().sum().sum()
    assert n_missing == 0
def test_imputer_is_fitted():
    """Check that transforming with an unfitted instance raises an error."""
    # Build the instance first so only the transform call is expected to raise.
    unfitted = Imputer()
    with pytest.raises(NotFittedError):
        unfitted.transform(X_bin, y_bin)
def test_invalid_min_frac_cols():
    """Check that fitting with an invalid min_frac_cols raises ValueError."""
    # Construction must succeed; the error is expected from fit().
    bad_imputer = Imputer(min_frac_cols=5.2)
    with pytest.raises(ValueError):
        bad_imputer.fit(X_bin, y_bin)
def test_strat_num_parameter():
    """Check that fitting with an invalid strat_num raises ValueError."""
    # Construction must succeed; the error is expected from fit().
    bad_imputer = Imputer(strat_num="invalid")
    with pytest.raises(ValueError):
        bad_imputer.fit(X_bin, y_bin)
def test_load_data_with_no_trainer():
    """Check that loading a non-trainer together with data raises TypeError.

    An ``Imputer`` is saved to disk and then handed to ``ATOMLoader`` along
    with a ``data`` argument, which is expected to be rejected.
    """
    Imputer().save(FILE_DIR + "imputer")

    # Prepare the arguments up front so only the loader call may raise.
    saved_path = FILE_DIR + "imputer"
    data = (X_bin,)
    with pytest.raises(TypeError):
        ATOMLoader(saved_path, data=data)