def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Fits and applies a ``SimpleImputer`` on ``X`` twice, once on the dense
    input and once on a ``scipy.sparse.csc_matrix`` built from it, and
    checks that:

    - the statistics (mean, median, mode) are correct
    - the missing values are imputed correctly

    Parameters
    ----------
    X : array
        Dense input data containing the missing-value markers.
    X_true : array
        Expected result of ``transform``.
    strategy : str
        Imputation strategy forwarded to ``SimpleImputer``.
    statistics : array-like
        Expected value of ``imputer.statistics_`` after fitting.
    missing_values : object
        Missing-value placeholder forwarded to ``SimpleImputer``.
    """
    def _err_msg(is_sparse):
        # Render the message in a single formatting pass so that braces in
        # the rendered strategy / missing_values cannot be misinterpreted
        # as replacement fields by a later ``.format`` call.
        return ("Parameters: strategy = {!s}, missing_values = {!s}, "
                "sparse = {!s}".format(strategy, missing_values, is_sparse))

    # Exact comparison only when neither array holds floating point data.
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal
    else:
        assert_ae = assert_array_equal

    # Normal matrix
    # ``missing_values`` is passed by keyword, like everywhere else in this
    # file, so the call also works when the constructor is keyword-only.
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics, err_msg=_err_msg(False))
    assert_ae(X_trans, X_true, err_msg=_err_msg(False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    # The transformed output may still be sparse; densify before comparing.
    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics, err_msg=_err_msg(True))
    assert_ae(X_trans, X_true, err_msg=_err_msg(True))
def test_imputation_error_sparse_0(strategy):
    # With missing_values = 0, both fit and transform must reject sparse
    # input with a ValueError, while the dense equivalent is accepted by fit.
    expected_msg = "Provide a dense array"

    dense = np.ones((3, 5))
    dense[0] = 0
    X = sparse.csc_matrix(dense)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)

    # fitting on the sparse matrix is refused
    with pytest.raises(ValueError, match=expected_msg):
        imputer.fit(X)

    # fit on the dense version so that transform can be exercised
    imputer.fit(X.toarray())

    # transforming the sparse matrix is refused as well
    with pytest.raises(ValueError, match=expected_msg):
        imputer.transform(X)
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    # Imputing non-numeric data converted to the given dtype must fail with
    # a ValueError for the given strategy ("most_frequent" / "constant").
    rows = [
        [np.nan, np.nan, "a", "f"],
        [np.nan, "c", np.nan, "d"],
        [np.nan, "b", "d", np.nan],
        [np.nan, "c", "d", "h"],
    ]
    X = np.array(rows, dtype=dtype)

    with pytest.raises(ValueError,
                       match="SimpleImputer does not support data"):
        SimpleImputer(strategy=strategy).fit(X).transform(X)
def test_imputation_copy():
    # Check whether transform returns a copy of its input or works in place,
    # depending on the ``copy`` flag and on the input container.
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # --- copy=True, dense => copy
    # Writing into the output must leave the input untouched.
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # --- copy=True, sparse csr => copy
    X = X_orig.copy()
    marker = X.data[0]
    imputer = SimpleImputer(missing_values=marker, strategy="mean",
                            copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # --- copy=False, dense => no copy
    # Writing into the output must be visible through the input.
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # --- copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    marker = X.data[0]
    imputer = SimpleImputer(missing_values=marker, strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # --- copy=False, sparse csr => copy
    # Even with copy=False the output is expected to be a distinct object.
    X = X_orig.copy()
    marker = X.data[0]
    imputer = SimpleImputer(missing_values=marker, strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
def test_imputation_most_frequent_objects(marker):
    # Most-frequent imputation on an object array, with ``marker`` used as
    # the missing-value placeholder.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    # The expected output has one column fewer than the input: the first
    # column, which contains only ``marker``, does not appear in it.
    expected = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    result = SimpleImputer(missing_values=marker,
                           strategy="most_frequent").fit(X).transform(X)

    assert_array_equal(result, expected)