def test_chained_imputer_imputation_order(imputation_order):
    # Fit a ChainedImputer with the requested ``imputation_order`` and check
    # the feature indices recorded in ``imputation_sequence_``.
    random_state = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=random_state
    ).toarray()
    # Zeros are the missing marker below, so filling column 0 with ones
    # leaves it without missing entries; it should not be discarded by
    # ChainedImputer.
    X[:, 0] = 1

    imputer = ChainedImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        n_nearest_features=5,
        min_value=0,
        max_value=1,
        verbose=False,
        imputation_order=imputation_order,
        random_state=random_state,
    )
    imputer.fit_transform(X)

    visited = [step.feat_idx for step in imputer.imputation_sequence_]
    per_round = n_features - 1

    if imputation_order == 'roman':
        # First round walks features 1 .. d-1 in increasing order.
        assert np.all(visited[:per_round] == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        # First round walks features d-1 .. 1 in decreasing order.
        assert np.all(visited[:per_round] == np.arange(per_round, 0, -1))
    elif imputation_order == 'random':
        # The first d-1 entries must differ from the remaining ones.
        first_round = visited[:per_round]
        second_round = visited[per_round:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        # Only the total length is checked here; presumably this is one
        # burn-in round plus one imputation round over d-1 features.
        assert len(visited) == 2 * per_round
def test_chained_imputer_imputation_order(imputation_order):
    # Verify that ``imputation_sequence_`` reflects the ``imputation_order``
    # passed to ChainedImputer.
    rng = np.random.RandomState(0)
    n, d = 100, 10
    projection = sparse_random_matrix(n, d, density=0.10, random_state=rng)
    X = projection.toarray()
    X[:, 0] = 1  # this column should not be discarded by ChainedImputer

    imputer_params = dict(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        n_nearest_features=5,
        min_value=0,
        max_value=1,
        verbose=False,
        imputation_order=imputation_order,
        random_state=rng,
    )
    imputer = ChainedImputer(**imputer_params)
    imputer.fit_transform(X)

    sequence = imputer.imputation_sequence_
    ordered_idx = [triplet.feat_idx for triplet in sequence]
    # Number of entries checked per round (every feature except column 0).
    n_per_round = d - 1

    if imputation_order == 'roman':
        expected = np.arange(1, d)
        assert np.all(ordered_idx[:n_per_round] == expected)
    elif imputation_order == 'arabic':
        expected = np.arange(d - 1, 0, -1)
        assert np.all(ordered_idx[:n_per_round] == expected)
    elif imputation_order == 'random':
        # Two consecutive rounds are expected to use different orders.
        round_1, round_2 = ordered_idx[:n_per_round], ordered_idx[n_per_round:]
        assert round_1 != round_2
    elif 'ending' in imputation_order:
        # For these orders only the sequence length is asserted.
        assert len(ordered_idx) == 2 * n_per_round
def get_results_multiple_imputation_approach2(X_train, X_test, y_train,
                                              y_test):
    """Score a pipeline whose predictions are averaged over 5 imputations.

    For each seed in ``range(5)`` a fresh ``ChainedImputer`` is fitted on
    the training data and applied to the test data, the result is
    standardized, and a ``LinearRegression`` is fitted to produce test
    predictions. The predictions are averaged element-wise and the
    averaged prediction is scored against ``y_test`` with ``mse``.

    Returns
    -------
    The value returned by ``mse(y_test, averaged_predictions)``.
    """
    n_repeats = 5

    def predict_with_seed(seed):
        # The imputer is fitted on the train split only and then reused,
        # unchanged, on the test split.
        imputer = ChainedImputer(n_burn_in=99, n_imputations=1,
                                 random_state=seed)
        train_imputed = imputer.fit_transform(X_train)
        test_imputed = imputer.transform(X_test)

        # Any preprocessing (here: standardization) happens after
        # imputation and follows the same fit-on-train rule.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_imputed)
        test_scaled = scaler.transform(test_imputed)

        model = LinearRegression()
        model.fit(train_scaled, y_train)
        return model.predict(test_scaled)

    # One prediction vector per seed, in seed order.
    all_predictions = [predict_with_seed(seed) for seed in range(n_repeats)]

    # Pool by averaging the predictions first, then compute the error once.
    averaged_predictions = np.mean(all_predictions, axis=0)
    return mse(y_test, averaged_predictions)
def test_chained_imputer_no_missing():
    # Only the first column contains NaN (and it is entirely NaN); the
    # imputer output is compared against the remaining columns, and
    # ``fit().transform()`` is compared against ``fit_transform()``.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan

    two_step_imputer = ChainedImputer(n_imputations=10, random_state=rng)
    one_step_imputer = ChainedImputer(n_imputations=10, random_state=rng)

    fitted = two_step_imputer.fit(X)
    two_step_result = fitted.transform(X)
    one_step_result = one_step_imputer.fit_transform(X)

    # should exclude the first column entirely
    expected = X[:, 1:]
    assert_allclose(expected, two_step_result)
    # fit and fit_transform should both be identical
    assert_allclose(two_step_result, one_step_result)
def test_chained_imputer_predictors(predictor):
    # Fit a ChainedImputer with the given ``predictor`` and inspect the
    # predictor stored on each entry of ``imputation_sequence_``.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    imputer = ChainedImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        predictor=predictor,
        random_state=rng,
    )
    imputer.fit_transform(X)

    # Every step must carry a truthy predictor; collect object identities
    # so that sharing of one instance between steps can be detected.
    predictor_ids = []
    for step in imputer.imputation_sequence_:
        step_predictor = step.predictor
        assert step_predictor
        predictor_ids.append(id(step_predictor))

    # check that each predictor is unique
    n_distinct = len(set(predictor_ids))
    assert n_distinct == len(predictor_ids)
def test_imputation_shape():
    # Check that imputation keeps the (10, 2) shape of the input for every
    # strategy, for SimpleImputer (sparse and dense input) and for
    # ChainedImputer (dense input, strategy used as ``initial_strategy``).
    X = np.random.randn(10, 2)
    X[::2] = np.nan  # every other row is entirely missing
    expected_shape = (10, 2)
    strategies = ('mean', 'median', 'most_frequent', "constant")

    for strategy in strategies:
        simple_imputer = SimpleImputer(strategy=strategy)

        from_sparse = simple_imputer.fit_transform(sparse.csr_matrix(X))
        assert from_sparse.shape == expected_shape

        from_dense = simple_imputer.fit_transform(X)
        assert from_dense.shape == expected_shape

        chained = ChainedImputer(initial_strategy=strategy)
        from_chained = chained.fit_transform(X)
        assert from_chained.shape == expected_shape
def test_chained_imputer_rank_one():
    # Build a matrix as the product of a column vector and a row vector,
    # replace the entries selected by a random mask with NaN, and check
    # that the imputer restores the full matrix within atol=0.001.
    rng = np.random.RandomState(0)
    size = 100
    column = rng.rand(size, 1)
    row = rng.rand(1, size)
    X_full = np.dot(column, row)

    # Entries whose uniform draw falls below 0.5 are marked as missing.
    missing = rng.rand(size, size) < 0.5
    X_with_nan = X_full.copy()
    X_with_nan[missing] = np.nan

    imputer = ChainedImputer(
        n_imputations=5,
        n_burn_in=5,
        verbose=True,
        random_state=rng,
    )
    X_restored = imputer.fit_transform(X_with_nan)
    assert_allclose(X_restored, X_full, atol=0.001)
def test_chained_imputer_clip():
    # With min_value=0.1 and max_value=0.2, the imputed entries must reach
    # exactly those bounds at their extremes, and the non-missing entries
    # must come back unchanged.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    lower, upper = 0.1, 0.2
    imputer = ChainedImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        min_value=lower,
        max_value=upper,
        random_state=rng,
    )
    Xt = imputer.fit_transform(X)

    # Zeros are the missing marker, so this mask selects the imputed cells.
    was_missing = X == 0
    imputed_values = Xt[was_missing]
    assert_allclose(np.min(imputed_values), lower)
    assert_allclose(np.max(imputed_values), upper)

    was_observed = ~was_missing
    assert_allclose(Xt[was_observed], X[was_observed])
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    """Score a single-imputation pipeline on the test split.

    The steps are: impute with one ``ChainedImputer`` (seed 0),
    standardize with ``StandardScaler``, fit a ``LinearRegression`` and
    predict on the test data. Both transformers are fitted on the
    training data only and then applied to the test data.

    Returns
    -------
    The value returned by ``mse(y_test, predictions)``.
    """
    # Imputation
    imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0)
    train_imputed = imputer.fit_transform(X_train)
    test_imputed = imputer.transform(X_test)

    # Standardization
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_imputed)
    test_scaled = scaler.transform(test_imputed)

    # Estimation and prediction
    model = LinearRegression()
    model.fit(train_scaled, y_train)
    predictions = model.predict(test_scaled)

    return mse(y_test, predictions)