# Imports needed by the tests below. MICEImputer shipped in the scikit-learn
# 0.20 dev branch (sklearn.impute); it was renamed IterativeImputer before
# the final 0.20 release.
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import sparse

from sklearn.impute import MICEImputer, SimpleImputer
from sklearn.random_projection import sparse_random_matrix


def test_mice_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter of the entries are randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split the data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = MICEImputer(n_imputations=25,
                          n_burn_in=10,
                          verbose=True,
                          random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.01)
# Parametrization assumed (the decorator was lost in extraction); these are
# the orders the branches below check.
@pytest.mark.parametrize(
    "imputation_order",
    ['random', 'roman', 'ascending', 'descending', 'arabic']
)
def test_mice_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by MICEImputer
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          n_nearest_features=5,
                          min_value=0,
                          max_value=1,
                          verbose=False,
                          imputation_order=imputation_order,
                          random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * (d - 1)
def test_mice_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan  # a fully-missing column, so nothing is imputable there
    m1 = MICEImputer(n_imputations=10, random_state=rng)
    m2 = MICEImputer(n_imputations=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # the fully-missing first column should be excluded entirely
    assert_allclose(X[:, 1:], pred1)
    # fit(X).transform(X) and fit_transform(X) should give identical results
    assert_allclose(pred1, pred2)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', 'constant']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        mice_imputer = MICEImputer(initial_strategy=strategy)
        X_imputed = mice_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
def test_mice_rank_one():
    rng = np.random.RandomState(0)
    d = 100
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = MICEImputer(n_imputations=5,
                          n_burn_in=5,
                          verbose=True,
                          random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.001)
def test_mice_transform_stochasticity():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=rng)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))
def test_mice_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          min_value=0.1,
                          max_value=0.2,
                          random_state=rng)
    Xt = imputer.fit_transform(X)

    # imputed values are clipped into [min_value, max_value];
    # observed values pass through unchanged
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
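# For reference, the basic pattern the tests above exercise: a minimal,
# self-contained usage sketch against the 0.20-dev API (the data and
# parameter values here are chosen for illustration only):
X_demo = np.array([[1.0, 2.0, np.nan],
                   [3.0, np.nan, 6.0],
                   [np.nan, 8.0, 9.0],
                   [4.0, 5.0, 6.0]])
demo_imputer = MICEImputer(n_imputations=5, n_burn_in=2, random_state=0)
X_demo_imputed = demo_imputer.fit_transform(X_demo)  # no NaNs remain
print(X_demo_imputed.shape)  # (4, 3)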
# Additional imports used by this snippet, plus the module-level RNG it
# relies on (assumed from context):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)  # module-level RNG used inside get_results


def get_results(dataset):
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values to 75% of the rows
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples, dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = Pipeline([
        ("imputer", SimpleImputer(missing_values=0, strategy="mean")),
        ("forest", RandomForestRegressor(random_state=0, n_estimators=100))
    ])
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (MICE strategy) of the missing values
    estimator = Pipeline([
        ("imputer", MICEImputer(missing_values=0, random_state=0)),
        ("forest", RandomForestRegressor(random_state=0, n_estimators=100))
    ])
    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (mice_impute_scores.mean(), mice_impute_scores.std()))
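# get_results returns four (mean, std) pairs of negative-MSE cross-validation
# scores. A minimal driver (a sketch; load_diabetes is assumed here as the
# dataset, not dictated by the function itself):
from sklearn.datasets import load_diabetes

results = np.array(get_results(load_diabetes()))
mses = -results[:, 0]  # flip sign: neg_mean_squared_error -> MSE
stds = results[:, 1]
for label, mse, std in zip(['full data', 'zero imputation',
                            'mean imputation', 'MICE imputation'],
                           mses, stds):
    print('%-16s MSE = %.2f (+/- %.2f)' % (label, mse, std))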
# Parametrization assumed: the initial strategies shared by SimpleImputer
# and MICEImputer's initial_strategy.
@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"])
def test_mice_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in the 0th column
    X_test[0, 0] = 0   # definitely a missing value in the 0th column

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       initial_strategy=strategy,
                       random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at fit time, MICE only uses the
    # initial imputer for that feature at transform time
    assert np.all(mice.transform(X_test)[:, 0] ==
                  initial_imputer.transform(X_test)[:, 0])
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import BayesianRidge


# Parametrization assumed (the decorator was lost in extraction); regressor
# instances such as these fit the body below. BayesianRidge is MICEImputer's
# default predictor.
@pytest.mark.parametrize("predictor", [DummyRegressor(), BayesianRidge()])
def test_mice_predictors(predictor):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          predictor=predictor,
                          random_state=rng)
    imputer.fit_transform(X)

    # check that each round-robin step recorded a fitted predictor
    hashes = []
    for triplet in imputer.imputation_sequence_:
        assert triplet.predictor
        hashes.append(id(triplet.predictor))

    # check that each predictor instance is unique, i.e. cloned per step
    assert len(set(hashes)) == len(hashes)
# Parametrization assumed; low ranks such as these keep the completion
# problem within the test's tolerance.
@pytest.mark.parametrize("rank", [3, 5])
def test_mice_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)

    # half of the entries are randomly missing
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split the data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = MICEImputer(n_imputations=10,
                          n_burn_in=10,
                          verbose=True,
                          random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
def make_impute_pipeline():
    # DataFrameSelector, CustomImputer, SelectiveAction, ColumnSelector and
    # inverse_func are project-specific helpers, not scikit-learn built-ins;
    # CategoricalEncoder existed only in the 0.20 dev branch.
    categorical_cols = CATEGORICAL_COLS
    numerical_cols = NUMERICAL_COLS

    categorical_pre = Pipeline([
        ('selector', DataFrameSelector(categorical_cols)),
        ('impute', CustomImputer(strategy='mode')),
    ])
    categorical_pipeline = Pipeline([
        ('categorical_pre', categorical_pre),
        ('encoder', CategoricalEncoder(encoding='onehot-dense')),
    ])

    num_init_quantile_transformer = QuantileTransformer(
        output_distribution='normal')
    numerical_pipeline = Pipeline([
        ('selector', DataFrameSelector(numerical_cols)),
        ('scale', num_init_quantile_transformer),
    ])

    combined_features = FeatureUnion([
        ('numerical_pipeline', numerical_pipeline),
        ('cat_ordinal_pipeline', categorical_pipeline),
    ])
    mice_pipeline = Pipeline([
        ('combined_features', combined_features),
        ('mice_impute', MICEImputer(verbose=True)),
    ])

    impute_pipeline = Pipeline([
        ('mice_pipeline', mice_pipeline),
        ('inverse_qt', SelectiveAction(
            col=list(range(len(numerical_cols))),
            action=FunctionTransformer(
                inverse_func,
                kw_args={'transformer': num_init_quantile_transformer}))),
        ('numerical_selection', ColumnSelector(range(len(numerical_cols))))
    ])

    final_pipeline = FeatureUnion([('impute_pipeline', impute_pipeline),
                                   ('categorical_pre', categorical_pre)])
    return final_pipeline
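# Hypothetical usage, assuming `df` is a pandas DataFrame containing both
# CATEGORICAL_COLS and NUMERICAL_COLS:
impute_features = make_impute_pipeline()
X_features = impute_features.fit_transform(df)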
print(train_use.shape)
print(test_use.shape)


# ## <a id='4'>4. Preprocessing</a>

# ### Step 4: preprocessing: impute missing values, normalization, etc.
#
# Typically, in a real-world project, we need to deal with NAs and do some
# transformation before modeling.
#
# Here, we only need to impute the missing values in *Age* and *Fare*. We use
# [MICEImputer](http://scikit-learn.org/dev/modules/generated/sklearn.impute.MICEImputer.html)
# from sklearn.

# In[15]:

train_use[train_use.columns.tolist()] = MICEImputer(
    initial_strategy='median', n_imputations=50, n_nearest_features=20,
    verbose=False).fit_transform(train_use)
test_use[test_use.columns.tolist()] = MICEImputer(
    initial_strategy='median', n_imputations=50, n_nearest_features=20,
    verbose=False).fit_transform(test_use)


# ## <a id='5'>5. Final Model and Prediction</a>

# ### Train Model and Tune Parameters

# In[17]:

X = train_use.iloc[:, 2:]
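# Note on the imputation in In[15] above: fitting a separate MICEImputer on
# test_use imputes the test set with a different model than the training set.
# A common alternative (a sketch, not the notebook's original code) is to fit
# once on the training data and reuse the fitted imputer:
imputer = MICEImputer(initial_strategy='median', n_imputations=50,
                      n_nearest_features=20, verbose=False)
train_use[train_use.columns.tolist()] = imputer.fit_transform(train_use)
test_use[test_use.columns.tolist()] = imputer.transform(test_use)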
# The first line below is the tail of an inverse-transform helper; the
# enclosing definition (reconstructed here) was truncated in extraction.
def inverse_func(X):
    return num_init_quantile_transformer.inverse_transform(X)


numerical_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(numerical_cols)),
    ('scale', num_init_quantile_transformer),
])

combined_features = FeatureUnion([
    ('numerical_pipeline', numerical_pipeline),
    ('cat_nominal_pipeline', cat_nominal_pipeline),
    ('cat_ordinal_pipeline', cat_ordinal_pipeline),
])

mice_pipeline = Pipeline([
    ('combined_features', combined_features),
    ('mice_impute', MICEImputer()),
    ('reverse_quantile_transform',
     tf.SelectiveAction(col=list(range(len(numerical_cols))),
                        action=FunctionTransformer()))
])

feature_transform_pipeline = Pipeline([
    ('mice_pipeline', mice_pipeline),
    ('inverse_qt',
     tf.SelectiveAction(col=list(range(len(numerical_cols))),
                        action=FunctionTransformer(inverse_func))),
    ('feature_scaling',
     tf.SelectiveAction(col=(range(len(numerical_cols))),
                        action=QuantileTransformer(
                            output_distribution='normal'))),
    ('feature_selection', None),
    ('model', RandomForestClassifier())
])
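# Hypothetical end-to-end usage of the pipeline above, assuming X_train and
# X_test are DataFrames carrying the columns the tf.* selectors expect:
feature_transform_pipeline.fit(X_train, y_train)
predictions = feature_transform_pipeline.predict(X_test)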