Пример #1
0
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10, verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
Пример #2
0
def test_iterative_imputer_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    max_iter = 2
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]

    assert (len(ordered_idx) //
            imputer.n_iter_ == imputer.n_features_with_missing_)

    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)
Пример #3
0
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-2,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
Пример #4
0
def test_iterative_imputer_truncated_normal_posterior():
    #  test that the values that are imputed using `sample_posterior=True`
    #  with boundaries (`min_value` and `max_value` are not None) are drawn
    #  from a distribution that looks gaussian via the Kolmogorov Smirnov test.
    #  note that starting from the wrong random seed will make this test fail
    #  because random sampling doesn't occur at all when the imputation
    #  is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does appear to be normal"
Пример #5
0
def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
Пример #6
0
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 50
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.02)
Пример #7
0
def test_imputation_shape(strategy):
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    imputer = SimpleImputer(strategy=strategy)
    X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
    assert X_imputed.shape == (10, 2)
    X_imputed = imputer.fit_transform(X)
    assert X_imputed.shape == (10, 2)

    iterative_imputer = IterativeImputer(initial_strategy=strategy)
    X_imputed = iterative_imputer.fit_transform(X)
    assert X_imputed.shape == (10, 2)
Пример #8
0
def test_iterative_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Пример #9
0
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=0.1,
                               max_value=0.2,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Пример #10
0
def test_iterative_imputer_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 70
    d = 70
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=5, verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.1)
Пример #11
0
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(
        imputer.transform(X_test)[:, 0],
        initial_imputer.transform(X_test)[:, 0])
Пример #12
0
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    hashes = []
    for triplet in imputer.imputation_sequence_:
        expected_type = (type(estimator)
                         if estimator is not None else type(BayesianRidge()))
        assert isinstance(triplet.estimator, expected_type)
        hashes.append(id(triplet.estimator))

    # check that each estimator is unique
    assert len(set(hashes)) == len(hashes)
Пример #13
0
def test_iterative_imputer_catch_warning():
    # check that we catch a RuntimeWarning due to a division by zero when a
    # feature is constant in the dataset
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate that a feature only contain one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(np.arange(n_samples),
                                size=int(n_samples * missing_rate),
                                replace=False)
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
Пример #14
0
def get_results(dataset):
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    full_scores = cross_val_score(REGRESSOR, X_full, y_full,
                                  scoring='neg_mean_squared_error',
                                  cv=N_SPLITS)

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=np.bool),
                                 np.ones(n_missing_samples,
                                         dtype=np.bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    imputer = SimpleImputer(missing_values=0,
                            strategy='constant',
                            fill_value=0)
    zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

    # Estimate the score after imputation (mean strategy) of the missing values
    imputer = SimpleImputer(missing_values=0, strategy="mean")
    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

    # Estimate the score after iterative imputation of the missing values
    imputer = IterativeImputer(missing_values=0,
                               random_state=0,
                               n_nearest_features=5,
                               sample_posterior=True)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))
Пример #15
0
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit and fit_transform should both be identical
    assert_allclose(pred1, pred2)
Пример #16
0
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(
        imputer.transform(X) == imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
Пример #17
0
        y_missing,
        scoring='neg_mean_squared_error',
        cv=N_SPLITS)

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator)
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
            cv=N_SPLITS
        )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'],
    axis=1)

# plot boston results
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
Пример #18
0
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
Пример #19
0
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
Пример #20
0
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)

    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)