Example #1
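All snippets below are shown without their import preamble. They appear to exercise scikit-learn's experimental ChainedImputer (the pre-0.21 name of what became IterativeImputer). A plausible preamble, reconstructed as an assumption rather than copied from the original files, would be:

import numpy as np
from scipy import sparse
from numpy.testing import assert_allclose

from sklearn.impute import SimpleImputer, ChainedImputer  # ChainedImputer: 0.20-dev API (assumed)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import sparse_random_matrix
from sklearn.metrics import mean_squared_error as mse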
def test_chained_imputer_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by ChainedImputer

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             n_nearest_features=5,
                             min_value=0,
                             max_value=1,
                             verbose=False,
                             imputation_order=imputation_order,
                             random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d-1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d-1]
        ordered_idx_round_2 = ordered_idx[d-1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * (d - 1)
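test_chained_imputer_imputation_order receives imputation_order as an argument, so it is presumably driven by a pytest parametrization that is not shown here. A plausible decorator, inferred from the branches the test body checks (an assumption, not the original source), would be:

import pytest

@pytest.mark.parametrize(
    "imputation_order",
    ['random', 'roman', 'ascending', 'descending', 'arabic'])
def test_chained_imputer_imputation_order(imputation_order):
    ...  # body as above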
Example #2
def get_results_multiple_imputation_approach2(X_train, X_test, y_train,
                                              y_test):
    m = 5
    multiple_predictions = []
    for i in range(m):
        # Fit the imputer for every i in m
        # Note that the imputer is fit on the train data only
        # and then applied (transform) to the test data
        imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i)
        X_train_imputed = imputer.fit_transform(X_train)
        X_test_imputed = imputer.transform(X_test)

        # Perform any preprocessing steps before fitting the estimator,
        # such as standardization
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_imputed)
        X_test_scaled = scaler.transform(X_test_imputed)

        # Finally fit the estimator and calculate the predictions for every i
        # in m. Save the predictions.
        estimator = LinearRegression()
        estimator.fit(X_train_scaled, y_train)
        y_predict = estimator.predict(X_test_scaled)
        multiple_predictions.append(y_predict)

    # Average the predictions over the m loops
    # Then calculate the error metric.
    predictions_average = np.mean(multiple_predictions, axis=0)
    mse_approach2 = mse(y_test, predictions_average)

    return mse_approach2
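In this variant the m = 5 imputations are pooled at the prediction stage: each seed produces its own imputed training set and fitted pipeline, the m prediction vectors are averaged, and a single MSE is computed against y_test. The name approach2 implies a companion approach1, but that function is not part of these snippets.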
Example #3
def test_chained_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = ChainedImputer(n_imputations=10, random_state=rng)
    m2 = ChainedImputer(n_imputations=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit(X).transform(X) and fit_transform(X) should give identical results
    assert_allclose(pred1, pred2)
Example #4
def test_chained_imputer_predictors(predictor):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             predictor=predictor,
                             random_state=rng)
    imputer.fit_transform(X)

    # check that a predictor was fit for each imputation step
    hashes = []
    for triplet in imputer.imputation_sequence_:
        assert triplet.predictor
        hashes.append(id(triplet.predictor))

    # check that each predictor is unique
    assert len(set(hashes)) == len(hashes)
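Like the imputation-order test, test_chained_imputer_predictors takes its predictor argument from a parametrization that is not shown. A plausible, assumed set of values (any regressor whose predict supports return_std, as the chained model samples from the posterior) might be:

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import ARDRegression, BayesianRidge

@pytest.mark.parametrize(
    "predictor",
    [DummyRegressor(), BayesianRidge(), ARDRegression()])
def test_chained_imputer_predictors(predictor):
    ...  # body as above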
Example #5
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', 'constant']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        chained_imputer = ChainedImputer(initial_strategy=strategy)
        X_imputed = chained_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
Example #6
def test_chained_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 100
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = ChainedImputer(n_imputations=5,
                             n_burn_in=5,
                             verbose=True,
                             random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.001)
Example #7
def test_chained_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             min_value=0.1,
                             max_value=0.2,
                             random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #8
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    # Apply imputation
    imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0)
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Standardize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Perform estimation and prediction
    estimator = LinearRegression()
    estimator.fit(X_train_scaled, y_train)
    y_predict = estimator.predict(X_test_scaled)
    mse_single = mse(y_test, y_predict)

    return mse_single
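Neither helper above is invoked in these snippets. A minimal driver, with toy data and variable names that are illustrative assumptions rather than part of the original example, could look like this:

rng = np.random.RandomState(42)
X_full = rng.rand(200, 5)
y = X_full @ rng.rand(5) + 0.1 * rng.randn(200)
X = X_full.copy()
X[rng.rand(*X.shape) < 0.2] = np.nan  # knock out ~20% of the entries at random

X_train, X_test = X[:150], X[150:]
y_train, y_test = y[:150], y[150:]

print("single imputation MSE:  ",
      get_results_single_imputation(X_train, X_test, y_train, y_test))
print("multiple imputation MSE:",
      get_results_multiple_imputation_approach2(X_train, X_test,
                                                y_train, y_test))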