Пример #1
0
def get_results_multiple_imputation_approach2(X_train, X_test, y_train,
                                              y_test):
    """Return the test error of predictions averaged over several imputations.

    For each of five seeds a fresh ``ChainedImputer`` is fitted on the
    training data and applied to the test data, the result is standardized,
    and a ``LinearRegression`` is trained to predict the test targets. The
    five prediction vectors are averaged element-wise and scored against
    ``y_test`` with ``mse``.

    Parameters
    ----------
    X_train, X_test : feature matrices passed to the imputer (may contain
        missing values).
    y_train, y_test : target values for the corresponding feature matrices.

    Returns
    -------
    The value of ``mse(y_test, averaged_predictions)``.
    """
    n_rounds = 5

    def _predict_with_seed(seed):
        # The imputer learns from the training split only and is then
        # reused, unchanged, on the test split.
        chained = ChainedImputer(n_burn_in=99, n_imputations=1,
                                 random_state=seed)
        train_filled = chained.fit_transform(X_train)
        test_filled = chained.transform(X_test)

        # Any preprocessing ahead of the estimator goes here; in this case
        # standardization fitted on the imputed training data.
        standardizer = StandardScaler()
        train_ready = standardizer.fit_transform(train_filled)
        test_ready = standardizer.transform(test_filled)

        # Train the model and hand back its predictions for this seed.
        model = LinearRegression()
        model.fit(train_ready, y_train)
        return model.predict(test_ready)

    predictions_per_seed = [_predict_with_seed(seed)
                            for seed in range(n_rounds)]

    # Pool by averaging across seeds, then compute the error metric once.
    pooled_predictions = np.mean(predictions_per_seed, axis=0)
    return mse(y_test, pooled_predictions)
Пример #2
0
def test_chained_imputer_additive_matrix():
    """ChainedImputer should recover held-out rows of an additive matrix.

    The complete matrix is built by accumulating averaged pairs of columns
    taken from two random normal matrices. A quarter of its entries are then
    masked as NaN, the imputer is fitted on the first half of the rows, and
    its output on the second half must match the complete values to within
    ``atol=0.01``.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 10

    # The draw order from ``rng`` (A, B, then the mask) is significant.
    A = rng.randn(n_rows, n_cols)
    B = rng.randn(n_rows, n_cols)

    X_filled = np.zeros(A.shape)
    for a_col in range(n_cols):
        for b_col in range(n_cols):
            target_col = (a_col + b_col) % n_cols
            X_filled[:, target_col] += (A[:, a_col] + B[:, b_col]) / 2

    # Roughly a quarter of the entries are marked missing at random.
    nan_mask = rng.rand(n_rows, n_cols) < 0.25
    X_missing = np.where(nan_mask, np.nan, X_filled)

    # First half of the rows trains the imputer, second half evaluates it.
    half = n_rows // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    unfitted = ChainedImputer(n_imputations=25,
                              n_burn_in=10,
                              verbose=True,
                              random_state=rng)
    imputer = unfitted.fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.01)
Пример #3
0
def test_chained_imputer_additive_matrix():
    """Verify imputation accuracy on a matrix with additive column structure.

    Two random normal matrices are combined column-pair by column-pair into
    one complete matrix; about 25% of its cells are replaced by NaN. The
    imputer is trained on the leading half of the rows and its estimate for
    the trailing half is compared with the true values (``atol=0.01``).
    """
    rng = np.random.RandomState(0)
    num_samples = 100
    num_features = 10

    # Keep this sampling order: first A, then B, and the mask afterwards.
    A = rng.randn(num_samples, num_features)
    B = rng.randn(num_samples, num_features)

    X_filled = np.zeros(A.shape)
    for left in range(num_features):
        for right in range(num_features):
            dest = (left + right) % num_features
            X_filled[:, dest] += (A[:, left] + B[:, right]) / 2

    # Knock out about a quarter of the cells.
    nan_mask = rng.rand(num_samples, num_features) < 0.25
    X_missing = np.where(nan_mask, np.nan, X_filled)

    # Train on the leading half, evaluate on the trailing half.
    split_at = num_samples // 2
    X_train = X_missing[:split_at]
    X_test = X_missing[split_at:]
    X_test_filled = X_filled[split_at:]

    unfitted = ChainedImputer(n_imputations=25,
                              n_burn_in=10,
                              verbose=True,
                              random_state=rng)
    imputer = unfitted.fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.01)
Пример #4
0
def get_results_mice_imputation_includingy(X_incomplete, y):
    """Pool regression estimates over imputations that also see the target.

    The target ``y`` is appended as an extra column to ``X_incomplete`` so
    that it takes part in the imputation. Five ``ChainedImputer`` instances
    (seeds 0..4, ``n_burn_in=99``, ``n_imputations=1``) each produce one
    completed dataset; the appended target column is dropped again before a
    ``LinearRegression`` is fitted on every completed dataset. Coefficients
    and their variances are then combined by ``calculate_Qbar`` and
    ``calculate_T`` (Rubin's rules, per the helper names and the original
    comments -- their implementation is not visible here).

    Parameters
    ----------
    X_incomplete : feature matrix with missing values.
    y : target values, one per row of ``X_incomplete``.

    Returns
    -------
    Qbar : pooled estimate returned by ``calculate_Qbar``.
    T : pooled variance returned by ``calculate_T``.
    mice_errorbar : ``1.96 * sqrt(T)``.
    """
    m = 5

    # The stacked matrix does not depend on the loop index, so build it once
    # instead of on every iteration. This assumes the imputer leaves its
    # input array untouched, as scikit-learn transformers do by default.
    Xy = np.column_stack((X_incomplete, y))

    multiple_imputations = []
    for i in range(m):
        imputer = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=i)
        imputer.fit(Xy)
        data_imputed = imputer.transform(Xy)

        # Keep only the imputed features: the last column is y, and y must
        # not be used to predict y later on.
        multiple_imputations.append(data_imputed[:, :-1])

    # Fit a linear regression on each imputed dataset and record the
    # coefficient estimates together with their variances.
    m_coefs = []
    m_vars = []
    for X_imputed in multiple_imputations:
        estimator = LinearRegression()
        estimator.fit(X_imputed, y)
        y_predict = estimator.predict(X_imputed)
        m_coefs.append(estimator.coef_)
        m_vars.append(
            calculate_variance_of_beta_estimates(y, y_predict, X_imputed))

    # Calculate the end estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)

    return Qbar, T, mice_errorbar
Пример #5
0
def get_results_mice_imputation(X_incomplete, y):
    """Pool regression estimates over several chained imputations.

    Five ``ChainedImputer`` instances (seeds 0..4) are each fitted on
    ``X_incomplete`` with ``n_burn_in=99`` and ``n_imputations=1`` and then
    used to transform it, giving five completed datasets. A
    ``LinearRegression`` is fitted on every dataset and the resulting
    coefficients and variances are combined by ``calculate_Qbar`` and
    ``calculate_T``.

    Parameters
    ----------
    X_incomplete : feature matrix with missing values.
    y : target values, one per row of ``X_incomplete``.

    Returns
    -------
    Qbar : pooled estimate returned by ``calculate_Qbar``.
    T : pooled variance returned by ``calculate_T``.
    mice_errorbar : ``1.96 * sqrt(T)``.
    """
    n_datasets = 5

    # One completed dataset per seed.
    completed_datasets = []
    for seed in range(n_datasets):
        chained = ChainedImputer(n_burn_in=99, n_imputations=1,
                                 random_state=seed)
        chained.fit(X_incomplete)
        completed_datasets.append(chained.transform(X_incomplete))

    # Model every completed dataset separately and keep its estimates.
    coefs_per_dataset = []
    variances_per_dataset = []
    for X_completed in completed_datasets:
        regression = LinearRegression()
        regression.fit(X_completed, y)
        fitted_values = regression.predict(X_completed)
        coefs_per_dataset.append(regression.coef_)
        variances_per_dataset.append(
            calculate_variance_of_beta_estimates(y, fitted_values,
                                                 X_completed))

    # Combine the per-dataset results (Rubin's rules).
    pooled_estimate = calculate_Qbar(coefs_per_dataset)
    total_variance = calculate_T(coefs_per_dataset, variances_per_dataset,
                                 pooled_estimate)
    half_width = 1.96 * np.sqrt(total_variance)

    return pooled_estimate, total_variance, half_width
Пример #6
0
def test_chained_imputer_transform_stochasticity():
    """Two ``transform`` calls on the same fitted imputer should differ.

    Zeros are treated as the missing value. The test only compares the
    overall means of the two outputs, which is enough to show that they are
    not identical.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 10
    sparse_X = sparse_random_matrix(n_rows, n_cols, density=0.10,
                                    random_state=rng)
    X = sparse_X.toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             random_state=rng)
    imputer.fit(X)

    first_pass = imputer.transform(X)
    second_pass = imputer.transform(X)

    # Differing means are sufficient evidence of differing outputs.
    first_mean = np.mean(first_pass)
    second_mean = np.mean(second_pass)
    assert first_mean != pytest.approx(second_mean)
Пример #7
0
def test_chained_imputer_transform_stochasticity():
    """Repeated ``transform`` calls must not give the same imputed values.

    The input is a densified sparse random matrix in which zero marks a
    missing entry. After a single fit, the data is transformed twice and the
    means of the two results are required to differ.
    """
    rng = np.random.RandomState(0)
    num_samples = 100
    num_features = 10
    X = sparse_random_matrix(
        num_samples, num_features, density=0.10, random_state=rng
    ).toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             random_state=rng)
    imputer.fit(X)

    run_a = imputer.transform(X)
    run_b = imputer.transform(X)

    # Comparing the means is enough to show the runs are not equal.
    mean_a = np.mean(run_a)
    mean_b = np.mean(run_b)
    assert mean_a != pytest.approx(mean_b)
Пример #8
0
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    """Return the test error of a regression trained on singly imputed data.

    The pipeline is: impute with one ``ChainedImputer`` (seed 0,
    ``n_burn_in=99``, ``n_imputations=1``), standardize, fit a
    ``LinearRegression`` and score its test predictions with ``mse``.
    Both the imputer and the scaler are fitted on the training split only.

    Parameters
    ----------
    X_train, X_test : feature matrices passed to the imputer (may contain
        missing values).
    y_train, y_test : target values for the corresponding feature matrices.

    Returns
    -------
    The value of ``mse(y_test, predictions)``.
    """
    # Imputation step.
    chained = ChainedImputer(n_burn_in=99, n_imputations=1, random_state=0)
    train_filled = chained.fit_transform(X_train)
    test_filled = chained.transform(X_test)

    # Standardization step.
    standardizer = StandardScaler()
    train_ready = standardizer.fit_transform(train_filled)
    test_ready = standardizer.transform(test_filled)

    # Estimation and prediction step.
    model = LinearRegression()
    model.fit(train_ready, y_train)
    predictions = model.predict(test_ready)

    return mse(y_test, predictions)
Пример #9
0
def get_results_chained_imputation(X_incomplete, y):
    """Estimate regression coefficients from a single chained imputation.

    ``X_incomplete`` is completed once by a ``ChainedImputer`` configured
    with ``n_burn_in=99`` and ``n_imputations=1`` (no explicit random seed),
    and a ``LinearRegression`` is fitted on the completed data.

    Parameters
    ----------
    X_incomplete : feature matrix with missing values.
    y : target values, one per row of ``X_incomplete``.

    Returns
    -------
    chained_coefs : the fitted ``coef_`` of the regression.
    chained_vars : result of ``calculate_variance_of_beta_estimates``.
    chained_errorbar : ``1.96 * sqrt(chained_vars)``.
    """
    # Single imputation: fit on the incomplete data, then fill it in.
    chained = ChainedImputer(n_burn_in=99, n_imputations=1)
    chained.fit(X_incomplete)
    X_completed = chained.transform(X_incomplete)

    # Linear regression on the completed data, predicting the same rows.
    regression = LinearRegression()
    regression.fit(X_completed, y)
    fitted_values = regression.predict(X_completed)

    # Coefficients, their variances, and 1.96 times the standard error.
    coefs = regression.coef_
    variances = calculate_variance_of_beta_estimates(y, fitted_values,
                                                     X_completed)
    errorbar = 1.96 * np.sqrt(variances)

    return coefs, variances, errorbar
Пример #10
0
def test_chained_imputer_missing_at_transform(strategy):
    """A feature complete at fit time falls back to the initial imputer.

    Column 0 has no missing values (zeros) in the training data but has one
    in the test data. The ``ChainedImputer`` output for that column must
    equal what a ``SimpleImputer`` with the same ``strategy`` produces.

    Parameters
    ----------
    strategy : value forwarded as ``initial_strategy`` / ``strategy``.
    """
    rng = np.random.RandomState(0)
    shape = (100, 10)
    X_train = rng.randint(low=0, high=3, size=shape)
    X_test = rng.randint(low=0, high=3, size=shape)

    X_train[:, 0] = 1  # column 0 is fully observed during fit
    X_test[0, 0] = 0  # column 0 has a missing entry at transform time

    chained = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             initial_strategy=strategy,
                             random_state=rng)
    imputer = chained.fit(X_train)
    simple = SimpleImputer(missing_values=0, strategy=strategy)
    initial_imputer = simple.fit(X_train)

    # With nothing missing in a feature at fit time, only the initial
    # imputer is used for that feature at transform time.
    chained_col = imputer.transform(X_test)[:, 0]
    simple_col = initial_imputer.transform(X_test)[:, 0]
    assert np.all(chained_col == simple_col)
Пример #11
0
def test_chained_imputer_transform_recovery(rank):
    """ChainedImputer should recover held-out rows of a low-rank matrix.

    The complete matrix is the product of an ``(n, rank)`` and a
    ``(rank, d)`` uniform random matrix. Half of its entries are masked as
    NaN; the imputer is fitted on the first half of the rows and its output
    on the second half is compared with the complete values.

    Parameters
    ----------
    rank : inner dimension of the two random factors.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 100

    # The draw order from ``rng`` (A, B, then the mask) is significant.
    A = rng.rand(n_rows, rank)
    B = rng.rand(rank, n_cols)
    X_filled = np.dot(A, B)

    # About half of the entries are marked missing at random.
    nan_mask = rng.rand(n_rows, n_cols) < 0.5
    X_missing = np.where(nan_mask, np.nan, X_filled)

    # First half of the rows trains the imputer, second half evaluates it.
    half = n_rows // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    unfitted = ChainedImputer(n_imputations=10,
                              n_burn_in=10,
                              verbose=True,
                              random_state=rng)
    imputer = unfitted.fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
Пример #12
0
def test_chained_imputer_transform_recovery(rank):
    """Verify imputation accuracy on a matrix of the given rank.

    Two uniform random factors are multiplied into a 100 x 100 matrix, about
    50% of whose cells are replaced by NaN. The imputer is trained on the
    leading 50 rows and its estimate for the trailing 50 rows is checked
    against the true values (``rtol=1e-5``, ``atol=0.1``).

    Parameters
    ----------
    rank : inner dimension of the two random factors.
    """
    rng = np.random.RandomState(0)
    num_samples = 100
    num_features = 100

    # Keep this sampling order: first A, then B, and the mask afterwards.
    A = rng.rand(num_samples, rank)
    B = rng.rand(rank, num_features)
    X_filled = np.dot(A, B)

    # Knock out about half of the cells.
    nan_mask = rng.rand(num_samples, num_features) < 0.5
    X_missing = np.where(nan_mask, np.nan, X_filled)

    # Train on the leading half, evaluate on the trailing half.
    split_at = num_samples // 2
    X_train = X_missing[:split_at]
    X_test = X_missing[split_at:]
    X_test_filled = X_filled[split_at:]

    unfitted = ChainedImputer(n_imputations=10,
                              n_burn_in=10,
                              verbose=True,
                              random_state=rng)
    imputer = unfitted.fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
Пример #13
0
def test_chained_imputer_missing_at_transform(strategy):
    """Check the fallback to the initial imputer for a fully observed column.

    The first training column contains no zeros (the missing marker), while
    the first test column contains one. For that column the
    ``ChainedImputer`` result has to coincide with the result of a
    ``SimpleImputer`` fitted with the same ``strategy``.

    Parameters
    ----------
    strategy : value forwarded as ``initial_strategy`` / ``strategy``.
    """
    rng = np.random.RandomState(0)
    num_samples = 100
    num_features = 10
    X_train = rng.randint(low=0, high=3, size=(num_samples, num_features))
    X_test = rng.randint(low=0, high=3, size=(num_samples, num_features))

    X_train[:, 0] = 1  # no missing values in the first column while fitting
    X_test[0, 0] = 0  # one missing value in the first column at transform

    chained = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             initial_strategy=strategy,
                             random_state=rng)
    imputer = chained.fit(X_train)
    simple = SimpleImputer(missing_values=0, strategy=strategy)
    initial_imputer = simple.fit(X_train)

    # A feature without missing values at fit time is handled by the
    # initial imputer alone when transforming.
    from_chained = imputer.transform(X_test)[:, 0]
    from_simple = initial_imputer.transform(X_test)[:, 0]
    assert np.all(from_chained == from_simple)