def test_defaults_pandas():
    """Smoke-test ImputationKernel with mostly default settings on pandas data.

    Builds two kernels on the module-level ``boston_amp`` frame, appends the
    second onto the first, keeps running ``mice`` on the merged kernel, and
    finally imputes a 10-row slice as "new" data. Along the way it checks
    that in-place completion fills ``working_data`` while the caller's
    frames keep their missing values.
    """
    # Rows labelled 0..9 of the amputed data, copied so they can act as new data.
    holdout = boston_amp.loc[range(10), :].copy()

    main_kernel = mf.ImputationKernel(
        data=boston_amp,
        datasets=2,
        mean_match_scheme=mean_match_scheme_fast_cat,
        save_models=1,
        initialization="empty",
    )
    main_kernel.mice(iterations=2, compile_candidates=True, verbose=True)

    extra_kernel = mf.ImputationKernel(
        data=boston_amp,
        datasets=1,
        mean_match_scheme=mean_match_scheme_fast_cat,
    )
    extra_kernel.mice(iterations=2)

    # Merge the one-dataset kernel into the two-dataset kernel, then make
    # sure candidate predictions can be compiled and mice() still runs.
    main_kernel.append(extra_kernel)
    main_kernel.compile_candidate_preds()
    main_kernel.mice(1)

    # In-place completion must leave no missing values in working_data.
    main_kernel.complete_data(0, inplace=True)
    missing_per_column = main_kernel.working_data.isnull().sum()
    assert all(missing_per_column == 0)

    # The model fetched with get_model(0, <variable>, 3) must carry the
    # objective expected for that variable.
    expected_objectives = {
        0: 'regression',
        3: 'binary',
        8: 'multiclass',
    }
    for variable, objective in expected_objectives.items():
        fitted = main_kernel.get_model(0, variable, 3)
        assert fitted.params['objective'] == objective

    # The source frame must still contain missing values in every column.
    source_missing = boston_amp.isnull().sum()
    assert all(source_missing > 0)

    # Impute the held-out rows; completing in place fills the imputed
    # object's working_data but must not touch the frame we passed in.
    imputed_holdout = main_kernel.impute_new_data(holdout)
    imputed_holdout.complete_data(2, inplace=True)
    holdout_missing = imputed_holdout.working_data.isnull().sum(0)
    assert all(holdout_missing == 0)
    assert holdout.isnull().sum().sum() > 0
def test_defaults_numpy():
    """Smoke-test ImputationKernel with mostly default settings on numpy data.

    The categorical columns "3" and "8" of ``boston_amp`` are converted to
    integer codes (missing -> NaN) and the frame is handed to the kernel as a
    plain ndarray. The test then checks that ``complete_data`` with
    ``inplace=False`` returns a filled copy without touching the kernel or
    the source array, that ``inplace=True`` fills only the kernel's
    ``working_data``, and that imputing new data leaves the caller's array
    untouched. The elapsed wall-clock time is printed at the end.
    """
    working_set = boston_amp.copy()

    # Categorical codes use -1 for missing values; turn those into NaN.
    # The replaced Series is assigned back to the frame rather than calling
    # ``working_set[col].replace(..., inplace=True)``: an in-place call on a
    # column selection does not reach the parent frame under pandas
    # Copy-on-Write. ``np.nan`` is used because the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)
    new_data = working_set.loc[range(10), :].copy()
    working_set = working_set.values
    new_data = new_data.values

    s = datetime.now()
    kernel = mf.ImputationKernel(data=working_set,
                                 datasets=3,
                                 categorical_feature=[3, 8],
                                 mean_match_scheme=mean_match_scheme_fast_cat)
    kernel.mice(iterations=1, verbose=True)
    kernel.compile_candidate_preds()

    # Complete data with copy.
    comp_dat = kernel.complete_data(0, inplace=False)

    # We didn't complete data in place. Make sure we created
    # a copy, and did not affect internal data or original data.
    assert all(np.isnan(comp_dat).sum(0) == 0)
    assert all(np.isnan(kernel.working_data).sum(0) > 0)
    assert all(np.isnan(working_set).sum(0) > 0)

    # Complete data in place
    kernel.complete_data(0, inplace=True)

    # We completed data in place. Make sure we only affected
    # the kernel.working_data and not the original data.
    assert all(np.isnan(kernel.working_data).sum(0) == 0)
    assert all(np.isnan(working_set).sum(0) > 0)

    # Imputing new data and completing it in place must fill the imputed
    # object's working_data while the array we passed in keeps its NaNs.
    imp_ds = kernel.impute_new_data(new_data)
    imp_ds.complete_data(0, inplace=True)
    assert all(np.isnan(imp_ds.working_data).sum(0) == 0)
    assert np.isnan(new_data).sum() > 0
    print(datetime.now() - s)
def test_pandas_reproducibility():
    """Check that seeding makes imputations reproducible on pandas data.

    Covers three levels:

    * two kernels built with the same ``random_state`` agree after
      initialization and after ``mice``;
    * with a per-record ``random_seed_array``, ``impute_new_data`` gives the
      same values for a record regardless of row order or subsetting;
    * if the rows are shuffled but the seed array is not, at least some
      imputations differ.

    Every record-level check is run for each imputed dataset, not just
    dataset 0. Relies on the module-level ``boston_amp``, ``rows``,
    ``random_state`` and ``random_seed_array`` fixtures.
    """
    datasets = 2
    kernel = mf.ImputationKernel(data=boston_amp,
                                 datasets=datasets,
                                 initialization="random",
                                 save_models=2,
                                 random_state=2)

    kernel2 = mf.ImputationKernel(data=boston_amp,
                                  datasets=datasets,
                                  initialization="random",
                                  save_models=2,
                                  random_state=2)

    assert kernel.complete_data(0).equals(kernel2.complete_data(0)), (
        "random_state initialization failed to be deterministic")

    # Run mice for 2 iterations
    kernel.mice(2)
    kernel2.mice(2)

    assert kernel.complete_data(0).equals(kernel2.complete_data(0)), (
        "random_state after mice() failed to be deterministic")

    # Baseline: impute the original data as if it were new, with one seed
    # per record.
    kernel_imputed_as_new = kernel.impute_new_data(
        boston_amp, random_state=4, random_seed_array=random_seed_array)

    # Generate and impute new data as a reordering of original.
    # The seeds are reordered the same way, so each record keeps its seed.
    new_order = np.arange(rows)
    random_state.shuffle(new_order)
    new_data = boston_amp.loc[new_order]
    new_seeds = random_seed_array[new_order]
    new_imputed = kernel.impute_new_data(new_data,
                                         random_state=4,
                                         random_seed_array=new_seeds)

    # Expect deterministic imputations at the record level, since seeds were passed.
    for i in range(datasets):
        reordered_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_order]
        new_data_completed = new_imputed.complete_data(dataset=i)

        assert (reordered_kernel_completed == new_data_completed).all().all(
        ), ("Seeds did not cause deterministic imputations when data was reordered."
            )

    # Generate and impute new data as a subset of original
    new_ind = [0, 1, 4, 7, 8, 10]
    new_data = boston_amp.loc[new_ind]
    new_seeds = random_seed_array[new_ind]
    new_imputed = kernel.impute_new_data(new_data,
                                         random_state=4,
                                         random_seed_array=new_seeds)

    # Expect deterministic imputations at the record level, since seeds were passed.
    for i in range(datasets):
        subset_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_ind]
        new_data_completed = new_imputed.complete_data(dataset=i)

        assert (subset_kernel_completed == new_data_completed).all().all(
        ), ("Seeds did not cause deterministic imputations when data was subset."
            )

    # Generate and impute new data as a reordering of original, but this
    # time pass the seed array in its ORIGINAL order, so records no longer
    # line up with the seeds they had in the baseline.
    new_order = np.arange(rows)
    random_state.shuffle(new_order)
    new_data = boston_amp.loc[new_order]
    new_imputed = kernel.impute_new_data(new_data,
                                         random_state=4,
                                         random_seed_array=random_seed_array)

    # Expect at least some imputations to differ from the baseline, since
    # the records were paired with different seeds.
    for i in range(datasets):
        reordered_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_order]
        new_data_completed = new_imputed.complete_data(dataset=i)

        assert not (reordered_kernel_completed == new_data_completed).all(
        ).all(), (
            "Different seeds caused deterministic imputations for all rows / columns."
        )
def test_complex_pandas():
    """Exercise a heavily customised ImputationKernel on a pandas DataFrame.

    Two kernels are built with a custom ``variable_schema``,
    ``imputation_order``, ``mean_match_candidates`` and ``data_subset``. The
    second kernel is round-tripped through ``save_kernel`` / ``load_kernel``
    and appended to the first. The test then checks parameter aliasing in
    ``mice``, iteration counting, which variables get imputed, parameter
    tuning, imputation of new data, and that the plotting helpers run.
    """
    import os  # local import: only needed to clean up the temp file below

    working_set = boston_amp.copy()

    # Switch our category columns to integer codes.
    # Replace -1 with NaN or lightgbm will complain.
    # The replaced Series is assigned back to the frame rather than calling
    # ``working_set[col].replace(..., inplace=True)``: an in-place call on a
    # column selection does not reach the parent frame under pandas
    # Copy-on-Write. ``np.nan`` is used because the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)
    new_data = working_set.loc[range(10), :].copy()

    # Customize everything.
    vs = {
        "1": ["2", "3", "4", "5"],
        "2": ["6", "7"],
        "3": ["1", "2", "8"],
        "4": ["8", "9", "10"]
    }
    mmc = {"1": 4, "2": 0.01, "3": 0}
    ds = {"2": 100, "3": 0.5}
    io = ["2", "3", "1"]

    imputed_var_names = io
    non_imputed_var_names = [str(x) for x in range(13) if str(x) not in io]

    kernel = mf.ImputationKernel(data=working_set,
                                 datasets=2,
                                 variable_schema=vs,
                                 imputation_order=io,
                                 train_nonmissing=True,
                                 mean_match_candidates=mmc,
                                 data_subset=ds,
                                 categorical_feature=[3, 8],
                                 copy_data=False)
    kernel2 = mf.ImputationKernel(data=working_set,
                                  datasets=1,
                                  variable_schema=vs,
                                  imputation_order=io,
                                  train_nonmissing=True,
                                  mean_match_candidates=mmc,
                                  data_subset=ds,
                                  categorical_feature=[3, 8],
                                  copy_data=False)

    # Round-trip kernel2 through disk. mkstemp() returns an already-open
    # OS-level file descriptor along with the path; close it right away so
    # the descriptor does not leak, and delete the file once the kernel has
    # been loaded back.
    fd, filename = mkstemp()
    os.close(fd)
    try:
        kernel2.save_kernel(filename)
        kernel2 = mf.utils.load_kernel(filename)
    finally:
        os.remove(filename)

    assert kernel.mean_match_candidates == {
        1: 4,
        2: 3,
        3: 0,
        4: 5
    }, "mean_match_candidates initialization failed"
    assert kernel.data_subset == {
        1: 380,
        2: 100,
        3: 190,
        4: 380
    }, "mean_match_subset initialization failed"
    assert kernel.iteration_count() == 0, "iteration initialization failed"
    assert kernel.categorical_variables == [
        3, 8
    ], "categorical recognition failed."

    # This section tests many things:
    # After saving / loading a kernel, and appending 2 kernels together:
    # mice can continue
    # Aliases are fixed, even when different aliases are passed
    # variable specific parameters supersede globally specified parameters
    # The parameters come through the actual model
    nround = 2
    kernel.mice(nround - 1,
                compile_candidates=True,
                variable_parameters={"1": {
                    "n_iter": 15
                }},
                num_trees=10,
                verbose=True)
    kernel.compile_candidate_preds()
    kernel2.mice(nround - 1,
                 variable_parameters={"1": {
                     "n_estimators": 15
                 }},
                 n_estimators=10,
                 verbose=True)
    kernel.append(kernel2)
    kernel.compile_candidate_preds()
    assert kernel.get_model(0, 1, nround - 1).num_trees() == 15
    assert kernel.get_model(0, 2, nround - 1).num_trees() == 10
    kernel.mice(1,
                variable_parameters={1: {
                    "n_iter": 15
                }},
                num_trees=10,
                verbose=True)
    assert kernel.iteration_count(
    ) == nround, "iteration counting is incorrect."
    assert kernel.get_model(0, 1, nround).num_trees() == 15
    assert kernel.get_model(0, 2, nround).num_trees() == 10

    # Make sure we only impute the variables listed in imputation_order
    compdat = kernel.complete_data(0)
    assert all(compdat[imputed_var_names].isnull().sum() == 0)
    assert all(compdat[non_imputed_var_names].isnull().sum() > 0)

    # Test the ability to tune parameters with custom setup.
    # Variable 1 gets its own fixed value / search range; every other
    # variable falls back to the globally supplied ones.
    optimization_steps = 2
    op, ol = kernel.tune_parameters(dataset=0,
                                    optimization_steps=optimization_steps,
                                    variable_parameters={
                                        1: {
                                            "bagging_fraction": 0.9,
                                            "feature_fraction_bynode":
                                            (0.85, 0.9)
                                        }
                                    },
                                    bagging_fraction=0.8,
                                    feature_fraction_bynode=(0.70, 0.75),
                                    verbose=True)
    assert op[1]["bagging_fraction"] == 0.9
    assert op[2]["bagging_fraction"] == 0.8
    assert (op[1]["feature_fraction_bynode"] >=
            0.85) and (op[1]["feature_fraction_bynode"] <= 0.9)
    assert (op[2]["feature_fraction_bynode"] >=
            0.70) and (op[2]["feature_fraction_bynode"] <= 0.75)

    # Feed the tuned parameters back into mice and confirm they reach the
    # models trained in that extra iteration.
    kernel.mice(1, variable_parameters=op, verbose=True)
    model_2_params = kernel.get_model(0, 2, nround + 1).params
    model_1_params = kernel.get_model(0, 1, nround + 1).params
    assert model_2_params["bagging_fraction"] == 0.8
    assert model_1_params["bagging_fraction"] == 0.9
    assert (model_2_params["feature_fraction_bynode"] >=
            0.70) and (model_2_params["feature_fraction_bynode"] <= 0.75)
    assert (model_1_params["feature_fraction_bynode"] >=
            0.85) and (model_1_params["feature_fraction_bynode"] <= 0.9)

    new_imp_dat = kernel.impute_new_data(new_data=new_data, verbose=True)
    new_imp_complete = new_imp_dat.complete_data(0)
    # Bare lookups: these only verify that the entries exist (a missing key
    # would raise); the values themselves are not inspected.
    new_imp_dat.na_where[2]
    new_imp_dat.imputation_values[0, 2, 3]
    assert all(new_imp_complete[["1", "2", "3", "4"]].isnull().sum() == 0)

    # Plotting on multiple imputed dataset
    new_imp_dat.plot_mean_convergence()
    close()
    new_imp_dat.plot_imputed_distributions()
    close()

    # Plotting on Multiple Imputed Kernel
    kernel.plot_feature_importance(0)
    close()
    kernel.plot_mean_convergence()
    close()
    kernel.plot_imputed_distributions()
    close()
def test_complex_numpy():
    """Exercise a heavily customised ImputationKernel on a numpy ndarray.

    Two kernels are built with a custom ``variable_schema``,
    ``imputation_order``, ``mean_match_candidates`` and ``data_subset``. The
    second kernel is round-tripped through ``save_kernel`` / ``load_kernel``
    and appended to the first. Both kernels are created with
    ``copy_data=False``, so the test also verifies exactly when the
    caller's arrays are (and are not) modified by ``complete_data`` and
    ``impute_new_data``. It finishes with parameter tuning and a run of the
    plotting helpers.
    """
    import os  # local import: only needed to clean up the temp file below

    working_set = boston_amp.copy()

    # Switch our category columns to integer codes.
    # Replace -1 with NaN or lightgbm will complain.
    # The replaced Series is assigned back to the frame rather than calling
    # ``working_set[col].replace(..., inplace=True)``: an in-place call on a
    # column selection does not reach the parent frame under pandas
    # Copy-on-Write. ``np.nan`` is used because the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)
    new_data = working_set.loc[range(100), :].copy()

    working_set = working_set.values
    new_data = new_data.values

    # Specify that models should be built for variables 1, 2, 3, 4
    vs = {1: [2, 3, 4, 5], 2: [6, 7], 3: [1, 2, 8], 4: [8, 9, 10]}
    mmc = {1: 4, 2: 0.01, 3: 0}
    ds = {2: 100, 3: 0.5}
    # Only variables 1, 2, 3 should be imputed using mice.
    io = [2, 3, 1]
    # Column indices outside the imputation order / outside the schema.
    niv = np.setdiff1d(np.arange(working_set.shape[1]), io)
    nivs = np.setdiff1d(np.arange(working_set.shape[1]), list(vs))

    kernel = mf.ImputationKernel(data=working_set,
                                 datasets=2,
                                 variable_schema=vs,
                                 imputation_order=io,
                                 train_nonmissing=True,
                                 mean_match_candidates=mmc,
                                 data_subset=ds,
                                 mean_match_scheme=mean_match_scheme_fast_cat,
                                 categorical_feature=[3, 8],
                                 copy_data=False)

    kernel2 = mf.ImputationKernel(data=working_set,
                                  datasets=1,
                                  variable_schema=vs,
                                  imputation_order=io,
                                  train_nonmissing=True,
                                  mean_match_candidates=mmc,
                                  data_subset=ds,
                                  mean_match_scheme=mean_match_scheme_fast_cat,
                                  categorical_feature=[3, 8],
                                  copy_data=False)

    # Round-trip kernel2 through disk. mkstemp() returns an already-open
    # OS-level file descriptor along with the path; close it right away so
    # the descriptor does not leak, and delete the file once the kernel has
    # been loaded back.
    fd, filename = mkstemp()
    os.close(fd)
    try:
        kernel2.save_kernel(filename)
        kernel2 = mf.utils.load_kernel(filename)
    finally:
        os.remove(filename)

    assert kernel.mean_match_candidates == {
        2: 3,
        3: 0,
        1: 4,
        4: 5
    }, "mean_match_candidates initialization failed"
    assert kernel.data_subset == {
        2: 100,
        3: 190,
        1: 380,
        4: 380
    }, "mean_match_subset initialization failed"
    assert kernel.iteration_count() == 0, "iteration initialization failed"
    assert kernel.categorical_variables == [
        3, 8
    ], "categorical recognition failed."

    # Different aliases for the tree-count parameter are used on purpose;
    # the variable-specific value (15) must win over the global one (10).
    nround = 2
    kernel.mice(nround - 1,
                variable_parameters={1: {
                    "n_iter": 15
                }},
                num_trees=10,
                verbose=True)
    kernel.compile_candidate_preds()
    kernel2.mice(nround - 1,
                 variable_parameters={1: {
                     "n_iter": 15
                 }},
                 num_trees=10,
                 verbose=True)
    kernel.append(kernel2)
    kernel.compile_candidate_preds()
    assert kernel.get_model(0, 1, nround - 1).num_trees() == 15
    assert kernel.get_model(0, 2, nround - 1).num_trees() == 10
    kernel.mice(1,
                variable_parameters={1: {
                    "n_estimators": 15
                }},
                n_estimators=10,
                verbose=True)
    assert kernel.iteration_count(
    ) == nround, "iteration counting is incorrect."
    assert kernel.get_model(0, 1, nround).num_trees() == 15
    assert kernel.get_model(0, 2, nround).num_trees() == 10

    # Complete data with copy. Make sure only correct datasets and variables were affected.
    compdat = kernel.complete_data(0, inplace=False)
    assert all(np.isnan(compdat[:, io]).sum(0) == 0)
    assert all(np.isnan(compdat[:, niv]).sum(0) > 0)

    # Should have no effect on working_data
    assert all(np.isnan(kernel.working_data).sum(0) > 0)

    # Should have no effect on working_set
    assert all(np.isnan(working_set).sum(0) > 0)

    # Now complete the data in place
    kernel.complete_data(0, inplace=True)

    # Should have an effect on working_data and on the original data,
    # because the kernel was built with copy_data=False.
    assert all(np.isnan(kernel.working_data[:, io]).sum(0) == 0)
    assert all(np.isnan(working_set[:, io]).sum(0) == 0)
    assert all(np.isnan(kernel.working_data[:, niv]).sum(0) > 0)
    assert all(np.isnan(working_set[:, niv]).sum(0) > 0)

    # Test the ability to tune parameters with custom setup.
    # Variable 1 gets its own fixed value / search range; every other
    # variable falls back to the globally supplied ones.
    optimization_steps = 2
    op, ol = kernel.tune_parameters(dataset=0,
                                    optimization_steps=optimization_steps,
                                    variable_parameters={
                                        1: {
                                            "bagging_fraction": 0.9,
                                            "feature_fraction_bynode":
                                            (0.85, 0.9)
                                        }
                                    },
                                    bagging_fraction=0.8,
                                    feature_fraction_bynode=(0.70, 0.75),
                                    verbose=True)

    assert op[1]["bagging_fraction"] == 0.9
    assert op[2]["bagging_fraction"] == 0.8
    assert (op[1]["feature_fraction_bynode"] >=
            0.85) and (op[1]["feature_fraction_bynode"] <= 0.9)
    assert (op[2]["feature_fraction_bynode"] >=
            0.70) and (op[2]["feature_fraction_bynode"] <= 0.75)

    # Feed the tuned parameters back into mice and confirm they reach the
    # models trained in that extra iteration.
    kernel.mice(1, variable_parameters=op, verbose=True)
    model_2_params = kernel.get_model(0, 2, nround + 1).params
    model_1_params = kernel.get_model(0, 1, nround + 1).params
    assert model_2_params["bagging_fraction"] == 0.8
    assert model_1_params["bagging_fraction"] == 0.9
    assert (model_2_params["feature_fraction_bynode"] >=
            0.70) and (model_2_params["feature_fraction_bynode"] <= 0.75)
    assert (model_1_params["feature_fraction_bynode"] >=
            0.85) and (model_1_params["feature_fraction_bynode"] <= 0.9)

    new_imp_dat = kernel.impute_new_data(new_data=new_data,
                                         copy_data=True,
                                         verbose=True)

    # Not in place
    new_imp_complete = new_imp_dat.complete_data(0, inplace=False)
    assert all(np.isnan(new_imp_complete[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_imp_complete[:, nivs]).sum(0) > 0)

    # Should have no effect on working_data or original data
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)
    assert all(np.isnan(new_data[:, list(vs)]).sum(0) > 0)

    # complete data in place
    new_imp_dat.complete_data(0, inplace=True)
    assert all(np.isnan(new_imp_dat.working_data[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_data[:, nivs]).sum(0) > 0)

    # Alter in place: with copy_data=False the imputed object works on the
    # caller's array directly.
    new_imp_dat = kernel.impute_new_data(new_data=new_data,
                                         copy_data=False,
                                         verbose=True)

    # Before completion, nan's should still exist in data:
    assert all(np.isnan(new_data).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)

    # Complete data not in place
    new_imp_complete = new_imp_dat.complete_data(0, inplace=False)
    assert all(np.isnan(new_imp_complete[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_imp_complete[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_data).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)

    # Complete data in place
    new_imp_dat.complete_data(0, inplace=True)
    assert all(np.isnan(new_data[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_data[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_imp_dat.working_data[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data[:, list(vs)]).sum(0) == 0)

    # Plotting on multiple imputed dataset
    new_imp_dat.plot_mean_convergence()
    close()
    new_imp_dat.plot_imputed_distributions()
    close()

    # Plotting on Multiple Imputed Kernel
    kernel.plot_feature_importance(0)
    close()
    kernel.plot_mean_convergence()
    close()
    kernel.plot_imputed_distributions()
    close()
# Example #6
# Build the fixtures used below from the module-level ``iris`` frame.
# NOTE(review): ampute_data presumably blanks out roughly ``perc`` of the
# values -- confirm against the miceforest documentation.
iris_amp = mf.utils.ampute_data(iris, perc=0.20)
# Draw every entry of iris.index exactly once (size == number of rows,
# replace=False), select those rows positionally with .iloc, and reset the
# index. NOTE(review): assumes iris has a default 0..n-1 index so that its
# labels are valid positions for .iloc -- confirm where ``iris`` is loaded.
iris_new = iris.iloc[random_state.choice(iris.index,
                                         iris.shape[0],
                                         replace=False)].reset_index(drop=True)
# Same amputation call as above, applied to the shuffled copy.
iris_new_amp = mf.utils.ampute_data(iris_new, perc=0.20)


def mse(x, y):
    """Return the mean of the squared element-wise differences ``x - y``."""
    residual = x - y
    squared_error = residual ** 2
    return np.mean(squared_error)


# Number of mice iterations passed to kernel_sm2.mice() below.
iterations = 2

# Kernel built WITHOUT an explicit save_models argument, so the library
# default applies. NOTE(review): the "sm2" suffix suggests that default is
# save_models=2 -- confirm against the ImputationKernel signature.
# Both kernels in this section are given the same ``random_state`` object.
kernel_sm2 = mf.ImputationKernel(iris_amp,
                                 datasets=1,
                                 data_subset=0.75,
                                 mean_match_candidates=3,
                                 random_state=random_state)
# Extra keyword arguments are model parameters supplied to mice().
kernel_sm2.mice(iterations,
                boosting='random_forest',
                num_iterations=100,
                num_leaves=31)

# Same configuration as kernel_sm2, except save_models=1 is set explicitly.
kernel_sm1 = mf.ImputationKernel(iris_amp,
                                 datasets=1,
                                 data_subset=0.75,
                                 mean_match_candidates=3,
                                 save_models=1,
                                 random_state=random_state)
kernel_sm1.mice(iterations,
                boosting='random_forest',