def test_defaults_pandas():
    """Smoke-test ImputationKernel on the pandas fixture with mostly default settings.

    Runs mice on two kernels, appends one to the other, continues mice on the
    merged kernel, completes the data in place, checks the objective of three
    saved models, and finally imputes a 10-row slice as new data.
    """
    # First ten rows act as "new" data for impute_new_data later on.
    holdout = boston_amp.loc[range(10), :].copy()

    main_kernel = mf.ImputationKernel(
        data=boston_amp,
        datasets=2,
        mean_match_scheme=mean_match_scheme_fast_cat,
        save_models=1,
        initialization="empty",
    )
    main_kernel.mice(iterations=2, compile_candidates=True, verbose=True)

    extra_kernel = mf.ImputationKernel(
        data=boston_amp,
        datasets=1,
        mean_match_scheme=mean_match_scheme_fast_cat,
    )
    extra_kernel.mice(iterations=2)

    # Merge the second kernel into the first, then rebuild candidate predictions.
    main_kernel.append(extra_kernel)
    main_kernel.compile_candidate_preds()

    # mice must still run on the merged kernel (this is iteration 3).
    main_kernel.mice(1)
    main_kernel.complete_data(0, inplace=True)

    remaining_missing = main_kernel.working_data.isnull().sum()
    assert (remaining_missing == 0).all()

    # (variable, expected objective) for dataset 0 at iteration 3.
    expected_objectives = (
        (0, 'regression'),
        (3, 'binary'),
        (8, 'multiclass'),
    )
    for variable, objective in expected_objectives:
        model = main_kernel.get_model(0, variable, 3)
        assert model.params['objective'] == objective

    # The fixture itself must still contain missing values in every column.
    fixture_missing = boston_amp.isnull().sum()
    assert (fixture_missing > 0).all()

    imputed_new = main_kernel.impute_new_data(holdout)
    imputed_new.complete_data(2, inplace=True)
    assert (imputed_new.working_data.isnull().sum(0) == 0).all()

    # The slice handed to impute_new_data must keep its missing values.
    total_holdout_missing = holdout.isnull().sum().sum()
    assert total_holdout_missing > 0
def test_defaults_numpy():
    """Smoke-test ImputationKernel on numpy input with mostly default settings.

    The categorical columns "3" and "8" of the pandas fixture are converted to
    integer codes (missing -> NaN), the frame is turned into a numpy array, and
    the kernel is checked for correct copy / in-place semantics of
    ``complete_data`` and ``impute_new_data``. The elapsed wall time is printed.
    """
    working_set = boston_amp.copy()

    # Category codes use -1 for missing; turn those into NaN.
    # Assign the result back rather than calling replace(inplace=True) on
    # a column selection: the chained in-place form does not update the frame
    # under pandas Copy-on-Write. `np.nan` is used because the `np.NaN` alias
    # was removed in NumPy 2.0.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)

    new_data = working_set.loc[range(10), :].copy()
    working_set = working_set.values
    new_data = new_data.values

    s = datetime.now()

    kernel = mf.ImputationKernel(
        data=working_set,
        datasets=3,
        categorical_feature=[3, 8],
        mean_match_scheme=mean_match_scheme_fast_cat,
    )
    kernel.mice(iterations=1, verbose=True)
    kernel.compile_candidate_preds()

    # Complete data with copy.
    comp_dat = kernel.complete_data(0, inplace=False)

    # We didn't complete data in place. Make sure we created
    # a copy, and did not affect internal data or original data.
    assert all(np.isnan(comp_dat).sum(0) == 0)
    assert all(np.isnan(kernel.working_data).sum(0) > 0)
    assert all(np.isnan(working_set).sum(0) > 0)

    # Complete data in place
    kernel.complete_data(0, inplace=True)

    # We completed data in place. Make sure we only affected
    # the kernel.working_data and not the original data.
    assert all(np.isnan(kernel.working_data).sum(0) == 0)
    assert all(np.isnan(working_set).sum(0) > 0)

    imp_ds = kernel.impute_new_data(new_data)
    imp_ds.complete_data(0, inplace=True)
    assert all(np.isnan(imp_ds.working_data).sum(0) == 0)
    # The array passed as new data must keep its missing values.
    assert np.isnan(new_data).sum() > 0

    print(datetime.now() - s)
def test_pandas_reproducibility():
    """Check that random_state and record-level seeds make imputations reproducible.

    Two kernels built with the same ``random_state`` must agree after
    initialization and after mice. Imputing new data with a
    ``random_seed_array`` must give the same value for a given record when the
    rows are reordered or subset with their seeds, and must differ when the
    rows are reordered but the seeds are not.

    Relies on ``rows``, ``random_state`` and ``random_seed_array`` being
    defined at module level outside this function.
    """
    datasets = 2
    kernel = mf.ImputationKernel(
        data=boston_amp,
        datasets=datasets,
        initialization="random",
        save_models=2,
        random_state=2,
    )
    kernel2 = mf.ImputationKernel(
        data=boston_amp,
        datasets=datasets,
        initialization="random",
        save_models=2,
        random_state=2,
    )

    assert kernel.complete_data(0).equals(kernel2.complete_data(0)), (
        "random_state initialization failed to be deterministic")

    # Run mice for 2 iterations
    kernel.mice(2)
    kernel2.mice(2)

    assert kernel.complete_data(0).equals(kernel2.complete_data(0)), (
        "random_state after mice() failed to be deterministic")

    # Baseline: impute the original data, one seed per record.
    kernel_imputed_as_new = kernel.impute_new_data(
        boston_amp, random_state=4, random_seed_array=random_seed_array)

    # Generate and impute new data as a reordering of original
    new_order = np.arange(rows)
    random_state.shuffle(new_order)
    new_data = boston_amp.loc[new_order]
    new_seeds = random_seed_array[new_order]
    new_imputed = kernel.impute_new_data(
        new_data, random_state=4, random_seed_array=new_seeds)

    # Expect deterministic imputations at the record level, since seeds were passed.
    # Every dataset is compared (the loop index selects the dataset).
    for i in range(datasets):
        reordered_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_order]
        new_data_completed = new_imputed.complete_data(dataset=i)
        assert (reordered_kernel_completed == new_data_completed).all().all(), (
            "Seeds did not cause deterministic imputations when data was reordered."
        )

    # Generate and impute new data as a subset of original
    new_ind = [0, 1, 4, 7, 8, 10]
    new_data = boston_amp.loc[new_ind]
    new_seeds = random_seed_array[new_ind]
    new_imputed = kernel.impute_new_data(
        new_data, random_state=4, random_seed_array=new_seeds)

    # Expect deterministic imputations at the record level, since seeds were passed.
    for i in range(datasets):
        subset_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_ind]
        new_data_completed = new_imputed.complete_data(dataset=i)
        assert (subset_kernel_completed == new_data_completed).all().all(), (
            "Seeds did not cause deterministic imputations when data was subset."
        )

    # Reorder the data again, but pass the seeds in their ORIGINAL order,
    # so records no longer line up with the seeds used for the baseline.
    new_order = np.arange(rows)
    random_state.shuffle(new_order)
    new_data = boston_amp.loc[new_order]
    new_imputed = kernel.impute_new_data(
        new_data, random_state=4, random_seed_array=random_seed_array)

    # Expect at least some imputations to differ, since records got different seeds.
    for i in range(datasets):
        reordered_kernel_completed = kernel_imputed_as_new.complete_data(
            dataset=i).loc[new_order]
        new_data_completed = new_imputed.complete_data(dataset=i)
        assert not (reordered_kernel_completed == new_data_completed).all().all(), (
            "Different seeds caused deterministic imputations for all rows / columns."
        )
def test_complex_pandas():
    """Exercise ImputationKernel on pandas input with heavily customized settings.

    Covers a custom variable schema, imputation order, per-variable mean match
    candidates and data subsets, saving / loading a kernel, appending kernels,
    parameter aliases, parameter tuning, imputing new data, and the plotting
    helpers.
    """
    import os

    working_set = boston_amp.copy()

    # Switch our category columns to integer codes.
    # Replace -1 with NaN or lightgbm will complain.
    # The result is assigned back rather than using replace(inplace=True) on a
    # column selection, which does not update the frame under pandas
    # Copy-on-Write. `np.nan` is used because NumPy 2.0 removed `np.NaN`.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)

    new_data = working_set.loc[range(10), :].copy()

    # Customize everything.
    vs = {
        "1": ["2", "3", "4", "5"],
        "2": ["6", "7"],
        "3": ["1", "2", "8"],
        "4": ["8", "9", "10"],
    }
    mmc = {"1": 4, "2": 0.01, "3": 0}
    ds = {"2": 100, "3": 0.5}
    io = ["2", "3", "1"]

    imputed_var_names = io
    non_imputed_var_names = [str(x) for x in range(13) if str(x) not in io]

    kernel = mf.ImputationKernel(
        data=working_set,
        datasets=2,
        variable_schema=vs,
        imputation_order=io,
        train_nonmissing=True,
        mean_match_candidates=mmc,
        data_subset=ds,
        categorical_feature=[3, 8],
        copy_data=False,
    )
    kernel2 = mf.ImputationKernel(
        data=working_set,
        datasets=1,
        variable_schema=vs,
        imputation_order=io,
        train_nonmissing=True,
        mean_match_candidates=mmc,
        data_subset=ds,
        categorical_feature=[3, 8],
        copy_data=False,
    )

    # Round-trip kernel2 through disk. mkstemp returns an open OS-level file
    # descriptor and leaves cleanup to the caller, so close it right away and
    # remove the file once the kernel has been reloaded.
    fd, filename = mkstemp()
    os.close(fd)
    try:
        kernel2.save_kernel(filename)
        kernel2 = mf.utils.load_kernel(filename)
    finally:
        os.remove(filename)

    assert kernel.mean_match_candidates == {
        1: 4,
        2: 3,
        3: 0,
        4: 5
    }, "mean_match_candidates initialization failed"
    assert kernel.data_subset == {
        1: 380,
        2: 100,
        3: 190,
        4: 380
    }, "mean_match_subset initialization failed"
    assert kernel.iteration_count() == 0, "iteration initialization failed"
    assert kernel.categorical_variables == [
        3, 8
    ], "categorical recognition failed."

    # This section tests many things:
    # After saving / loading a kernel, and appending 2 kernels together:
    #   mice can continue
    #   Aliases are fixed, even when different aliases are passed
    #   variable specific parameters supersede globally specified parameters
    #   The parameters come through the actual model
    nround = 2
    kernel.mice(nround - 1,
                compile_candidates=True,
                variable_parameters={"1": {"n_iter": 15}},
                num_trees=10,
                verbose=True)
    kernel.compile_candidate_preds()
    kernel2.mice(nround - 1,
                 variable_parameters={"1": {"n_estimators": 15}},
                 n_estimators=10,
                 verbose=True)
    kernel.append(kernel2)
    kernel.compile_candidate_preds()
    assert kernel.get_model(0, 1, nround - 1).num_trees() == 15
    assert kernel.get_model(0, 2, nround - 1).num_trees() == 10

    kernel.mice(1,
                variable_parameters={1: {"n_iter": 15}},
                num_trees=10,
                verbose=True)
    assert kernel.iteration_count() == nround, "iteration counting is incorrect."
    assert kernel.get_model(0, 1, nround).num_trees() == 15
    assert kernel.get_model(0, 2, nround).num_trees() == 10

    # Make sure we only impute variables listed in imputation_order
    compdat = kernel.complete_data(0)
    assert all(compdat[imputed_var_names].isnull().sum() == 0)
    assert all(compdat[non_imputed_var_names].isnull().sum() > 0)

    # Test the ability to tune parameters with custom setup
    optimization_steps = 2
    op, _ = kernel.tune_parameters(
        dataset=0,
        optimization_steps=optimization_steps,
        variable_parameters={
            1: {
                "bagging_fraction": 0.9,
                "feature_fraction_bynode": (0.85, 0.9)
            }
        },
        bagging_fraction=0.8,
        feature_fraction_bynode=(0.70, 0.75),
        verbose=True)

    # Variable 1 gets its own settings; variable 2 falls back to the global ones.
    assert op[1]["bagging_fraction"] == 0.9
    assert op[2]["bagging_fraction"] == 0.8
    assert 0.85 <= op[1]["feature_fraction_bynode"] <= 0.9
    assert 0.70 <= op[2]["feature_fraction_bynode"] <= 0.75

    # The tuned parameters must reach the models trained in the next iteration.
    kernel.mice(1, variable_parameters=op, verbose=True)
    model_2_params = kernel.get_model(0, 2, nround + 1).params
    model_1_params = kernel.get_model(0, 1, nround + 1).params
    assert model_2_params["bagging_fraction"] == 0.8
    assert model_1_params["bagging_fraction"] == 0.9
    assert 0.70 <= model_2_params["feature_fraction_bynode"] <= 0.75
    assert 0.85 <= model_1_params["feature_fraction_bynode"] <= 0.9

    new_imp_dat = kernel.impute_new_data(new_data=new_data, verbose=True)
    new_imp_complete = new_imp_dat.complete_data(0)
    # Bare lookups: these only verify that the keys exist (they raise otherwise).
    new_imp_dat.na_where[2]
    new_imp_dat.imputation_values[0, 2, 3]
    assert all(new_imp_complete[["1", "2", "3", "4"]].isnull().sum() == 0)

    # Plotting on multiple imputed dataset
    new_imp_dat.plot_mean_convergence()
    close()
    new_imp_dat.plot_imputed_distributions()
    close()

    # Plotting on Multiple Imputed Kernel
    kernel.plot_feature_importance(0)
    close()
    kernel.plot_mean_convergence()
    close()
    kernel.plot_imputed_distributions()
    close()
def test_complex_numpy():
    """Exercise ImputationKernel on numpy input with heavily customized settings.

    Covers a custom variable schema, imputation order, per-variable mean match
    candidates and data subsets, saving / loading a kernel, appending kernels,
    parameter aliases, parameter tuning, copy vs. in-place semantics of
    ``complete_data`` / ``impute_new_data`` (with ``copy_data`` True and False),
    and the plotting helpers.
    """
    import os

    working_set = boston_amp.copy()

    # Switch our category columns to integer codes.
    # Replace -1 with NaN or lightgbm will complain.
    # The result is assigned back rather than using replace(inplace=True) on a
    # column selection, which does not update the frame under pandas
    # Copy-on-Write. `np.nan` is used because NumPy 2.0 removed `np.NaN`.
    for col in ("3", "8"):
        working_set[col] = working_set[col].cat.codes.replace(-1, np.nan)

    new_data = working_set.loc[range(100), :].copy()
    working_set = working_set.values
    new_data = new_data.values

    # Specify that models should be built for variables 1, 2, 3, 4
    vs = {1: [2, 3, 4, 5], 2: [6, 7], 3: [1, 2, 8], 4: [8, 9, 10]}
    mmc = {1: 4, 2: 0.01, 3: 0}
    ds = {2: 100, 3: 0.5}
    # Only variables 1, 2, 3 should be imputed using mice.
    io = [2, 3, 1]
    # Column indices outside imputation_order / outside the variable schema.
    niv = np.setdiff1d(np.arange(working_set.shape[1]), io)
    nivs = np.setdiff1d(np.arange(working_set.shape[1]), list(vs))

    kernel = mf.ImputationKernel(
        data=working_set,
        datasets=2,
        variable_schema=vs,
        imputation_order=io,
        train_nonmissing=True,
        mean_match_candidates=mmc,
        data_subset=ds,
        mean_match_scheme=mean_match_scheme_fast_cat,
        categorical_feature=[3, 8],
        copy_data=False,
    )
    kernel2 = mf.ImputationKernel(
        data=working_set,
        datasets=1,
        variable_schema=vs,
        imputation_order=io,
        train_nonmissing=True,
        mean_match_candidates=mmc,
        data_subset=ds,
        mean_match_scheme=mean_match_scheme_fast_cat,
        categorical_feature=[3, 8],
        copy_data=False,
    )

    # Round-trip kernel2 through disk. mkstemp returns an open OS-level file
    # descriptor and leaves cleanup to the caller, so close it right away and
    # remove the file once the kernel has been reloaded.
    fd, filename = mkstemp()
    os.close(fd)
    try:
        kernel2.save_kernel(filename)
        kernel2 = mf.utils.load_kernel(filename)
    finally:
        os.remove(filename)

    assert kernel.mean_match_candidates == {
        2: 3,
        3: 0,
        1: 4,
        4: 5
    }, "mean_match_candidates initialization failed"
    assert kernel.data_subset == {
        2: 100,
        3: 190,
        1: 380,
        4: 380
    }, "mean_match_subset initialization failed"
    assert kernel.iteration_count() == 0, "iteration initialization failed"
    assert kernel.categorical_variables == [
        3, 8
    ], "categorical recognition failed."

    nround = 2
    kernel.mice(nround - 1,
                variable_parameters={1: {"n_iter": 15}},
                num_trees=10,
                verbose=True)
    kernel.compile_candidate_preds()
    kernel2.mice(nround - 1,
                 variable_parameters={1: {"n_iter": 15}},
                 num_trees=10,
                 verbose=True)
    kernel.append(kernel2)
    kernel.compile_candidate_preds()
    assert kernel.get_model(0, 1, nround - 1).num_trees() == 15
    assert kernel.get_model(0, 2, nround - 1).num_trees() == 10

    # Same settings passed through a different parameter alias.
    kernel.mice(1,
                variable_parameters={1: {"n_estimators": 15}},
                n_estimators=10,
                verbose=True)
    assert kernel.iteration_count() == nround, "iteration counting is incorrect."
    assert kernel.get_model(0, 1, nround).num_trees() == 15
    assert kernel.get_model(0, 2, nround).num_trees() == 10

    # Complete data with copy. Make sure only correct datasets and variables were affected.
    compdat = kernel.complete_data(0, inplace=False)
    assert all(np.isnan(compdat[:, io]).sum(0) == 0)
    assert all(np.isnan(compdat[:, niv]).sum(0) > 0)

    # Should have no effect on working_data
    assert all(np.isnan(kernel.working_data).sum(0) > 0)

    # Should have no effect on working_set
    assert all(np.isnan(working_set).sum(0) > 0)

    # Now complete the data in place
    kernel.complete_data(0, inplace=True)

    # Should have an effect on working_data and on the original data
    # (the kernel was built with copy_data=False)
    assert all(np.isnan(kernel.working_data[:, io]).sum(0) == 0)
    assert all(np.isnan(working_set[:, io]).sum(0) == 0)
    assert all(np.isnan(kernel.working_data[:, niv]).sum(0) > 0)
    assert all(np.isnan(working_set[:, niv]).sum(0) > 0)

    # Test the ability to tune parameters with custom setup
    optimization_steps = 2
    op, _ = kernel.tune_parameters(
        dataset=0,
        optimization_steps=optimization_steps,
        variable_parameters={
            1: {
                "bagging_fraction": 0.9,
                "feature_fraction_bynode": (0.85, 0.9)
            }
        },
        bagging_fraction=0.8,
        feature_fraction_bynode=(0.70, 0.75),
        verbose=True)

    # Variable 1 gets its own settings; variable 2 falls back to the global ones.
    assert op[1]["bagging_fraction"] == 0.9
    assert op[2]["bagging_fraction"] == 0.8
    assert 0.85 <= op[1]["feature_fraction_bynode"] <= 0.9
    assert 0.70 <= op[2]["feature_fraction_bynode"] <= 0.75

    # The tuned parameters must reach the models trained in the next iteration.
    kernel.mice(1, variable_parameters=op, verbose=True)
    model_2_params = kernel.get_model(0, 2, nround + 1).params
    model_1_params = kernel.get_model(0, 1, nround + 1).params
    assert model_2_params["bagging_fraction"] == 0.8
    assert model_1_params["bagging_fraction"] == 0.9
    assert 0.70 <= model_2_params["feature_fraction_bynode"] <= 0.75
    assert 0.85 <= model_1_params["feature_fraction_bynode"] <= 0.9

    new_imp_dat = kernel.impute_new_data(new_data=new_data,
                                         copy_data=True,
                                         verbose=True)

    # Not in place
    new_imp_complete = new_imp_dat.complete_data(0, inplace=False)
    assert all(np.isnan(new_imp_complete[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_imp_complete[:, nivs]).sum(0) > 0)

    # Should have no effect on working_data or original data
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)
    assert all(np.isnan(new_data[:, list(vs)]).sum(0) > 0)

    # complete data in place
    new_imp_dat.complete_data(0, inplace=True)
    assert all(np.isnan(new_imp_dat.working_data[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_data[:, nivs]).sum(0) > 0)

    # Alter in place
    new_imp_dat = kernel.impute_new_data(new_data=new_data,
                                         copy_data=False,
                                         verbose=True)

    # Before completion, nan's should still exist in data:
    assert all(np.isnan(new_data).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)

    # Complete data not in place
    new_imp_complete = new_imp_dat.complete_data(0, inplace=False)
    assert all(np.isnan(new_imp_complete[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_imp_complete[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_data).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data).sum(0) > 0)

    # Complete data in place
    new_imp_dat.complete_data(0, inplace=True)
    assert all(np.isnan(new_data[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_data[:, list(vs)]).sum(0) == 0)
    assert all(np.isnan(new_imp_dat.working_data[:, nivs]).sum(0) > 0)
    assert all(np.isnan(new_imp_dat.working_data[:, list(vs)]).sum(0) == 0)

    # Plotting on multiple imputed dataset
    new_imp_dat.plot_mean_convergence()
    close()
    new_imp_dat.plot_imputed_distributions()
    close()

    # Plotting on Multiple Imputed Kernel
    kernel.plot_feature_importance(0)
    close()
    kernel.plot_mean_convergence()
    close()
    kernel.plot_imputed_distributions()
    close()
iris_amp = mf.utils.ampute_data(iris, perc=0.20) iris_new = iris.iloc[random_state.choice(iris.index, iris.shape[0], replace=False)].reset_index(drop=True) iris_new_amp = mf.utils.ampute_data(iris_new, perc=0.20) def mse(x, y): return np.mean((x - y)**2) iterations = 2 kernel_sm2 = mf.ImputationKernel(iris_amp, datasets=1, data_subset=0.75, mean_match_candidates=3, random_state=random_state) kernel_sm2.mice(iterations, boosting='random_forest', num_iterations=100, num_leaves=31) kernel_sm1 = mf.ImputationKernel(iris_amp, datasets=1, data_subset=0.75, mean_match_candidates=3, save_models=1, random_state=random_state) kernel_sm1.mice(iterations, boosting='random_forest',