def test_outcome_name_required():
    """Every treatment type must raise if fit_transform is called without an outcome."""
    numpy.random.seed(235)
    frame = pandas.DataFrame({"x": ["1", "1", "1", "2", "2", "2"]})
    outcome = [1, 2, 3, 4, 5, 6]
    # the three outcome-treatment flavors share the same contract
    treatments = [
        vtreat.NumericOutcomeTreatment(
            params=vtreat.vtreat_parameters({"filter_to_recommended": False})),
        vtreat.BinomialOutcomeTreatment(
            params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
            outcome_target=3,
        ),
        vtreat.vtreat_api.MultinomialOutcomeTreatment(
            params=vtreat.vtreat_parameters({"filter_to_recommended": False})),
    ]
    for treatment in treatments:
        # works when the outcome is supplied ...
        treatment.fit_transform(frame, outcome)
        # ... and raises when it is not
        with pytest.raises(Exception):
            treatment.fit_transform(frame)
def test_classification():
    """Binomial treatment: result-restriction get/set interacts with recommendation filtering."""
    numpy.random.seed(46546)

    def make_data(nrows):
        # y is a noisy function of x; xc is a categorical recoding of y,
        # x2 is a pure-noise column used to observe filtering behavior
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0", "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)
    vars = [c for c in d.columns if c not in set(["y", "yc"])]
    d_test = make_data(100)
    # case 1: no recommendation filtering -- every variable passes through
    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    d_prepared = transform.fit_transform(d[vars], d["yc"])
    # show vars are under control
    assert transform.get_result_restriction() is None
    assert "x2" in set(d_prepared.columns)
    # an explicit restriction limits transform() output to the named columns
    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])
    # case 2: filtering on -- the noise column x2 is filtered out by default
    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": True}),
    )
    d_prepared = transform.fit_transform(d[vars], d["yc"])
    assert transform.get_result_restriction() is not None
    assert "x2" not in transform.get_result_restriction()
    assert "x2" not in set(d_prepared.columns)
    # manually setting the restriction can re-admit a filtered column
    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])
def test_nan_inf():
    """Non-finite numeric inputs (nan, +/-inf, None) are replaced and flagged."""
    numpy.random.seed(235)
    frame = pandas.DataFrame({
        "x": [1.0, numpy.nan, numpy.inf, -numpy.inf, None, 0],
        "y": [1, 2, 3, 4, 5, 6]
    })
    treatment = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    treated = treatment.fit_transform(frame, frame["y"])
    # every produced column must be numeric and free of bad values
    for col in treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(treated[col])
        assert numpy.sum(vtreat.util.is_bad(treated[col])) == 0
    # bad entries are imputed (mean of the good values of x is 0.5)
    # and marked in the companion x_is_bad indicator column
    expect = pandas.DataFrame({
        "x": [1.0, 0.5, 0.5, 0.5, 0.5, 0],
        "x_is_bad": [0, 1, 1, 1, 1, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })
    for col in expect.columns:
        want = numpy.asarray(expect[col])
        got = numpy.asarray(treated[col])
        assert numpy.max(numpy.abs(want - got)) < 1.0e-6
def test_r1_issue():
    """Regression test: a mixed-type frame with reindex-introduced NaNs used to raise."""
    plan = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    # from https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
    base = pandas.DataFrame(
        numpy.random.randn(5, 3),
        index=["a", "c", "e", "f", "h"],
        columns=["one", "two", "three"],
    )
    base["four"] = "foo"          # constant string column
    base["five"] = base["one"] > 0  # boolean column
    # reindexing inserts all-NaN rows for the missing labels
    frame = base.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
    frame.reset_index(inplace=True, drop=True)
    frame["y"] = range(frame.shape[0])
    frame.loc[3, "four"] = "blog"
    frame["const"] = 1
    vtreat.util.is_bad(frame["five"])
    prepped = plan.fit_transform(frame, frame["y"])  # used to raise an exception
    # all prepared columns must be numeric with no bad values remaining
    for col in prepped.columns:
        assert vtreat.util.can_convert_v_to_numeric(prepped[col])
        assert numpy.sum(vtreat.util.is_bad(prepped[col])) == 0
def test_db_adapter_monster():
    """Fit a numeric treatment on synthetic categorical data, then confirm the
    exported description matrix replays the same transform as a data_algebra
    pipeline."""
    outcome_name = "y"
    row_id_name = 'row_id'
    n_vars = 5

    def mk_data(n_rows: int = 100):
        # Build n_vars two-level categorical columns; each shifts y by +/- step.
        step = 1 / np.sqrt(n_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_vars):
            vname = f"v_{i}"
            v = np.random.choice(["a", "b"], replace=True, size=n_rows)
            y = y + np.where(v == "a", step, -step)
            cols[vname] = v
        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        cols[row_id_name] = range(n_rows)
        d = pd.DataFrame(cols)
        return d, vars

    d, vars = mk_data(100)
    d_app, _ = mk_data(10)
    cols_to_copy = [outcome_name, row_id_name]
    # NOTE(review): columns is computed but never used below
    columns = vars + cols_to_copy
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)
    # export the fitted treatment as a data frame of transform rules
    transform_as_data = treatment.description_matrix()
    # transform_as_data.to_csv('example_transform.csv', index=False)
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=[row_id_name],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    # replaying the pipeline on the application data must match transform()
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    assert numpy.all([c in d_app_res.columns for c in cols_to_copy])
def test_pipeparams():
    """vtreat transform participates in sklearn Pipeline get_params/set_params.

    Checks that the transform's vtreat parameters are visible both directly and
    through the pipeline's prefixed names, and that ``set_params`` routes back
    to the transform.
    """
    import warnings

    numpy.random.seed(2019)

    def make_data(nrows):
        # y is a noisy function of x; xc recodes y; x2 is a pure-noise column
        d = pd.DataFrame({"x": 5 * numpy.random.normal(size=nrows)})
        d["y"] = numpy.sin(d["x"]) + 0.1 * numpy.random.normal(size=nrows)
        d.loc[numpy.arange(3, 10), "x"] = numpy.nan  # introduce a nan level
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = np.random.normal(size=nrows)
        d.loc[d["xc"] == "level_-1.0", "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    df = make_data(500)
    df = df.drop(columns=["y"])
    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,
        params=vtreat.vtreat_parameters({"sparse_indicators": False}),
    )
    clf = Pipeline(steps=[
        ("preprocessor", transform),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ])
    X, y = df, df.pop("yc")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    # transform exposes its vtreat parameters through get_params
    t_params = transform.get_params()
    assert t_params["indicator_min_fraction"] is not None
    assert transform.get_params()["indicator_min_fraction"] != 0
    # ... and through the pipeline's "preprocessor__" prefixed names
    p_params = clf.get_params()
    assert p_params["preprocessor__indicator_min_fraction"] is not None
    # set_params routes through the pipeline prefix back to the transform
    clf.set_params(preprocessor__indicator_min_fraction=0)
    assert transform.get_params()["indicator_min_fraction"] == 0
    # refit must emit no warning or error; pytest.warns(None) was deprecated in
    # pytest 7.0 and removed in 8.0, so record warnings with the stdlib instead
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        clf.fit(X_train, y_train)
    assert not record
def test_pipeparams_2():
    """Duplicate of the pipeline-parameter test kept with its original quoting style.

    NOTE: this function was originally also named ``test_pipeparams``, which
    shadowed the earlier definition so pytest collected only one of the two
    tests; renamed so both are discovered and run.
    """
    import warnings

    numpy.random.seed(2019)

    def make_data(nrows):
        d = pd.DataFrame({'x': 5 * numpy.random.normal(size=nrows)})
        d['y'] = numpy.sin(d['x']) + 0.1 * numpy.random.normal(size=nrows)
        d.loc[numpy.arange(3, 10), 'x'] = numpy.nan  # introduce a nan level
        d['xc'] = ['level_' + str(5 * numpy.round(yi / 5, 1)) for yi in d['y']]
        d['x2'] = np.random.normal(size=nrows)
        d.loc[d['xc'] == 'level_-1.0', 'xc'] = numpy.nan  # introduce a nan level
        d['yc'] = d['y'] > 0.5
        return d

    df = make_data(500)
    df = df.drop(columns=['y'])
    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,
        params=vtreat.vtreat_parameters({'sparse_indicators': False}))
    clf = Pipeline(steps=[
        ('preprocessor', transform),
        ('classifier', LogisticRegression(solver='lbfgs'))]
    )
    X, y = df, df.pop('yc')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    # parameters visible directly on the transform ...
    t_params = transform.get_params()
    assert t_params['indicator_min_fraction'] is not None
    assert transform.get_params()['indicator_min_fraction'] != 0
    # ... and via the pipeline's prefixed names
    p_params = clf.get_params()
    assert p_params['preprocessor__indicator_min_fraction'] is not None
    # set_params routes through the prefix back to the transform
    clf.set_params(preprocessor__indicator_min_fraction=0)
    assert transform.get_params()['indicator_min_fraction'] == 0
    # refit must emit no warning or error; pytest.warns(None) is
    # deprecated/removed in pytest >= 7, use stdlib warning recording
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        clf.fit(X_train, y_train)
    assert not record
def test_unexpected_nan():
    """Confirm NaN processing is correct, even when no NaN was seen in training data."""
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 2, 3, 4, 5, 6]})
    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    d_treated = transform.fit_transform(d, d["y"])
    # clean training data yields exactly one treated variable: the clean copy of x
    assert transform.score_frame_.shape[0] == 1
    assert "x" in set(transform.score_frame_["variable"])
    # application data introduces missing values never seen in training
    # (numpy.NAN alias was removed in NumPy 2.0; numpy.nan is the canonical spelling)
    d_app = pandas.DataFrame({"x": [1, 2, numpy.nan, 4, None, 6]})
    assert numpy.any(numpy.isnan(d_app["x"]))
    d_app_treated = transform.transform(d_app)
    # the treatment must still impute them away
    assert not numpy.any(numpy.isnan(d_app_treated["x"]))
def test_db_adapter_1_cdata():
    """Fit a binomial treatment, export it as data, and replay it both as a
    data_algebra pipeline in memory and as SQL in an in-memory SQLite DB."""
    # Example from:
    # https://github.com/WinVector/pyvtreat/blob/main/Examples/Database/vtreat_db_adapter.ipynb
    # Data from:
    # https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
    # data_all = pd.read_csv("diabetes_head.csv")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data_all = pd.read_csv(os.path.join(dir_path, "diabetes_head.csv"))
    n = data_all.shape[0]
    data_all["orig_index"] = range(n)
    # hold out the last 5 rows as "application" data
    d_train = data_all.loc[range(n - 5), :].reset_index(inplace=False,
                                                        drop=True)
    d_app = data_all.loc[range(n - 5, n)].reset_index(inplace=False, drop=True)
    #%%
    outcome_name = "readmitted"
    cols_to_copy = ["orig_index", "encounter_id", "patient_nbr"
                    ] + [outcome_name]
    vars = ["time_in_hospital", "weight"]
    columns = vars + cols_to_copy
    # d_train.loc[:, columns]
    #%%
    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d_train.loc[:, columns])
    d_app_treated = treatment.transform(d_app.loc[:, columns])
    # d_app_treated
    #%%
    # export the fitted transform as a data frame of rules
    transform_as_data = treatment.description_matrix()
    # transform_as_data
    #%%
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d_app.loc[:, columns]),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=['orig_index'],
    )
    # print(ops)
    #%%
    # in-memory replay of the pipeline must match treatment.transform()
    transformed = ops.eval({
        "d_app": d_app.loc[:, columns],
        "transform_as_data": transform_as_data
    })
    # transformed
    #%%
    assert data_algebra.test_util.equivalent_frames(transformed, d_app_treated)
    #%%
    # same pipeline, now executed as SQL in an in-memory SQLite database
    db_handle = data_algebra.SQLite.example_handle()
    sql = db_handle.to_sql(ops)
    assert isinstance(sql, str)
    # print(sql)
    #%%
    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")
    db_handle.execute("CREATE TABLE res AS " + sql)
    res_db = db_handle.read_query(
        "SELECT * FROM res ORDER BY orig_index LIMIT 10")
    # res_db
    #%%
    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)
    #%%
    db_handle.close()
def test_db_adapter_general():
    """Round-trip a fitted numeric treatment through the data_algebra adapter,
    both in memory and in an in-memory SQLite database, including novel
    categorical levels unseen during training."""
    # set up example data
    def mk_data(
        n_rows: int = 100,
        *,
        outcome_name: str = "y",
        n_cat_vars: int = 5,
        n_num_vars: int = 5,
        add_unknowns: bool = False,
    ):
        # Synthesize categorical and numeric predictors, each nudging y by step.
        step = 1 / np.sqrt(n_cat_vars + n_num_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_cat_vars):
            vname = f"vc_{i}"
            levels = ["a", "b", "c", "none"]
            if add_unknowns:
                # extra level unseen at training time: tests novel-level handling
                levels = levels + ["d"]
            level_values = {
                v: step * np.random.normal(size=1)[0]
                for v in levels
            }
            v = np.random.choice(levels, replace=True, size=n_rows)
            y = y + np.array([level_values[vi] for vi in v])
            # "none" levels become actual missing values
            v = np.array([vi if vi != "none" else None for vi in v])
            cols[vname] = v
        for i in range(n_num_vars):
            vname = f"vn_{i}"
            v = np.random.normal(size=n_rows)
            y = y + step * v
            v[np.random.uniform(size=n_rows) < 0.24] = None  # inject missingness
            cols[vname] = v
        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        d = pd.DataFrame(cols)
        d["orig_index"] = range(d.shape[0])
        return d, outcome_name, vars

    d, outcome_name, vars = mk_data(100)
    d_app, _, _ = mk_data(50, add_unknowns=True)
    cols_to_copy = [outcome_name, "orig_index"]
    columns = vars + cols_to_copy
    # get reference result
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)
    # test ops path
    transform_as_data = treatment.description_matrix()
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=["orig_index"],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    # test ops db path
    # NOTE(review): source_descr is constructed but never used below
    source_descr = TableDescription(
        table_name="d_app",
        column_names=columns,
    )
    db_handle = data_algebra.SQLite.example_handle()
    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")
    db_handle.execute("CREATE TABLE res AS " + db_handle.to_sql(ops))
    res_db = db_handle.read_query("SELECT * FROM res ORDER BY orig_index")
    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)
    db_handle.close()
def test_KDD2009_vtreat_1():
    """End-to-end KDD2009 churn example: fit a binomial treatment plan, model
    on the treated frame, compare against a stored reference result, and check
    the exported transform replays exactly as a data_algebra pipeline."""
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'KDD2009')
    test_on_BigQuery = False  # requires BigQuery credentials; off by default
    test_xicor = True
    # data from https://github.com/WinVector/PDSwR2/tree/master/KDD2009
    expect_test = pandas.read_csv(os.path.join(data_dir,
                                               'test_processed.csv.gz'),
                                  compression='gzip')
    d = pandas.read_csv(os.path.join(data_dir, 'orange_small_train.data.gz'),
                        sep='\t',
                        header=0)
    # NOTE(review): orig_vars is recorded but never used below
    orig_vars = list(d.columns)
    # Read in dependent variable we are trying to predict.
    churn = pandas.read_csv(os.path.join(
        data_dir, 'orange_small_train_churn.labels.txt'),
                            header=None)
    churn.columns = ["churn"]
    churn['churn'] = churn['churn'] == 1  # replace with True / False
    # Arrange test/train split.
    numpy.random.seed(2020)
    n = d.shape[0]
    # https://github.com/WinVector/pyvtreat/blob/master/Examples/CustomizedCrossPlan/CustomizedCrossPlan.md
    split1 = vtreat.cross_plan.KWayCrossPlanYStratified().split_plan(
        n_rows=n, k_folds=10, y=churn.iloc[:, 0])
    train_idx = set(split1[0]['train'])
    is_train = [i in train_idx for i in range(n)]
    is_test = numpy.logical_not(is_train)
    d['orig_index'] = range(d.shape[0])
    d_train = d.loc[is_train, :].reset_index(drop=True, inplace=False)
    churn_train = numpy.asarray(churn.loc[is_train, :]["churn"])
    d_test = d.loc[is_test, :].reset_index(drop=True, inplace=False)
    churn_test = numpy.asarray(churn.loc[is_test, :]["churn"])
    # build treatment plan
    plan = vtreat.BinomialOutcomeTreatment(outcome_target=True,
                                           outcome_name='churn',
                                           cols_to_copy=['orig_index'],
                                           params=vtreat.vtreat_parameters({
                                               'filter_to_recommended': True,
                                               'sparse_indicators': True,
                                           }))
    cross_frame = plan.fit_transform(d_train, churn_train)
    test_processed = plan.transform(d_test)
    # check we got lots of variables, as seen in worksheet
    rec = plan.score_frame_.loc[plan.score_frame_.recommended, :]
    vc = rec.treatment.value_counts()
    treatments_seen = set(vc.index)
    assert numpy.all([
        t in treatments_seen for t in [
            'missing_indicator', 'indicator_code', 'logit_code',
            'prevalence_code', 'clean_copy'
        ]
    ])
    assert numpy.min(vc) >= 10
    model_vars = list(rec['variable'])
    if test_xicor:
        ## xicor
        # optionally re-select model variables by xicor correlation score
        # all_vars = list(set(plan.score_frame_["variable"]))
        all_vars = [
            c for c in cross_frame.columns
            if c not in ['churn', 'orig_index']
        ]
        xicor_scores = vtreat.stats_utils.xicor_for_frame(
            cross_frame.loc[:, all_vars],
            numpy.asarray(churn_train, dtype=float),
            n_reps=5)
        xicor_picked = list(xicor_scores.loc[xicor_scores['xicor'] > 0.0,
                                             'variable'])
        model_vars = xicor_picked
    # try a simple model
    model = sklearn.linear_model.LogisticRegression(max_iter=1000)
    with pytest.warns(UserWarning):  # densifying warns
        model.fit(cross_frame.loc[:, model_vars], churn_train)
    with pytest.warns(UserWarning):  # densifying warns
        preds_test = model.predict_proba(test_processed.loc[:, model_vars])
    with pytest.warns(UserWarning):  # densifying warns
        preds_train = model.predict_proba(cross_frame.loc[:, model_vars])
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_test, preds_test[:, 1])
    auc_test = sklearn.metrics.auc(fpr, tpr)
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_train, preds_train[:, 1])
    auc_train = sklearn.metrics.auc(fpr, tpr)
    assert auc_test > 0.6  # not good!
    assert abs(auc_test - auc_train) < 0.05  # at least not over fit!
    # check against previous result
    assert test_processed.shape == expect_test.shape
    assert set(test_processed.columns) == set(expect_test.columns)
    assert numpy.max(numpy.max(numpy.abs(test_processed - expect_test))) < 1e-3
    # test transform conversion
    transform_as_data = plan.description_matrix()
    incoming_vars = list(set(transform_as_data['orig_var']))
    ops = vtreat.vtreat_db_adapter.as_data_algebra_pipeline(
        source=TableDescription(table_name='d_test',
                                column_names=incoming_vars + ['orig_index']),
        vtreat_descr=transform_as_data,
        treatment_table_name='transform_as_data',
        row_keys=['orig_index'],
    )
    test_by_pipeline = ops.eval({
        'd_test': d_test.loc[:, incoming_vars + ['orig_index']],
        'transform_as_data': transform_as_data
    })
    # the pipeline may produce extra columns, but must contain the reference
    # columns with matching values
    assert test_by_pipeline.shape[0] == test_processed.shape[0]
    assert test_by_pipeline.shape[1] >= test_processed.shape[1]
    assert not numpy.any(numpy.isnan(test_by_pipeline))
    test_pipeline_cols = set(test_by_pipeline.columns)
    assert numpy.all([c in test_pipeline_cols for c in test_processed.columns])
    test_cols_sorted = list(test_processed.columns)
    test_cols_sorted.sort()
    assert numpy.max(
        numpy.max(
            numpy.abs(test_processed[test_cols_sorted] -
                      test_by_pipeline[test_cols_sorted]))) < 1e-5
    # data algebra pipeline in database
    sql = data_algebra.BigQuery.BigQueryModel().to_sql(ops)
    assert isinstance(sql, str)
    if test_on_BigQuery:
        # live-database replay; only runs when BigQuery credentials are available
        db_handle = data_algebra.BigQuery.example_handle()
        db_handle.drop_table('d_test_processed')
        db_handle.insert_table(d_test.loc[:, incoming_vars + ['orig_index']],
                               table_name='d_test',
                               allow_overwrite=True)
        db_handle.insert_table(transform_as_data,
                               table_name='transform_as_data',
                               allow_overwrite=True)
        db_handle.execute(
            f"CREATE TABLE {db_handle.db_model.table_prefix}.d_test_processed AS {db_handle.to_sql(ops)}"
        )
        db_res = db_handle.read_query(
            f"SELECT * FROM {db_handle.db_model.table_prefix}.d_test_processed ORDER BY orig_index"
        )
        assert db_res.shape[0] == test_processed.shape[0]
        assert numpy.max(
            numpy.max(
                numpy.abs(test_processed[test_cols_sorted] -
                          db_res[test_cols_sorted]))) < 1e-5
        db_handle.drop_table('d_test')
        db_handle.drop_table('transform_as_data')
        db_handle.drop_table('d_test_processed')
        db_handle.close()
def test_diabetes_example():
    """Treat the diabetes sample frame and pin the exact set of produced columns;
    then confirm recommendation filtering still yields a usable frame."""
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = pandas.read_pickle(os.path.join(dir_path, 'diabetes_head.pkl'))
    assert data.shape[0] == 1000
    # from AI200: day_04/ZZ_homework/soln_dont_peek/diabetes_soln.ipynb
    # sklearn.preprocessing.OneHotEncoder could
    # also perform this task well.
    # documentation:
    # https://github.com/WinVector/pyvtreat/blob/main/Examples/Classification/Classification.md
    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': False,
        }),
    )
    data_treated = treatment.fit_transform(data)
    assert data_treated.shape[0] == data.shape[0]
    # exact expected output columns with filtering disabled
    expect = {
        'A1Cresult_lev_None', 'A1Cresult_lev__gt_8', 'A1Cresult_logit_code',
        'A1Cresult_prevalence_code', 'acarbose_lev_No', 'acarbose_logit_code',
        'acarbose_prevalence_code', 'admission_source_id_lev_1',
        'admission_source_id_lev_7', 'admission_source_id_logit_code',
        'admission_source_id_prevalence_code', 'admission_type_id_lev_1',
        'admission_type_id_lev_2', 'admission_type_id_lev_6',
        'admission_type_id_logit_code', 'admission_type_id_prevalence_code',
        'age_lev__osq_40-50_cp_', 'age_lev__osq_50-60_cp_',
        'age_lev__osq_60-70_cp_', 'age_lev__osq_70-80_cp_',
        'age_lev__osq_80-90_cp_', 'age_logit_code', 'age_prevalence_code',
        'change_lev_Ch', 'change_lev_No', 'change_logit_code',
        'change_prevalence_code', 'chlorpropamide_lev_No',
        'chlorpropamide_logit_code', 'chlorpropamide_prevalence_code',
        'diabetesMed_lev_No', 'diabetesMed_lev_Yes', 'diabetesMed_logit_code',
        'diabetesMed_prevalence_code', 'diag_1_is_bad', 'diag_1_lev_414',
        'diag_1_logit_code', 'diag_1_prevalence_code', 'diag_2_is_bad',
        'diag_2_logit_code', 'diag_2_prevalence_code', 'diag_3_is_bad',
        'diag_3_lev_250', 'diag_3_logit_code', 'diag_3_prevalence_code',
        'discharge_disposition_id_lev_1', 'discharge_disposition_id_lev_25',
        'discharge_disposition_id_logit_code',
        'discharge_disposition_id_prevalence_code', 'encounter_id',
        'gender_lev_Female', 'gender_lev_Male', 'gender_logit_code',
        'gender_prevalence_code', 'glimepiride_lev_No',
        'glimepiride_logit_code', 'glimepiride_prevalence_code',
        'glipizide_lev_No', 'glipizide_lev_Steady', 'glipizide_logit_code',
        'glipizide_prevalence_code', 'glyburide_lev_No',
        'glyburide_logit_code', 'glyburide_prevalence_code',
        'insulin_lev_Down', 'insulin_lev_No', 'insulin_lev_Steady',
        'insulin_logit_code', 'insulin_prevalence_code',
        'max_glu_serum_lev_None', 'max_glu_serum_logit_code',
        'max_glu_serum_prevalence_code', 'medical_specialty_is_bad',
        'medical_specialty_lev_Cardiology',
        'medical_specialty_lev_Family/GeneralPractice',
        'medical_specialty_lev_InternalMedicine',
        'medical_specialty_lev__NA_', 'medical_specialty_logit_code',
        'medical_specialty_prevalence_code', 'metformin_lev_No',
        'metformin_lev_Steady', 'metformin_logit_code',
        'metformin_prevalence_code', 'num_lab_procedures', 'num_medications',
        'num_procedures', 'number_diagnoses', 'number_emergency',
        'number_inpatient', 'number_outpatient', 'patient_nbr',
        'pioglitazone_lev_No', 'pioglitazone_logit_code',
        'pioglitazone_prevalence_code', 'race_is_bad',
        'race_lev_AfricanAmerican', 'race_lev_Caucasian', 'race_logit_code',
        'race_prevalence_code', 'readmitted', 'repaglinide_lev_No',
        'repaglinide_logit_code', 'repaglinide_prevalence_code', 'revisit',
        'rosiglitazone_lev_No', 'rosiglitazone_logit_code',
        'rosiglitazone_prevalence_code', 'time_in_hospital',
        'tolazamide_lev_No', 'tolazamide_logit_code',
        'tolazamide_prevalence_code', 'tolbutamide_lev_No',
        'tolbutamide_logit_code', 'tolbutamide_prevalence_code',
        'troglitazone_lev_No', 'troglitazone_logit_code',
        'troglitazone_prevalence_code', 'visit_number', 'weight_is_bad',
        'weight_lev__NA_', 'weight_logit_code', 'weight_prevalence_code'
    }
    assert set(data_treated.columns) == expect
    # same treatment with recommendation filtering on: fewer columns, still usable
    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': True,
        }),
    )
    data_treated = treatment.fit_transform(data)
    assert data_treated.shape[0] == data.shape[0]
    assert data_treated.shape[1] >= 10
def test_user_coders():
    """Exercise the user_transforms extension point with a custom polynomial coder."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # avoid depending on sklearn.metrics.r2_score
        # NOTE(review): r_squared is defined but not called below
        def r_squared(*, y_true, y_pred):
            y_true = numpy.asarray(y_true)
            y_pred = numpy.asarray(y_pred)
            return 1 - numpy.sum((y_true - y_pred) ** 2) / numpy.sum(
                (y_true - numpy.mean(y_true)) ** 2
            )

        # %%
        class PolyTransform(vtreat.transform.UserTransform):
            """a polynomial model: fits a ridge regression on powers 1..deg of
            each numeric incoming column and emits one derived column per input"""

            def __init__(self, *, deg=5, alpha=0.1):
                vtreat.transform.UserTransform.__init__(self, treatment="poly")
                self.models_ = None  # per-variable (model, columns, new name)
                self.deg = deg       # highest polynomial degree
                self.alpha = alpha   # ridge regularization strength

            def poly_terms(self, vname, vec):
                # expand vec into columns vname_1 .. vname_deg (vec**1 .. vec**deg)
                vec = numpy.asarray(vec)
                r = pandas.DataFrame({"x": vec})
                for d in range(1, self.deg + 1):
                    r[vname + "_" + str(d)] = vec ** d
                return r

            def fit(self, X, y):
                # fit one ridge model per numeric incoming column
                self.models_ = {}
                self.incoming_vars_ = []
                self.derived_vars_ = []
                for v in X.columns:
                    if vtreat.util.can_convert_v_to_numeric(X[v]):
                        X_v = self.poly_terms(v, X[v])
                        model_v = sklearn.linear_model.Ridge(alpha=self.alpha).fit(
                            X_v, y
                        )
                        new_var = v + "_poly"
                        self.models_[v] = (model_v, [c for c in X_v.columns], new_var)
                        self.incoming_vars_.append(v)
                        self.derived_vars_.append(new_var)
                return self

            def transform(self, X):
                # emit one prediction column per fitted model
                r = pandas.DataFrame()
                for k, v in self.models_.items():
                    model_k = v[0]
                    cols_k = v[1]
                    new_var = v[2]
                    X_k = self.poly_terms(k, X[k])
                    xform_k = model_k.predict(X_k)
                    r[new_var] = xform_k
                return r

        # %%
        # noisy sine wave to model
        d = pandas.DataFrame({"x": [i for i in range(100)]})
        d["y"] = numpy.sin(0.2 * d["x"]) + 0.2 * numpy.random.normal(size=d.shape[0])
        d.head()
        # %%
        step = PolyTransform(deg=10)
        # %%
        # direct use of the user transform, outside of vtreat
        fit = step.fit_transform(d[["x"]], d["y"])
        fit["x"] = d["x"]
        fit.head()
        # %%
        # seaborn.scatterplot(x='x', y='y', data=d)
        # seaborn.lineplot(x='x', y='x_poly', data=fit, color='red', alpha=0.5)
        # %%
        # now plug the user transform into a vtreat treatment plan
        transform = vtreat.NumericOutcomeTreatment(
            outcome_name="y",
            params=vtreat.vtreat_parameters(
                {
                    "filter_to_recommended": False,
                    "user_transforms": [PolyTransform(deg=10)],
                }
            ),
        )
        # %%
        transform.fit(d, d["y"])
        # %%
        transform.score_frame_
        # %%
        # fit+transform on the same data over-fits (no cross-validation)
        x2_overfit = transform.transform(d)
        # %%
        # seaborn.scatterplot(x='x', y='y', data=x2_overfit)
        # seaborn.lineplot(x='x', y='x_poly', data=x2_overfit, color='red', alpha=0.5)
        # %%
        # fit_transform uses cross-validated values instead
        x2 = transform.fit_transform(d, d["y"])
        # %%
        transform.score_frame_
        # %%
        x2.head()
# NOTE(review): this fragment references names created earlier in the script
# (targets, dmap, variables, vt, np, xg, train_test_split) -- presumably
# defined/imported above this chunk; confirm before running standalone.
labels = targets.map(dmap).fillna(1)  # unmapped targets default to 1 -- TODO confirm intent
print(labels.value_counts())
# Drop identifier / leakage columns before treatment.
# (The original passed both columns=... and axis=0 to DataFrame.drop; axis is
# ignored when columns= is given, so the contradictory argument is removed.
# Two no-op expression statements, cross_frame.dtypes / cross_frame.shape,
# were also dropped.)
variables.drop(columns=[
    'POST_PD_x', 'POST_PD_y', 'join', 'ICCE', 'PROVIDER_NAME', 'GENDER',
    'NPI', 'PROVIDER_REPORTING_SPECIALTY', 'PROVIDER_SPECIALTY'
],
               inplace=True)
print(variables.columns.values)

## Data Prep for Train
plan = vt.BinomialOutcomeTreatment(outcome_target=True,
                                   params=vt.vtreat_parameters({
                                       'filter_to_recommended': False,
                                       'sparse_indicators': False
                                   }))
cross_frame = plan.fit_transform(variables, labels)
print(cross_frame)

## Split into Test/Train
train_features, test_features, train_labels, test_labels = train_test_split(
    cross_frame, labels, test_size=0.2, random_state=42, shuffle=True)
# restrict modeling to the variables vtreat recommends
model_vars = np.asarray(
    plan.score_frame_['variable'][plan.score_frame_['recommended']])
rf = xg.XGBClassifier(objective='binary:logistic')