def test_id_col_check():
    """Fitting an unsupervised treatment on this frame must emit a warning.

    Column "x" holds a distinct string on every row; presumably that
    id-like column is what triggers the warning (confirm against vtreat).
    """
    frame = pandas.DataFrame(
        {
            "x": ["a", "b", "c"],
            "y": ["a", "b", "b"],
        }
    )
    treatment = vtreat.UnsupervisedTreatment(var_list=["x", "y"])
    with pytest.warns(Warning):
        treatment.fit_transform(frame)
def test_unsupervised():
    """Exercise UnsupervisedTreatment on one categorical plus constant columns.

    Checks that:
      * every treated column is numeric-convertible and has no bad values,
      * the score frame only reports "zip" as an original variable,
      * re-applying the fitted transform emits no warnings and reproduces
        the fit_transform() result exactly,
      * get_feature_names() agrees with the score frame's "variable" column.
    """
    # Function-scope import keeps this block self-contained.
    import warnings

    numpy.random.seed(235)
    # Named zip_codes rather than zip so the builtin is not shadowed.
    zip_codes = ["z" + str(i + 1).zfill(5) for i in range(15)]
    d = pandas.DataFrame({"zip": numpy.random.choice(zip_codes, size=1000)})
    d["const"] = 1
    d["const2"] = "b"
    d["const3"] = None
    transform = vtreat.UnsupervisedTreatment(
        params=vtreat.unsupervised_parameters({"indicator_min_fraction": 0.01})
    )
    d_treated = transform.fit_transform(d)
    for c in d_treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_treated[c])
        assert numpy.sum(vtreat.util.is_bad(d_treated[c])) == 0
    sf = transform.score_frame_
    assert set(sf["orig_variable"]) == {"zip"}
    # The original idiom (https://stackoverflow.com/a/45671804/6901725) was
    # `pytest.warns(None)`, which pytest 7 deprecated and pytest 8 rejects.
    # Record warnings with the standard library instead; "always" defeats
    # once-per-location suppression so nothing is silently skipped.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        d_treated_2 = transform.transform(d)
    assert len(record) == 0
    assert d_treated.equals(d_treated_2)
    fn = transform.get_feature_names()
    assert set(sf["variable"]) == set(fn)
def test_vtreat_onehot():
    """Check indicator (one-hot) coding of string columns.

    With only the "clean_copy" and "indicator_code" coders enabled, the two
    string columns are expected to become per-level 0/1 columns while the
    numeric column is passed through.
    """
    source = pd.DataFrame(
        {
            "xc": ["a", "b", "b"],
            "xd": ["1", "1", "2"],  # vtreat picks columns to convert by type
            "xn": [1.0, 2.0, 3.0],
        }
    )
    coder_params = vtreat.unsupervised_parameters(
        {"coders": {"clean_copy", "indicator_code"}}
    )
    treatment = vtreat.UnsupervisedTreatment(params=coder_params)
    treatment.fit(source)
    treated = treatment.transform(source)

    expected = pd.DataFrame(
        {
            "xn": [1.0, 2.0, 3.0],
            "xd_lev_1": [1.0, 1.0, 0.0],
            "xd_lev_2": [0.0, 0.0, 1.0],
            "xc_lev_b": [0.0, 1.0, 1.0],
            "xc_lev_a": [1.0, 0.0, 0.0],
        }
    )
    assert data_algebra.test_util.equivalent_frames(
        treated, expected, check_row_order=True
    )
def test_unsupervised():
    """Run UnsupervisedTreatment on a wider, larger synthetic frame.

    The frame has two constant columns, ten categorical columns drawn from
    ten levels, and ten uniform numeric columns, all with 10000 rows. Every
    treated column must be numeric-convertible and free of bad values.
    """
    n_rows, n_levels, n_cat, n_numeric = 10000, 10, 10, 10

    numpy.random.seed(235)
    levels = ["z{:05d}".format(k) for k in range(1, n_levels + 1)]

    frame = pandas.DataFrame({"const": numpy.ones(n_rows)})
    frame["const2"] = "b"
    # Categorical columns are drawn before numeric ones; the order matters
    # for reproducing the same random stream from the fixed seed.
    for idx in range(n_cat):
        frame[f"zip_{idx}"] = numpy.random.choice(levels, size=n_rows)
    for idx in range(n_numeric):
        frame[f"num_{idx}"] = numpy.random.uniform(size=n_rows)

    treatment_params = vtreat.unsupervised_parameters(
        {"indicator_min_fraction": 0.01}
    )
    treatment = vtreat.UnsupervisedTreatment(params=treatment_params)

    # To profile the fit, see https://docs.python.org/3/library/profile.html
    # and wrap the call below with cProfile.run(...).
    treated = treatment.fit_transform(frame)

    for name in treated.columns:
        column = treated[name]
        assert vtreat.util.can_convert_v_to_numeric(column)
        assert numpy.sum(vtreat.util.is_bad(column)) == 0

    # Read the score frame so a missing attribute fails the test; its
    # contents are not inspected here.
    _ = treatment.score_frame_
def test_col_dups_1():
    """Expect ValueError when fitting on a frame with a repeated column name."""
    frame = pandas.DataFrame({"x": [1], "x2": [2], "y": [3]})
    # Relabel so that two distinct columns are both called "x".
    frame.columns = ["x", "x", "y"]

    treatment = vtreat.UnsupervisedTreatment(
        var_list=["x"],
        cols_to_copy=["y"],
    )
    with pytest.raises(ValueError):
        treatment.fit_transform(frame, frame["y"])
def test_col_dups_1():
    """A frame with two columns named "x" must be rejected with ValueError."""
    data = pandas.DataFrame({"x": [1], "x2": [2], "y": [3]})
    duplicated_names = ["x", "x", "y"]
    data.columns = duplicated_names  # "x2" becomes a second "x"

    treatment = vtreat.UnsupervisedTreatment(var_list=["x"], cols_to_copy=["y"])
    with pytest.raises(ValueError):
        treatment.fit_transform(data, data["y"])
def test_id_col_check():
    """fit_transform on this frame is required to raise a Warning.

    "x" is unique on every row while "y" repeats a level; presumably the
    warning concerns the id-like "x" column (confirm against vtreat).
    """
    x_values = ["a", "b", "c"]
    y_values = ["a", "b", "b"]
    data = pandas.DataFrame({"x": x_values, "y": y_values})

    treatment = vtreat.UnsupervisedTreatment(var_list=["x", "y"])
    with pytest.warns(Warning):
        treatment.fit_transform(data)
def test_xgboost_col_name_issue_2():
    """Treated column names must be unique and free of "[", "]", "<", ">".

    Background:
    https://stackoverflow.com/questions/48645846/pythons-xgoost-valueerrorfeature-names-may-not-contain-or
    xgboost raises ValueError('feature_names may not contain [, ] or <').
    """
    forbidden_chars = "[]<>"
    # The levels include the forbidden characters themselves plus "_lt_".
    # NOTE(review): "_lt_" is presumably what "<" is escaped to, making this
    # a name-collision check; confirm against vtreat.
    frame = pandas.DataFrame({"x": ["[", "]", "<", "_lt_", "_lt_"]})

    treatment = vtreat.UnsupervisedTreatment(var_list=["x"])
    treated = treatment.fit_transform(frame, None)

    names = treated.columns
    for name in names:
        assert all(ch not in name for ch in forbidden_chars)
    assert len(names) == len(set(names))
def test_unsupervised():
    """Smoke-test UnsupervisedTreatment on one categorical and two constants.

    Every column of the treated frame must be numeric-convertible and must
    contain no values flagged by vtreat.util.is_bad.
    """
    numpy.random.seed(235)
    levels = [f"z{k:05d}" for k in range(1, 16)]

    frame = pandas.DataFrame({"zip": numpy.random.choice(levels, size=1000)})
    frame["const"] = 1
    frame["const2"] = "b"

    treatment_params = vtreat.unsupervised_parameters(
        {"indicator_min_fraction": 0.01}
    )
    treatment = vtreat.UnsupervisedTreatment(params=treatment_params)
    treated = treatment.fit_transform(frame)

    for name in treated.columns:
        column = treated[name]
        assert vtreat.util.can_convert_v_to_numeric(column)
        assert sum(vtreat.util.is_bad(column)) == 0

    # Read the score frame so a missing attribute fails the test; its
    # contents are not inspected here.
    _ = treatment.score_frame_
def test_homes_example():
    """Treat the pickled homes data set and check the resulting column names.

    Loads 'homes_76.pkl' from the directory containing this test file,
    verifies its 38 x 8 shape, then applies an unsupervised treatment that
    copies 'Price' through and uses dense indicator coding.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    pickle_path = os.path.join(here, 'homes_76.pkl')
    homes = pandas.read_pickle(pickle_path)
    assert homes.shape[0] == 38
    assert homes.shape[1] == 8

    # from AI200: day_01/02_Regression/Part2_LRPractice/LRExample.ipynb
    # documentation: https://github.com/WinVector/pyvtreat/blob/main/Examples/Unsupervised/Unsupervised.md
    treatment_params = vtreat.unsupervised_parameters(
        {
            'sparse_indicators': False,
            'coders': {'clean_copy', 'indicator_code', 'missing_indicator'},
        }
    )
    treatment = vtreat.UnsupervisedTreatment(
        cols_to_copy=['Price'],
        params=treatment_params,
    )
    treated = treatment.fit_transform(homes)

    # Treatment must not add or drop rows.
    assert treated.shape[0] == homes.shape[0]

    expected_columns = {
        'Price',
        'Size', 'Bath', 'Bed', 'Year', 'Garage',
        'Lot_lev_4', 'Lot_lev_5', 'Lot_lev_3',
        'Lot_lev_1', 'Lot_lev_2', 'Lot_lev_11',
        'Elem_lev_edge', 'Elem_lev_edison', 'Elem_lev_parker',
        'Elem_lev_harris', 'Elem_lev_adams', 'Elem_lev_crest',
    }
    assert set(treated.columns) == expected_columns
def test_imputation_controls():
    """Check how the missing value in "x" is filled under various settings.

    "x" has one missing entry (row 3); "y" is copied through untouched.
    Each scenario fits a fresh treatment and compares the result with a
    frame whose only varying cell is the imputed "x" value.
    """
    d = pandas.DataFrame({
        "x": [0, 1, 1000, None],
        "y": [0, 0, 1, 1],
    })

    def imputation_params(strategy):
        """Build unsupervised parameters with the given imputation setting."""
        return vtreat.unsupervised_parameters({
            "missingness_imputation": strategy,
        })

    def check_fill(expected_fill, **treatment_kwargs):
        """Fit/transform d and assert the imputed "x" equals expected_fill."""
        treatment = vtreat.UnsupervisedTreatment(
            cols_to_copy=["y"],
            **treatment_kwargs
        )
        treated = treatment.fit_transform(d)
        expected = pandas.DataFrame({
            "y": [0, 0, 1, 1],
            "x_is_bad": [0.0, 0.0, 0.0, 1.0],
            "x": [0.0, 1.0, 1000.0, expected_fill],
        })
        vtreat.util.check_matching_numeric_frames(res=treated, expect=expected)

    # Defaults: the fill equals the mean of the observed values (1001 / 3).
    check_fill(333.6666666667)

    # A callable given through the parameters is applied to the column.
    check_fill(1.0, params=imputation_params(numpy.median))
    check_fill(0.0, params=imputation_params(numpy.min))

    # A map entry for another column ("y") leaves "x" on the parameter value.
    check_fill(
        7.0,
        params=imputation_params(7),
        imputation_map={"y": numpy.median},
    )

    # A map entry for "x" takes precedence over the parameter value,
    # whether the entry is a callable or a constant.
    check_fill(
        1.0,
        params=imputation_params(7),
        imputation_map={"x": numpy.median},
    )
    check_fill(
        12.0,
        params=imputation_params(numpy.mean),
        imputation_map={"x": 12},
    )