Example #1
def test_classification():
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)
    vars = [c for c in d.columns if c not in set(["y", "yc"])]
    d_test = make_data(100)

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
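    # fit_transform() returns a cross-validated "cross frame": impact codes for each
    # row come from models that did not see that row, avoiding nested-model bias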
    d_prepared = transform.fit_transform(d[vars], d["yc"])

    # show vars are under control
    assert transform.get_result_restriction() is None
    assert "x2" in set(d_prepared.columns)

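    # a result restriction limits transform() output to the named treated variables
    # (columns listed in cols_to_copy and the outcome are still carried along)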
    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": True}),
    )
    d_prepared = transform.fit_transform(d[vars], d["yc"])

    assert transform.get_result_restriction() is not None
    assert "x2" not in transform.get_result_restriction()
    assert "x2" not in set(d_prepared.columns)

    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])
Example #2
def test_dup():
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
    )

    d_prepared = transform.fit_transform(d, d["yc"])

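    # transform() on the exact frame used for fitting would return over-fit encodings;
    # vtreat detects this and warns that the cross frame from fit_transform() should be used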
    with pytest.warns(UserWarning):
        d_prepared_wrong = transform.transform(d)

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params={"error_on_duplicate_frames": True},
    )

    d_prepared = transform.fit_transform(d, d["yc"])

    with pytest.raises(ValueError):
        d_prepared_wrong = transform.transform(d)

    # no warning or error expected; pytest.warns(None) is rejected by pytest 7+,
    # so check via the warnings module instead (requires `import warnings`)
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        dtest = make_data(450)
    assert not record

    dtest_prepared = transform.transform(dtest)
Example #3
def test_outcome_name_required():

    numpy.random.seed(235)
    d = pandas.DataFrame({"x": ["1", "1", "1", "2", "2", "2"]})
    y = [1, 2, 3, 4, 5, 6]

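    # each treatment type fits with an explicit y; calling fit_transform() without it must raise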
    transform = vtreat.NumericOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    transform = vtreat.BinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
        outcome_target=3,
    )
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    transform = vtreat.vtreat_api.MultinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)
Example #4
def test_classification_type_free():
    # confirm incoming column type does not matter during apply
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": numpy.random.normal(size=nrows)})
        d["y"] = d["x"] + numpy.random.normal(size=nrows)
        d["xcn"] = numpy.round(d["x"] / 5, 1) * 5
        d["yc"] = d["y"] > 0
        return d

    d = make_data(100)
    d_head = d.loc[range(10), :].copy()
    d_train = d.copy()
    d_train["xcn"] = d_train["xcn"].astype(str)
    vars = ["x", "xcn"]

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,  # outcome of interest
        outcome_name="yc",
        cols_to_copy=["y"],
    )

    d_prepared = transform.fit_transform(d_train, numpy.asarray(d["yc"]))
    d_train_head = d_train.loc[range(10), :].copy()

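    # the same ten rows, seen with xcn as str (training typing) and as float
    # (original typing), must encode identically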
    t1 = transform.transform(d_train_head)
    t2 = transform.transform(d_head)
    assert t1.equals(t2)
Example #5
def test_classification_numpy():
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0", "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)
    vars = [v for v in d.columns if v not in ['y', 'yc']]
    d_n = numpy.asarray(d[vars])

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,  # outcome of interest
    )

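    # numpy inputs carry no column names; vtreat generates names and records the
    # columns it actually produced in last_result_columns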
    d_prepared = transform.fit_transform(d_n, numpy.asarray(d["yc"]))
    d_prepared_columns = transform.last_result_columns
    sf = transform.score_frame_

    assert len(set(d_prepared_columns) - set(sf.variable)) == 0

    dtest = make_data(450)

    dtest_prepared = transform.transform(numpy.asarray(dtest[vars]))
    dtest_prepared_columns = transform.last_result_columns

    assert len(set(dtest_prepared_columns) - set(sf.variable)) == 0
Example #6
def test_pipeparams():
    numpy.random.seed(2019)

    def make_data(nrows):
        d = pandas.DataFrame({"x": 5 * numpy.random.normal(size=nrows)})
        d["y"] = numpy.sin(d["x"]) + 0.1 * numpy.random.normal(size=nrows)
        d.loc[numpy.arange(3, 10), "x"] = numpy.nan  # introduce a nan level
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=nrows)
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    df = make_data(500)

    df = df.drop(columns=["y"])

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,
        params=vtreat.vtreat_parameters({"sparse_indicators": False}),
    )

    clf = Pipeline(steps=[
        ("preprocessor", transform),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ])

    X, y = df, df.pop("yc")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf.fit(X_train, y_train)

    #%%

    t_params = transform.get_params()
    assert t_params["indicator_min_fraction"] is not None
    assert transform.get_params()["indicator_min_fraction"] != 0

    #%%

    p_params = clf.get_params()
    assert p_params["preprocessor__indicator_min_fraction"] is not None

    #%%

    clf.set_params(preprocessor__indicator_min_fraction=0)
    assert transform.get_params()["indicator_min_fraction"] == 0

    # no warning or error expected; pytest.warns(None) is rejected by pytest 7+,
    # so check via the warnings module instead (requires `import warnings`)
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        clf.fit(X_train, y_train)
    assert not record
Example #7
def test_classification():
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
    )

    # show y-column doesn't get copied in, and can tolerate copy columns not being around
    vars = [c for c in d.columns if c not in set(['y', 'yc'])]
    d_prepared = transform.fit_transform(d[vars], d["yc"])
    assert 'yc' not in d_prepared.columns

    # design again

    d_prepared = transform.fit_transform(d, d["yc"])

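    # every treated column must be numeric and contain no bad (NaN/None) values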
    for c in d_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_prepared[c])
        assert sum(vtreat.util.is_bad(d_prepared[c])) == 0

    dtest = make_data(450)

    dtest_prepared = transform.transform(dtest)

    for c in dtest_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(dtest_prepared[c])
        assert sum(vtreat.util.is_bad(dtest_prepared[c])) == 0

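    # the score frame summarizes each derived variable: treatment type,
    # significance, and a recommendation flag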
    sf = transform.score_frame_

    xrow = sf.loc[
        numpy.logical_and(sf.variable == "x", sf.treatment == "clean_copy"), :
    ]
    xrow.reset_index(inplace=True, drop=True)

    assert xrow.recommended[0]
Example #8
def test_db_adapter_1_cdata():
    # Example from:
    # https://github.com/WinVector/pyvtreat/blob/main/Examples/Database/vtreat_db_adapter.ipynb
    # Data from:
    # https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008

    # data_all = pd.read_csv("diabetes_head.csv")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data_all = pd.read_csv(os.path.join(dir_path, "diabetes_head.csv"))
    n = data_all.shape[0]
    data_all["orig_index"] = range(n)
    d_train = data_all.loc[range(n - 5), :].reset_index(drop=True)
    d_app = data_all.loc[range(n - 5, n), :].reset_index(drop=True)

    #%%

    outcome_name = "readmitted"
    cols_to_copy = ["orig_index", "encounter_id", "patient_nbr", outcome_name]
    vars = ["time_in_hospital", "weight"]
    columns = vars + cols_to_copy

    # d_train.loc[:, columns]

    #%%

    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d_train.loc[:, columns])

    d_app_treated = treatment.transform(d_app.loc[:, columns])

    # d_app_treated

    #%%

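    # description_matrix() exports the fitted treatment as a table of rules that
    # can be replayed as a data algebra pipeline, and therefore as SQL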
    transform_as_data = treatment.description_matrix()

    # transform_as_data

    #%%

    ops = as_data_algebra_pipeline(
        source=descr(d_app=d_app.loc[:, columns]),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=['orig_index'],
    )

    # print(ops)

    #%%

    transformed = ops.eval({
        "d_app": d_app.loc[:, columns],
        "transform_as_data": transform_as_data
    })

    # transformed

    #%%

    assert data_algebra.test_util.equivalent_frames(transformed, d_app_treated)

    #%%

    db_handle = data_algebra.SQLite.example_handle()

    sql = db_handle.to_sql(ops)
    assert isinstance(sql, str)
    # print(sql)

    #%%

    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")

    db_handle.execute("CREATE TABLE res AS " + sql)

    res_db = db_handle.read_query(
        "SELECT * FROM res ORDER BY orig_index LIMIT 10")

    # res_db

    #%%

    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)

    #%%

    db_handle.close()
Example #9
def test_KDD2009_vtreat_1():
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'KDD2009')
    test_on_BigQuery = False
    test_xicor = True
    # data from https://github.com/WinVector/PDSwR2/tree/master/KDD2009
    expect_test = pandas.read_csv(os.path.join(data_dir,
                                               'test_processed.csv.gz'),
                                  compression='gzip')
    d = pandas.read_csv(os.path.join(data_dir, 'orange_small_train.data.gz'),
                        sep='\t',
                        header=0)
    orig_vars = list(d.columns)
    # Read in dependent variable we are trying to predict.
    churn = pandas.read_csv(os.path.join(
        data_dir, 'orange_small_train_churn.labels.txt'),
                            header=None)
    churn.columns = ["churn"]
    churn['churn'] = churn['churn'] == 1  # replace with True / False
    # Arrange test/train split.
    numpy.random.seed(2020)
    n = d.shape[0]
    # https://github.com/WinVector/pyvtreat/blob/master/Examples/CustomizedCrossPlan/CustomizedCrossPlan.md
    split1 = vtreat.cross_plan.KWayCrossPlanYStratified().split_plan(
        n_rows=n, k_folds=10, y=churn.iloc[:, 0])
    train_idx = set(split1[0]['train'])
    is_train = [i in train_idx for i in range(n)]
    is_test = numpy.logical_not(is_train)
    d['orig_index'] = range(d.shape[0])
    d_train = d.loc[is_train, :].reset_index(drop=True, inplace=False)
    churn_train = numpy.asarray(churn.loc[is_train, :]["churn"])
    d_test = d.loc[is_test, :].reset_index(drop=True, inplace=False)
    churn_test = numpy.asarray(churn.loc[is_test, :]["churn"])
    # build treatment plan
    plan = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,
        outcome_name='churn',
        cols_to_copy=['orig_index'],
        params=vtreat.vtreat_parameters({
            'filter_to_recommended': True,
            'sparse_indicators': True,
        }))
    cross_frame = plan.fit_transform(d_train, churn_train)
    test_processed = plan.transform(d_test)
    # check we got lots of variables, as seen in worksheet
    rec = plan.score_frame_.loc[plan.score_frame_.recommended, :]
    vc = rec.treatment.value_counts()
    treatments_seen = set(vc.index)
    assert numpy.all([
        t in treatments_seen for t in [
            'missing_indicator', 'indicator_code', 'logit_code',
            'prevalence_code', 'clean_copy'
        ]
    ])
    assert numpy.min(vc) >= 10
    model_vars = list(rec['variable'])
    if test_xicor:
        ## xicor
        # all_vars = list(set(plan.score_frame_["variable"]))
        all_vars = [
            c for c in cross_frame.columns if c not in ['churn', 'orig_index']
        ]
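        # re-score the treated variables with the xicor (Chatterjee) correlation
        # coefficient, keeping only those with positive estimated signal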
        xicor_scores = vtreat.stats_utils.xicor_for_frame(
            cross_frame.loc[:, all_vars],
            numpy.asarray(churn_train, dtype=float),
            n_reps=5)
        xicor_picked = list(xicor_scores.loc[xicor_scores['xicor'] > 0.0,
                                             'variable'])
        model_vars = xicor_picked
    # try a simple model
    model = sklearn.linear_model.LogisticRegression(max_iter=1000)
    with pytest.warns(UserWarning):  # densifying warns
        model.fit(cross_frame.loc[:, model_vars], churn_train)
    with pytest.warns(UserWarning):  # densifying warns
        preds_test = model.predict_proba(test_processed.loc[:, model_vars])
    with pytest.warns(UserWarning):  # densifying warns
        preds_train = model.predict_proba(cross_frame.loc[:, model_vars])
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_test, preds_test[:, 1])
    auc_test = sklearn.metrics.auc(fpr, tpr)
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_train, preds_train[:, 1])
    auc_train = sklearn.metrics.auc(fpr, tpr)
    assert auc_test > 0.6  # not good!
    assert abs(auc_test - auc_train) < 0.05  # at least not over fit!
    # check against previous result
    assert test_processed.shape == expect_test.shape
    assert set(test_processed.columns) == set(expect_test.columns)
    assert numpy.max(numpy.max(numpy.abs(test_processed - expect_test))) < 1e-3
    # test transform conversion
    transform_as_data = plan.description_matrix()
    incoming_vars = list(set(transform_as_data['orig_var']))
    ops = vtreat.vtreat_db_adapter.as_data_algebra_pipeline(
        source=TableDescription(table_name='d_test',
                                column_names=incoming_vars + ['orig_index']),
        vtreat_descr=transform_as_data,
        treatment_table_name='transform_as_data',
        row_keys=['orig_index'],
    )
    test_by_pipeline = ops.eval({
        'd_test': d_test.loc[:, incoming_vars + ['orig_index']],
        'transform_as_data': transform_as_data,
    })
    assert test_by_pipeline.shape[0] == test_processed.shape[0]
    assert test_by_pipeline.shape[1] >= test_processed.shape[1]
    assert not numpy.any(numpy.isnan(test_by_pipeline))
    test_pipeline_cols = set(test_by_pipeline.columns)
    assert numpy.all([c in test_pipeline_cols for c in test_processed.columns])
    test_cols_sorted = list(test_processed.columns)
    test_cols_sorted.sort()
    assert numpy.max(
        numpy.max(
            numpy.abs(test_processed[test_cols_sorted] -
                      test_by_pipeline[test_cols_sorted]))) < 1e-5
    # data algebra pipeline in database
    sql = data_algebra.BigQuery.BigQueryModel().to_sql(ops)
    assert isinstance(sql, str)
    if test_on_BigQuery:
        db_handle = data_algebra.BigQuery.example_handle()
        db_handle.drop_table('d_test_processed')
        db_handle.insert_table(d_test.loc[:, incoming_vars + ['orig_index']],
                               table_name='d_test',
                               allow_overwrite=True)
        db_handle.insert_table(transform_as_data,
                               table_name='transform_as_data',
                               allow_overwrite=True)
        db_handle.execute(
            f"CREATE TABLE {db_handle.db_model.table_prefix}.d_test_processed AS {db_handle.to_sql(ops)}"
        )
        db_res = db_handle.read_query(
            f"SELECT * FROM {db_handle.db_model.table_prefix}.d_test_processed ORDER BY orig_index"
        )
        assert db_res.shape[0] == test_processed.shape[0]
        assert numpy.max(
            numpy.max(
                numpy.abs(test_processed[test_cols_sorted] -
                          db_res[test_cols_sorted]))) < 1e-5
        db_handle.drop_table('d_test')
        db_handle.drop_table('transform_as_data')
        db_handle.drop_table('d_test_processed')
        db_handle.close()
Example #10
def test_diabetes_example():
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = pandas.read_pickle(os.path.join(dir_path, 'diabetes_head.pkl'))
    assert data.shape[0] == 1000

    # from AI200: day_04/ZZ_homework/soln_dont_peek/diabetes_soln.ipynb

    # sklearn.preprocessing.OneHotEncoder could
    # also perform this task well.

    # documentation:
    #  https://github.com/WinVector/pyvtreat/blob/main/Examples/Classification/Classification.md
    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': False,
        }),
    )
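    # y can be omitted here: vtreat reads the outcome from the 'readmitted' column named above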
    data_treated = treatment.fit_transform(data)

    assert data_treated.shape[0] == data.shape[0]

    expect = {
        'A1Cresult_lev_None', 'A1Cresult_lev__gt_8', 'A1Cresult_logit_code',
        'A1Cresult_prevalence_code', 'acarbose_lev_No', 'acarbose_logit_code',
        'acarbose_prevalence_code', 'admission_source_id_lev_1',
        'admission_source_id_lev_7', 'admission_source_id_logit_code',
        'admission_source_id_prevalence_code', 'admission_type_id_lev_1',
        'admission_type_id_lev_2', 'admission_type_id_lev_6',
        'admission_type_id_logit_code', 'admission_type_id_prevalence_code',
        'age_lev__osq_40-50_cp_', 'age_lev__osq_50-60_cp_',
        'age_lev__osq_60-70_cp_', 'age_lev__osq_70-80_cp_',
        'age_lev__osq_80-90_cp_', 'age_logit_code', 'age_prevalence_code',
        'change_lev_Ch', 'change_lev_No', 'change_logit_code',
        'change_prevalence_code', 'chlorpropamide_lev_No',
        'chlorpropamide_logit_code', 'chlorpropamide_prevalence_code',
        'diabetesMed_lev_No', 'diabetesMed_lev_Yes', 'diabetesMed_logit_code',
        'diabetesMed_prevalence_code', 'diag_1_is_bad', 'diag_1_lev_414',
        'diag_1_logit_code', 'diag_1_prevalence_code', 'diag_2_is_bad',
        'diag_2_logit_code', 'diag_2_prevalence_code', 'diag_3_is_bad',
        'diag_3_lev_250', 'diag_3_logit_code', 'diag_3_prevalence_code',
        'discharge_disposition_id_lev_1', 'discharge_disposition_id_lev_25',
        'discharge_disposition_id_logit_code',
        'discharge_disposition_id_prevalence_code', 'encounter_id',
        'gender_lev_Female', 'gender_lev_Male', 'gender_logit_code',
        'gender_prevalence_code', 'glimepiride_lev_No',
        'glimepiride_logit_code', 'glimepiride_prevalence_code',
        'glipizide_lev_No', 'glipizide_lev_Steady', 'glipizide_logit_code',
        'glipizide_prevalence_code', 'glyburide_lev_No',
        'glyburide_logit_code', 'glyburide_prevalence_code',
        'insulin_lev_Down', 'insulin_lev_No', 'insulin_lev_Steady',
        'insulin_logit_code', 'insulin_prevalence_code',
        'max_glu_serum_lev_None', 'max_glu_serum_logit_code',
        'max_glu_serum_prevalence_code', 'medical_specialty_is_bad',
        'medical_specialty_lev_Cardiology',
        'medical_specialty_lev_Family/GeneralPractice',
        'medical_specialty_lev_InternalMedicine', 'medical_specialty_lev__NA_',
        'medical_specialty_logit_code', 'medical_specialty_prevalence_code',
        'metformin_lev_No', 'metformin_lev_Steady', 'metformin_logit_code',
        'metformin_prevalence_code', 'num_lab_procedures', 'num_medications',
        'num_procedures', 'number_diagnoses', 'number_emergency',
        'number_inpatient', 'number_outpatient', 'patient_nbr',
        'pioglitazone_lev_No', 'pioglitazone_logit_code',
        'pioglitazone_prevalence_code', 'race_is_bad',
        'race_lev_AfricanAmerican', 'race_lev_Caucasian', 'race_logit_code',
        'race_prevalence_code', 'readmitted', 'repaglinide_lev_No',
        'repaglinide_logit_code', 'repaglinide_prevalence_code', 'revisit',
        'rosiglitazone_lev_No', 'rosiglitazone_logit_code',
        'rosiglitazone_prevalence_code', 'time_in_hospital',
        'tolazamide_lev_No', 'tolazamide_logit_code',
        'tolazamide_prevalence_code', 'tolbutamide_lev_No',
        'tolbutamide_logit_code', 'tolbutamide_prevalence_code',
        'troglitazone_lev_No', 'troglitazone_logit_code',
        'troglitazone_prevalence_code', 'visit_number', 'weight_is_bad',
        'weight_lev__NA_', 'weight_logit_code', 'weight_prevalence_code'
    }
    assert set(data_treated.columns) == expect

    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': True,
        }),
    )
    data_treated = treatment.fit_transform(data)

    assert data_treated.shape[0] == data.shape[0]
    assert data_treated.shape[1] >= 10
Example #11
labels = targets.map(dmap).fillna(1)
print(labels.value_counts())

variables.drop(columns=[
    'POST_PD_x', 'POST_PD_y', 'join', 'ICCE', 'PROVIDER_NAME', 'GENDER', 'NPI',
    'PROVIDER_REPORTING_SPECIALTY', 'PROVIDER_SPECIALTY'
], inplace=True)
print(variables.columns.values)

## Data Prep for Train
plan = vt.BinomialOutcomeTreatment(
    outcome_target=True,
    params=vt.vtreat_parameters({
        'filter_to_recommended': False,
        'sparse_indicators': False,
    }))

cross_frame = plan.fit_transform(variables, labels)
print(cross_frame.dtypes)
print(cross_frame.shape)
print(cross_frame)
## Split into Test/Train
train_features, test_features, train_labels, test_labels = train_test_split(
    cross_frame, labels, test_size=0.2, random_state=42, shuffle=True)
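# restrict the model to the variables vtreat's score frame marks as recommended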
model_vars = np.asarray(
    plan.score_frame_['variable'][plan.score_frame_['recommended']])

rf = xg.XGBClassifier(objective='binary:logistic')