# Example 1
def test_nan_inf():
    """Missing and infinite inputs should be imputed and flagged, never passed through."""
    numpy.random.seed(235)
    frame = pandas.DataFrame({
        "x": [1.0, numpy.nan, numpy.inf, -numpy.inf, None, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })

    treatment = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    treated = treatment.fit_transform(frame, frame["y"])

    # every produced column must be numeric and free of NaN/inf values
    for col in treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(treated[col])
        assert numpy.sum(vtreat.util.is_bad(treated[col])) == 0

    # bad x values impute to the mean of the good ones (0.5) and set the flag
    expect = pandas.DataFrame({
        "x": [1.0, 0.5, 0.5, 0.5, 0.5, 0],
        "x_is_bad": [0, 1, 1, 1, 1, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })
    for col in expect.columns:
        want = numpy.asarray(expect[col])
        got = numpy.asarray(treated[col])
        assert numpy.max(numpy.abs(want - got)) < 1.0e-6
def test_outcome_name_required():
    """Treatments built without an outcome_name need y at fit time;
    calling fit_transform without y afterwards must raise."""
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": ["1", "1", "1", "2", "2", "2"]})
    y = [1, 2, 3, 4, 5, 6]

    # numeric outcome: fit_transform with y works, without y raises
    transform = vtreat.NumericOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    # binomial outcome: same contract
    transform = vtreat.BinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
        outcome_target=3,
    )
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    # multinomial outcome: same contract
    # (use the top-level export, consistent with the other treatments above)
    transform = vtreat.MultinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)
# Example 3
def test_r1_issue():
    """Regression test: a frame with reindex-induced NaN rows, a boolean
    column, and a constant column should prep without raising."""
    plan = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )

    # example frame adapted from the pandas missing-data user guide:
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
    base = pandas.DataFrame(
        numpy.random.randn(5, 3),
        index=["a", "c", "e", "f", "h"],
        columns=["one", "two", "three"],
    )
    base["four"] = "foo"
    base["five"] = base["one"] > 0

    # reindexing introduces fully-NaN rows at b, d, g
    df2 = base.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
    df2.reset_index(inplace=True, drop=True)
    df2["y"] = range(df2.shape[0])
    df2.loc[3, "four"] = "blog"
    df2["const"] = 1

    vtreat.util.is_bad(df2["five"])
    prepped = plan.fit_transform(df2, df2["y"])  # used to raise an exception

    # all produced columns must be numeric and free of bad values
    for col in prepped.columns:
        assert vtreat.util.can_convert_v_to_numeric(prepped[col])
        assert numpy.sum(vtreat.util.is_bad(prepped[col])) == 0
# Example 4
def test_db_adapter_monster():
    """Check that a fitted vtreat treatment, exported as a description
    matrix, replays as a data_algebra pipeline matching treatment.transform()."""
    outcome_name = "y"
    row_id_name = 'row_id'
    n_vars = 5

    def mk_data(n_rows: int = 100):
        """Build n_vars two-level categorical columns, each shifting y by +/-step."""
        step = 1 / np.sqrt(n_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_vars):
            vname = f"v_{i}"
            v = np.random.choice(["a", "b"], replace=True, size=n_rows)
            y = y + np.where(v == "a", step, -step)
            cols[vname] = v
        # avoid shadowing the builtin `vars`
        var_names = sorted(cols.keys())
        cols[outcome_name] = y
        cols[row_id_name] = range(n_rows)
        d = pd.DataFrame(cols)
        return d, var_names

    # the variable-name list is unused in this test, so discard it
    d, _ = mk_data(100)
    d_app, _ = mk_data(10)
    cols_to_copy = [outcome_name, row_id_name]

    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    # reference result via the in-memory pandas path
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)

    # export the fitted treatment as data and replay it as a pipeline
    transform_as_data = treatment.description_matrix()
    # transform_as_data.to_csv('example_transform.csv', index=False)

    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=[row_id_name],
    )

    ops_source = str(ops)
    assert isinstance(ops_source, str)

    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    # copied columns must survive the pipeline (np alias used consistently)
    assert np.all([c in d_app_res.columns for c in cols_to_copy])
# Example 5
def test_unexpected_nan():
    """NaN handling must be correct at transform time, even when no NaN was
    seen in the training data."""
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 2, 3, 4, 5, 6]})

    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )

    d_treated = transform.fit_transform(d, d["y"])
    # clean training data: only the clean copy of x is produced
    assert transform.score_frame_.shape[0] == 1
    assert "x" in set(transform.score_frame_["variable"])

    # numpy.NAN was removed in NumPy 2.0; numpy.nan is the canonical spelling
    d_app = pandas.DataFrame({"x": [1, 2, numpy.nan, 4, None, 6]})
    assert numpy.any(numpy.isnan(d_app["x"]))
    d_app_treated = transform.transform(d_app)
    assert not numpy.any(numpy.isnan(d_app_treated["x"]))
# Example 6
def test_classification():
    """Fit a numeric treatment on synthetic data with a NaN level in a
    categorical column; the clean copy of x should come out recommended."""
    numpy.random.seed(46546)

    def make_data(nrows):
        """Synthetic frame: y is a noisy function of x; xc is y bucketed
        into levels with one level replaced by NaN."""
        d = pandas.DataFrame({"x": [0.1 * i for i in range(500)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(5000)

    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",  # outcome variable
        cols_to_copy=[
            "yc"
        ],  # columns to "carry along" but not treat as input variables
    )

    d_prepared = transform.fit_transform(d, d["y"])

    for c in d_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_prepared[c])
        assert sum(vtreat.util.is_bad(d_prepared[c])) == 0

    dtest = make_data(450)

    dtest_prepared = transform.transform(dtest)

    for c in dtest_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(dtest_prepared[c])
        assert sum(vtreat.util.is_bad(dtest_prepared[c])) == 0

    sf = transform.score_frame_

    # rebind instead of reset_index(inplace=True) on a .loc slice, which
    # mutates a view and triggers pandas chained-assignment warnings
    xrow = sf.loc[numpy.logical_and(sf.variable == "x", sf.treatment ==
                                    "clean_copy"), :].reset_index(drop=True)

    assert xrow.recommended[0]
# Example 7
def test_db_adapter_general():
    """End-to-end check that a fitted vtreat treatment, exported as a
    description matrix, can be replayed both as a data_algebra pipeline and
    as SQL in SQLite, each matching treatment.transform() exactly.
    """

    # set up example data
    def mk_data(
        n_rows: int = 100,
        *,
        outcome_name: str = "y",
        n_cat_vars: int = 5,
        n_num_vars: int = 5,
        add_unknowns: bool = False,
    ):
        # y is a noisy sum of per-level effects (categorical vars) and
        # step-scaled numeric vars; every input column also gets missing
        # values so missing-value handling is exercised.
        step = 1 / np.sqrt(n_cat_vars + n_num_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_cat_vars):
            vname = f"vc_{i}"
            levels = ["a", "b", "c", "none"]
            if add_unknowns:
                # "d" is a level never seen in training data
                levels = levels + ["d"]
            level_values = {
                v: step * np.random.normal(size=1)[0]
                for v in levels
            }
            v = np.random.choice(levels, replace=True, size=n_rows)
            y = y + np.array([level_values[vi] for vi in v])
            # the "none" level encodes a missing categorical value
            v = np.array([vi if vi != "none" else None for vi in v])
            cols[vname] = v
        for i in range(n_num_vars):
            vname = f"vn_{i}"
            v = np.random.normal(size=n_rows)
            y = y + step * v
            # knock out ~24% of entries to exercise numeric imputation
            v[np.random.uniform(size=n_rows) < 0.24] = None
            cols[vname] = v

        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        d = pd.DataFrame(cols)
        # stable row key so results can be matched across back ends
        d["orig_index"] = range(d.shape[0])
        return d, outcome_name, vars

    d, outcome_name, vars = mk_data(100)
    # application data includes unknown levels to test novel-value handling
    d_app, _, _ = mk_data(50, add_unknowns=True)
    cols_to_copy = [outcome_name, "orig_index"]
    columns = vars + cols_to_copy

    # get reference result
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)

    # test ops path
    transform_as_data = treatment.description_matrix()
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=["orig_index"],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)

    # test ops db path: same pipeline compiled to SQL and run in SQLite
    source_descr = TableDescription(
        table_name="d_app",
        column_names=columns,
    )
    db_handle = data_algebra.SQLite.example_handle()
    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")
    db_handle.execute("CREATE TABLE res AS " + db_handle.to_sql(ops))
    # order by the row key so frames compare row-for-row
    res_db = db_handle.read_query("SELECT * FROM res ORDER BY orig_index")
    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)
    db_handle.close()
# Example 8
def test_user_coders():
    """Exercise the vtreat user-transform extension point with a custom
    polynomial-fit coder, via fit / transform / fit_transform.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # avoid depending on sklearn.metrics.r2_score
        def r_squared(*, y_true, y_pred):
            """Coefficient of determination: 1 - SS_res / SS_tot."""
            y_true = numpy.asarray(y_true)
            y_pred = numpy.asarray(y_pred)
            return 1 - numpy.sum((y_true - y_pred) ** 2) / numpy.sum(
                (y_true - numpy.mean(y_true)) ** 2
            )

        # %%

        class PolyTransform(vtreat.transform.UserTransform):
            """a polynomial model"""

            def __init__(self, *, deg=5, alpha=0.1):
                # deg: maximum polynomial degree; alpha: Ridge regularization
                vtreat.transform.UserTransform.__init__(self, treatment="poly")
                self.models_ = None
                self.deg = deg
                self.alpha = alpha

            def poly_terms(self, vname, vec):
                """Frame of powers of vec: an 'x' column plus vname_1..vname_deg.

                NOTE(review): the raw 'x' column is retained alongside
                vname_1 (which duplicates it) — presumably intentional;
                it only affects the Ridge fit's inputs, not the test.
                """
                vec = numpy.asarray(vec)
                r = pandas.DataFrame({"x": vec})
                for d in range(1, self.deg + 1):
                    r[vname + "_" + str(d)] = vec ** d
                return r

            def fit(self, X, y):
                """Fit one Ridge model per numeric column of X; returns self."""
                self.models_ = {}
                self.incoming_vars_ = []
                self.derived_vars_ = []
                for v in X.columns:
                    # only numeric-convertible columns get a polynomial model
                    if vtreat.util.can_convert_v_to_numeric(X[v]):
                        X_v = self.poly_terms(v, X[v])
                        model_v = sklearn.linear_model.Ridge(alpha=self.alpha).fit(
                            X_v, y
                        )
                        new_var = v + "_poly"
                        # store (model, term-column names, derived-variable name)
                        self.models_[v] = (model_v, [c for c in X_v.columns], new_var)
                        self.incoming_vars_.append(v)
                        self.derived_vars_.append(new_var)
                return self

            def transform(self, X):
                """Return a frame of model predictions, one column per fitted var."""
                r = pandas.DataFrame()
                for k, v in self.models_.items():
                    model_k = v[0]
                    cols_k = v[1]
                    new_var = v[2]
                    X_k = self.poly_terms(k, X[k])
                    xform_k = model_k.predict(X_k)
                    r[new_var] = xform_k
                return r

        # %%

        # synthetic data: y is a noisy sine of x
        d = pandas.DataFrame({"x": [i for i in range(100)]})
        d["y"] = numpy.sin(0.2 * d["x"]) + 0.2 * numpy.random.normal(size=d.shape[0])
        d.head()

        # %%

        # standalone use of the user coder, outside vtreat
        step = PolyTransform(deg=10)

        # %%

        fit = step.fit_transform(d[["x"]], d["y"])
        fit["x"] = d["x"]
        fit.head()

        # %%

        # seaborn.scatterplot(x='x', y='y', data=d)
        # seaborn.lineplot(x='x', y='x_poly', data=fit, color='red', alpha=0.5)

        # %%

        # same coder plugged into a vtreat treatment via user_transforms
        transform = vtreat.NumericOutcomeTreatment(
            outcome_name="y",
            params=vtreat.vtreat_parameters(
                {
                    "filter_to_recommended": False,
                    "user_transforms": [PolyTransform(deg=10)],
                }
            ),
        )

        # %%

        transform.fit(d, d["y"])

        # %%

        transform.score_frame_

        # %%

        # plain transform after fit: in-sample (over-fit) predictions
        x2_overfit = transform.transform(d)

        # %%
        # seaborn.scatterplot(x='x', y='y', data=x2_overfit)
        # seaborn.lineplot(x='x', y='x_poly', data=x2_overfit, color='red', alpha=0.5)

        # %%

        # fit_transform: cross-validated frame from the same data
        x2 = transform.fit_transform(d, d["y"])

        # %%

        transform.score_frame_

        # %%

        x2.head()