def test_onehot_task():
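    """OneHot should expand each categorical column into one indicator column
    per observed level, matching sklearn's OneHotEncoder output."""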
    x1 = np.random.choice(["a", "b", "c"], size=1000)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )

    task = OneHot()
    res = task.fit_transform(data)
    assert res.column_names == [
        "x1_0",
        "x1_1",
        "x1_2",
        "x2_0",
        "x2_1",
        "x2_2",
        "x2_3",
        "x2_4",
        "x2_5",
    ]
    assert all([x == ColumnType(VarType.NUM) for x in res.column_types])

    expected = OneHotEncoder().fit_transform(data.X)
    assert np.all(np.isclose(res.X.todense(), expected.todense()))


@pytest.mark.parametrize("use_other", [True, False])
def test_ordcat_task(use_other):
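    """OrdCat should ordinally encode each column; the reported level counts
    and the code offset grow by one when an explicit "other" bucket is used."""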
    x1 = np.random.choice(["a", "b", "c"], size=1000)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )

    task = OrdCat(min_support=0, use_other=use_other, handle_unknown="error")
    res = task.fit_transform(data)
    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=5 if use_other else 4),
        ColumnType(VarType.CAT, level=8 if use_other else 7),
    ]

    expected = OrdinalEncoder().fit_transform(data.X)
    if use_other:
        expected = expected + 2
    else:
        expected = expected + 1
    assert np.all(np.isclose(res.X, expected))


def test_pipeline_numeric():
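    """A numeric Pipeline of mean imputation followed by standardization should
    reproduce the equivalent sklearn SimpleImputer + StandardScaler output."""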
    df = create_dataset(num=10, cat=0, size=5000)

    train_df = df.iloc[:-1000].copy()
    y_train = train_df.pop("target").values

    test_df = df.iloc[-1000:].copy()
    y_test = test_df.pop("target").values

    num_pipeline = Pipeline(steps=[
        Step(name="impute", task=Wrap(SimpleImputer(strategy="mean"))),
        Step(
            name="standatize",
            task=Wrap(StandardScaler(), type_override=VarType.NUM),
        ),
    ])

    train = num_pipeline.fit_transform(to_task_data(train_df, y_train))
    test = num_pipeline.transform(to_task_data(test_df, y_test))

    assert all([x == ColumnType(VarType.NUM) for x in train.column_types])
    assert all([x == ColumnType(VarType.NUM) for x in test.column_types])

    imputer = SimpleImputer(strategy="mean")
    scaler = StandardScaler()
    res = imputer.fit_transform(train_df.values)
    train_expected = scaler.fit_transform(res)
    res = imputer.transform(test_df.values)
    test_expected = scaler.transform(res)

    assert np.all(np.isclose(train_expected, train.X))
    assert np.all(np.isclose(test_expected, test.X))


def test_date_features_extractor_task():
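    """DateFeatures should derive one column per date component for each input
    column, typing the year component as numeric and the rest as categorical."""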
    x1 = pd.date_range(start="2020-10-17", periods=5000, freq="5D")
    x2 = pd.date_range(start="2007-06-06", periods=5000, freq="21S")

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )

    task = DateFeatures()
    res = task.fit_transform(data)

    assert res.X.shape[1] == len(task.COMPONENTS) * 2

    x1_cols = [c for c in res.column_names if c.startswith("x1")]
    x2_cols = [c for c in res.column_names if c.startswith("x2")]

    x1_data = take_columns(res, x1_cols)
    x2_data = take_columns(res, x2_cols)

    extractors = {
        "year": lambda x: x.year,
        "month": lambda x: x.month,
        "week": lambda x: x.week,
        "day_of_month": lambda x: x.day,
        "day_of_week": lambda x: x.dayofweek,
        "hour": lambda x: x.hour,
        "minute": lambda x: x.minute,
        "second": lambda x: x.second,
    }
    for actual, expected, name in [(x1_data, x1, "x1"), (x2_data, x2, "x2")]:
        for k, v in extractors.items():
            feature_name = "{} - {}".format(name, k)
            assert feature_name in actual.column_names
            expected_val = np.array([v(x) for x in expected])
            idx = actual.column_names.index(feature_name)
            assert np.all(np.isclose(actual.X[:, idx], expected_val))

            col_type = actual.column_types[idx]

            if "year" in feature_name:
                assert col_type.var_type == VarType.NUM
            else:
                assert col_type.var_type == VarType.CAT


# Assumed to be a pytest fixture consumed by tests elsewhere in the module.
@pytest.fixture
def data():
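    """Random TaskData with five feature columns of uniform noise, default
    column types, and a continuous target."""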
    random_state = np.random.RandomState(RANDOM_SEED)

    x = random_state.random((1000, 5))
    y = random_state.random((1000, ))

    return TaskData(
        X=x,
        column_names=["x1", "x2", "x3", "x4", "x5"],
        column_types=[
            ColumnType(),
            ColumnType(),
            ColumnType(),
            ColumnType(),
            ColumnType(),
        ],
        y=y,
    )


# Lag orders here are representative values assumed for parametrization.
@pytest.mark.parametrize("order", [1, 2, 3])
def test_target_lag(order):
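    """TargetLag should append the target shifted by `order` as a new LAG-typed
    column and drop the leading rows that the shift turns into NaN."""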
    x = np.random.random((1000, ))
    y = np.random.random((1000, ))

    data = TaskData(
        X=x,
        y=y,
        column_names=["x"],
        column_types=[ColumnType(VarType.NUM)],
    )

    df = pd.DataFrame({"x": x, "y": y})
    df["lag"] = df["y"].shift(order)
    lag = df["lag"].values

    task = TargetLag(order=order, handle_nan="drop")
    new_data = task.fit_transform(data)

    lag_index = new_data.column_names.index(TargetLag.PATTERN.format(order))
    assert np.all(np.isclose(lag[order:], new_data.X[:, lag_index]))
    assert len(new_data.column_names) == 2
    assert len(new_data.column_types) == 2
    assert new_data.column_names[-1] == TargetLag.PATTERN.format(order)
    assert new_data.column_types[-1] == ColumnType(VarType.LAG)


def test_ordcat_task_handle_unknown():
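    """With handle_unknown="missing", categories unseen during fit should be
    mapped to the reserved index 0 at transform time."""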
    x1 = np.random.choice(["a", "b", "c"], size=1000)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )

    task = OrdCat(min_support=0, use_other=False, handle_unknown="missing")
    res = task.fit_transform(data)
    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=4),
        ColumnType(VarType.CAT, level=7),
    ]

    expected = OrdinalEncoder().fit_transform(data.X)
    expected = expected + 1
    assert np.all(np.isclose(res.X, expected))

    # transform with new categories
    x1 = np.random.choice(["a", "c", "d"], size=1000)
    x2 = np.random.choice(["2", "3", "5", "6", "7"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    new_data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )
    res = task.transform(new_data)

    mask = x1 == "d"
    results = res.X[:, 0][mask]
    assert np.unique(results) == np.array([0])

    mask = x2 == "7"
    results = res.X[:, 1][mask]
    assert np.unique(results) == np.array([0])


def test_ordcat_task_handle_unknown_error():
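    """With handle_unknown="error", transforming data that contains categories
    unseen during fit should raise a ValueError."""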
    x1 = np.random.choice(["a", "b", "c"], size=1000)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )

    task = OrdCat(min_support=0, use_other=False, handle_unknown="error")
    res = task.fit_transform(data)
    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=4),
        ColumnType(VarType.CAT, level=7),
    ]

    expected = OrdinalEncoder().fit_transform(data.X)
    expected = expected + 1
    assert np.all(np.isclose(res.X, expected))

    # transform with new categories
    x1 = np.random.choice(["a", "c", "d"], size=1000)
    x2 = np.random.choice(["2", "3", "5", "6", "7"], size=1000)

    x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))])
    new_data = TaskData(
        X=x,
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM),
                      ColumnType(VarType.NUM)],
    )
    with pytest.raises(ValueError) as excinfo:
        task.transform(new_data)
    assert "Found unknown categories" in str(excinfo.value)


def test_date_pipeline():
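    """A date Pipeline should extract date components, scale the numeric ones,
    and ordinally encode the categorical ones, matching the equivalent manual
    sklearn transforms."""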
    df = create_dataset(num=0, cat=0, date=5, target=False, size=5000)
    train_df = df.iloc[:-1000]
    test_df = df.iloc[-1000:]

    date_pipeline = Pipeline(steps=[
        Step("date", DateFeatures()),
        Step(
            "derived_processing",
            ColumnsProcessor(branches=[
                Step("num_derived",
                     Wrap(StandardScaler()),
                     types=[VarType.NUM]),
                Step(
                    "cat_derived",
                    OrdCat(min_support=0, use_other=False),
                    types=[VarType.CAT],
                ),
            ]),
        ),
    ])
    train = date_pipeline.fit_transform(to_task_data(train_df))
    test = date_pipeline.transform(to_task_data(test_df))

    for data in [train, test]:
        assert data.column_types[:5] == [ColumnType(VarType.NUM)] * 5
        assert set([c.var_type
                    for c in data.column_types[5:]]) == set([VarType.CAT])
        assert all([c.level > 0 for c in data.column_types[5:]])

    date_features = DateFeatures()
    dates_train = date_features.fit_transform(to_task_data(train_df))
    dates_test = date_features.transform(to_task_data(test_df))

    num_train = take_columns(dates_train, types=[VarType.NUM])
    cat_train = take_columns(dates_train, types=[VarType.CAT])
    scaler = StandardScaler()
    enc = OrdinalEncoder()

    num_train = scaler.fit_transform(num_train.X)
    cat_train = enc.fit_transform(cat_train.X)
    cat_train = cat_train + 1

    assert np.all(
        np.isclose(num_train,
                   take_columns(train, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_train,
                   take_columns(train, types=[VarType.CAT]).X))

    num_test = take_columns(dates_test, types=[VarType.NUM])
    cat_test = take_columns(dates_test, types=[VarType.CAT])
    num_test = scaler.transform(num_test.X)
    cat_test = enc.transform(cat_test.X)
    cat_test = cat_test + 1

    assert np.all(
        np.isclose(num_test,
                   take_columns(test, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_test,
                   take_columns(test, types=[VarType.CAT]).X))