def test_onehot_task(): x1 = np.random.choice(["a", "b", "c"], size=1000) x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) task = OneHot() res = task.fit_transform(data) assert res.column_names == [ "x1_0", "x1_1", "x1_2", "x2_0", "x2_1", "x2_2", "x2_3", "x2_4", "x2_5", ] assert all([x == ColumnType(VarType.NUM) for x in res.column_types]) expected = OneHotEncoder().fit_transform(data.X) assert np.all(np.isclose(res.X.todense(), expected.todense()))
def test_ordcat_task(use_other): x1 = np.random.choice(["a", "b", "c"], size=1000) x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) task = OrdCat(min_support=0, use_other=use_other, handle_unknown="error") res = task.fit_transform(data) assert res.column_names == ["x1", "x2"] assert res.column_types == [ ColumnType(VarType.CAT, level=5 if use_other else 4), ColumnType(VarType.CAT, level=8 if use_other else 7), ] expected = OrdinalEncoder().fit_transform(data.X) if use_other: expected = expected + 2 else: expected = expected + 1 assert np.all(np.isclose(res.X, expected))
def test_ordcat_task_handle_unknown(): x1 = np.random.choice(["a", "b", "c"], size=1000) x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) task = OrdCat(min_support=0, use_other=False, handle_unknown="missing") res = task.fit_transform(data) assert res.column_names == ["x1", "x2"] assert res.column_types == [ ColumnType(VarType.CAT, level=4), ColumnType(VarType.CAT, level=7), ] expected = OrdinalEncoder().fit_transform(data.X) expected = expected + 1 assert np.all(np.isclose(res.X, expected)) # transform with new categories x1 = np.random.choice(["a", "c", "d"], size=1000) x2 = np.random.choice(["2", "3", "5", "6", "7"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) new_data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) res = task.transform(new_data) mask = x1 == "d" results = res.X[:, 0][mask] assert np.unique(results) == np.array([0]) mask = x2 == "7" results = res.X[:, 1][mask] assert np.unique(results) == np.array([0])
def test_ordcat_task_handle_unknown_error(): x1 = np.random.choice(["a", "b", "c"], size=1000) x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) task = OrdCat(min_support=0, use_other=False, handle_unknown="error") res = task.fit_transform(data) assert res.column_names == ["x1", "x2"] assert res.column_types == [ ColumnType(VarType.CAT, level=4), ColumnType(VarType.CAT, level=7), ] expected = OrdinalEncoder().fit_transform(data.X) expected = expected + 1 assert np.all(np.isclose(res.X, expected)) # transform with new categories x1 = np.random.choice(["a", "c", "d"], size=1000) x2 = np.random.choice(["2", "3", "5", "6", "7"], size=1000) x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) new_data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) with pytest.raises(ValueError) as excinfo: res = task.transform(new_data) assert "Found unknown categories" == str(excinfo.value)
def test_imputer_wrapper_multiple_cols(): xs = [] for i in range(3): x = np.random.random((1000, 1)) nans = np.random.choice(x.shape[0], size=100) x[nans] = np.nan xs.append(x) x = np.concatenate(xs, axis=1) data = TaskData(X=x, column_names=["x1", "x2", "x3"], column_types=[0]) task = Wrap(SimpleImputer(strategy="median", add_indicator=True)) res = task.fit_transform(data) assert res.X.shape[1] == 6 assert res.column_names == ["SimpleImputer-{}".format(i) for i in range(6)]
def test_date_features_extractor_task(): x1 = pd.date_range(start="2020-10-17", periods=5000, freq="5D") x2 = pd.date_range(start="2007-06-06", periods=5000, freq="21S") x = np.hstack([np.reshape(x1, (-1, 1)), np.reshape(x2, (-1, 1))]) data = TaskData( X=x, column_names=["x1", "x2"], column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)], ) task = DateFeatures() res = task.fit_transform(data) assert res.X.shape[1] == len(task.COMPONENTS) * 2 x1_cols = [c for c in res.column_names if c.startswith("x1")] x2_cols = [c for c in res.column_names if c.startswith("x2")] x1_data = take_columns(res, x1_cols) x2_data = take_columns(res, x2_cols) extractors = { "year": lambda x: x.year, "month": lambda x: x.month, "week": lambda x: x.week, "day_of_month": lambda x: x.day, "day_of_week": lambda x: x.dayofweek, "hour": lambda x: x.hour, "minute": lambda x: x.minute, "second": lambda x: x.second, } for actual, expected, name in [(x1_data, x1, "x1"), (x2_data, x2, "x2")]: for k, v in extractors.items(): feature_name = "{} - {}".format(name, k) assert feature_name in actual.column_names expected_val = np.array([v(x) for x in expected]) idx = actual.column_names.index(feature_name) assert np.all(np.isclose(actual.X[:, idx], expected_val)) idx = actual.column_names.index(feature_name) col_type = actual.column_types[idx] if "year" in feature_name: assert col_type.var_type == VarType.NUM else: assert col_type.var_type == VarType.CAT
def test_imputer_wrapper(): x = np.random.random((1000, 1)) nans = np.random.choice(x.shape[0], size=100) x[nans] = np.nan data = TaskData(X=x, column_names=["x"], column_types=[0]) task = Wrap(SimpleImputer(strategy="constant", fill_value=-1)) res = task.fit_transform(data) assert np.unique(res.X[nans]).shape[0] == 1 assert np.unique(res.X[nans])[0] == -1 task = Wrap(SimpleImputer(strategy="mean")) res = task.fit_transform(data) assert np.unique(res.X[nans]).shape[0] == 1 assert np.isclose(np.unique(res.X[nans])[0], np.mean(x[~np.isnan(x)])) task = Wrap(SimpleImputer(strategy="median", add_indicator=True)) res = task.fit_transform(data) assert res.X.shape[1] == 2 assert np.all(np.isclose(np.unique(res.X[:, 1][nans]), np.array([1]))) assert np.isclose( np.unique(res.X[:, 0][nans])[0], np.median(x[~np.isnan(x)]))
def test_target_lag(order): x = np.random.random((1000, )) y = np.random.random((1000, )) data = TaskData( X=x, y=y, column_names=["x"], column_types=[ColumnType(VarType.NUM)], ) df = pd.DataFrame({"x": x, "y": y}) df["lag"] = df["y"].shift(order) lag = df["lag"].values task = TargetLag(order=order, handle_nan="drop") new_data = task.fit_transform(data) lag_index = new_data.column_names.index(TargetLag.PATTERN.format(order)) assert np.all(np.isclose(lag[order:], new_data.X[:, lag_index])) assert len(new_data.column_names) == 2 assert len(new_data.column_types) == 2 assert new_data.column_names[-1] == TargetLag.PATTERN.format(order) assert new_data.column_types[-1] == ColumnType(VarType.LAG)