def test_difference_lag():
    """DifferenceLag computes partition-wise timestamp diffs for shift=1 and shift=-1."""
    df = cudf.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1, 2],
            "timestamp": [1000, 1005, 1100, 2000, 2001, 3000],
        }
    )
    diff_features = ["timestamp"] >> ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1])

    dataset = nvt.Dataset(df)
    workflow = nvtabular.Workflow(diff_features)
    workflow.fit(dataset)
    new_gdf = workflow.transform(dataset).to_ddf().compute()

    # Null sentinel differs by cudf version: cudf.NA on recent releases, None on older ones.
    null_value = cudf.NA if hasattr(cudf, "NA") else None

    lag_forward = new_gdf["timestamp_difference_lag_1"]
    assert lag_forward[0] is null_value  # first row of each partition has no predecessor
    assert lag_forward[1] == 5
    assert lag_forward[2] == 95
    assert lag_forward[3] is null_value

    lag_backward = new_gdf["timestamp_difference_lag_-1"]
    assert lag_backward[0] == -5
    assert lag_backward[1] == -95
    assert lag_backward[2] is null_value  # last row of each partition has no successor
    assert lag_backward[3] == -1
    assert lag_backward[5] is null_value
def test_difference_lag(cpu):
    """DifferenceLag diffs timestamps within each userid partition, on CPU and GPU."""
    frame_lib = pd if cpu else cudf
    df = frame_lib.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1, 2],
            "timestamp": [1000, 1005, 1100, 2000, 2001, 3000],
        }
    )
    diff_features = ["timestamp"] >> ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1])

    dataset = nvt.Dataset(df, cpu=cpu)
    workflow = nvtabular.Workflow(diff_features)
    workflow.fit(dataset)
    new_df = workflow.transform(dataset).to_ddf().compute()

    def _assert_null(series, idx):
        # pandas nulls are NaN (checked via isna); cudf nulls are cudf.NA on
        # recent releases and None on older ones.
        if cpu:
            assert frame_lib.isna(series[idx])
        else:
            assert series[idx] is (frame_lib.NA if hasattr(frame_lib, "NA") else None)

    shifted_down = new_df["timestamp_difference_lag_1"]
    assert shifted_down[1] == 5
    assert shifted_down[2] == 95
    _assert_null(shifted_down, 0)  # first row of each partition has no predecessor
    _assert_null(shifted_down, 3)

    shifted_up = new_df["timestamp_difference_lag_-1"]
    assert shifted_up[0] == -5
    assert shifted_up[1] == -95
    assert shifted_up[3] == -1
    _assert_null(shifted_up, 2)  # last row of each partition has no successor
    _assert_null(shifted_up, 5)
def test_difference_lag():
    """DifferenceLag (legacy apply_op API) diffs timestamps within userid groups.

    Fix: the null checks previously used a bare ``is None``, which fails on
    cudf releases where null scalars surface as ``cudf.NA``; guard with
    ``hasattr(cudf, "NA")`` as the other DifferenceLag tests do.
    """
    df = cudf.DataFrame(
        {"userid": [0, 0, 0, 1, 1, 2], "timestamp": [1000, 1005, 1100, 2000, 2001, 3000]}
    )
    columns = ["userid", "timestamp"]
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    op = ops.DifferenceLag("userid", columns=["timestamp"])
    new_gdf = op.apply_op(df, columns_ctx, "all", target_cols=["timestamp"])

    # Null sentinel differs by cudf version: cudf.NA on recent releases,
    # None on older ones.
    null_value = cudf.NA if hasattr(cudf, "NA") else None
    assert new_gdf["timestamp_DifferenceLag"][0] is null_value  # first row of each partition
    assert new_gdf["timestamp_DifferenceLag"][1] == 5
    assert new_gdf["timestamp_DifferenceLag"][2] == 95
    assert new_gdf["timestamp_DifferenceLag"][3] is null_value
"TE_x_cost_renamed", "TE_y_cost_renamed" ] # initial column selector works with tags # filter within the workflow by tags # test tags correct at output @pytest.mark.parametrize( "op", [ ops.Bucketize([1]), ops.Rename(postfix="_trim"), ops.Categorify(), ops.Categorify(encode_type="combo"), ops.Clip(0), ops.DifferenceLag("col1"), ops.FillMissing(), ops.Groupby("col1"), ops.HashBucket(1), ops.HashedCross(1), ops.JoinGroupby("col1"), ops.ListSlice(0), ops.LogOp(), ops.Normalize(), ops.TargetEncoding("col1"), ], ) def test_workflow_select_by_tags(op): schema1 = ColumnSchema("col1", tags=["b", "c", "d"]) schema2 = ColumnSchema("col2", tags=["c", "d"]) schema3 = ColumnSchema("col3", tags=["d"])
import nvtabular as nvt from nvtabular import ColumnSchema, ColumnSelector, Schema, dispatch, ops from nvtabular.dispatch import HAS_GPU @pytest.mark.parametrize("properties", [{}, {"p1": "1"}]) @pytest.mark.parametrize("tags", [[], ["TAG1", "TAG2"]]) @pytest.mark.parametrize( "op", [ ops.Bucketize([1]), ops.Rename(postfix="_trim"), ops.Categorify(), ops.Categorify(encode_type="combo"), ops.Clip(0), ops.DifferenceLag("1"), ops.FillMissing(), ops.Groupby(["1"]), ops.HashBucket(1), ops.HashedCross(1), ops.JoinGroupby(["1"]), ops.ListSlice(0), ops.LogOp(), ops.Normalize(), ops.TargetEncoding(["1"]), ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}), ops.ValueCount(), ], ) @pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]]) def test_schema_out(tags, properties, selection, op):