def test_selection_transform(df):
    """Check that SelectionOp.transform returns the columns named by its selector."""
    op = SelectionOp(ColumnSelector(["x", "y"]))

    # The selector passed to transform() is empty; the op's own selector
    # is the one carrying the column names under test.
    transformed = op.transform(ColumnSelector(), df)

    expected_columns = ["x", "y"]
    assert (transformed.columns == expected_columns).all()
def test_selection_output_column_names(df):
    """Check that SelectionOp.output_column_names yields the op's selector names."""
    # NOTE: `df` is not used in the body; it is kept so the test's
    # signature stays unchanged.
    op = SelectionOp(ColumnSelector(["x", "y"]))

    output_selector = op.output_column_names(ColumnSelector())

    assert output_selector.names == ["x", "y"]
def test_selection_output_schema(df):
    """Check that SelectionOp.compute_output_schema keeps only the selected columns."""
    wanted = ColumnSelector(["x", "y"])
    # Build an input schema with one ColumnSchema per dataframe column.
    input_schema = Schema([ColumnSchema(name) for name in df.columns])
    op = SelectionOp(wanted)

    output_schema = op.compute_output_schema(input_schema, ColumnSelector())

    assert output_schema.column_names == ["x", "y"]
def test_workflow_select_by_tags(op):
    """Check the output column count of a workflow whose input is selected by tag.

    Columns ``col1`` and ``col2`` carry the tag ``"c"``; the workflow's output
    schema is expected to have as many columns as ``op`` reports when given
    those two columns by name.
    """
    input_schema = Schema(
        [
            ColumnSchema("col1", tags=["b", "c", "d"]),
            ColumnSchema("col2", tags=["c", "d"]),
            ColumnSchema("col3", tags=["d"]),
        ]
    )

    tagged_node = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(tagged_node)
    workflow.fit_schema(input_schema)

    # Reference result: ask the op directly, naming the tagged columns explicitly.
    by_name = op.output_column_names(ColumnSelector(["col1", "col2"]))

    actual_count = len(workflow.output_schema.column_names)
    expected_count = len(by_name.names)
    assert actual_count == expected_count
def test_fit_schema_works_with_node_dependencies():
    """Check fit_schema when TargetEncoding is given another workflow node as its target."""
    input_schema = Schema(["x", "y", "cost"])

    # The renamed "cost" node is passed to TargetEncoding as a dependency.
    renamed_cost = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    encoded = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(renamed_cost)

    workflow1 = Workflow(encoded)
    workflow1.fit_schema(input_schema)

    expected_names = ["TE_x_cost_renamed", "TE_y_cost_renamed"]
    assert workflow1.output_schema.column_names == expected_names
def test_fit_schema_works_with_addition_nodes():
    """Check fit_schema on workflows built by adding nodes with ``+``.

    Two scenarios are covered: node + raw column name, and node + node.
    """
    input_schema = Schema(["x", "y", "id"])

    # Scenario 1: a renamed node combined with a plain column name.
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + "y")
    workflow.fit_schema(input_schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    # Scenario 2: two renamed nodes combined (fresh nodes are built for this case).
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    renamed_y = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")
    workflow = Workflow(renamed_x + renamed_y)
    workflow.fit_schema(input_schema)
    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_fit_schema_works_with_raw_column_dependencies():
    """Check fit_schema when TargetEncoding is given a raw column name as its target."""
    input_schema = Schema(["x", "y", "cost"])

    encoded = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(encoded)
    workflow.fit_schema(input_schema)

    expected_names = ["TE_x_cost", "TE_y_cost"]
    assert workflow.output_schema.column_names == expected_names
def test_fit_schema_works_when_subtracting_column_names():
    """Check fit_schema on a workflow where an output column is removed with ``-``."""
    input_schema = Schema(["x", "y", "id"])

    # Build the pipeline one stage at a time, in the same left-to-right order
    # as a single chained ``>>`` expression.
    node = ColumnSelector(["x", "y"])
    node = node >> ops.FillMissing()
    node = node >> ops.Clip(min_value=0)
    node = node >> ops.LogOp  # passed as the class itself, not an instance
    node = node >> ops.Normalize()
    node = node >> ops.Rename(postfix="_renamed")

    # Subtract by the post-rename column name.
    workflow1 = Workflow(node - "y_renamed")
    workflow1.fit_schema(input_schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_fit_schema_works_with_grouped_node_inputs():
    """Check fit_schema when the selector contains a grouped (tuple) entry."""
    input_schema = Schema(["x", "y", "cost"])

    # The selector mixes two single columns with the ("x", "y") group.
    grouped_selector = ColumnSelector(["x", "y", ("x", "y")])
    encoded = grouped_selector >> ops.TargetEncoding("cost")

    workflow1 = Workflow(encoded)
    workflow1.fit_schema(input_schema)

    # Order is not part of the expectation, so compare sorted name lists.
    expected_names = sorted(["TE_x_cost", "TE_y_cost", "TE_x_y_cost"])
    actual_names = sorted(workflow1.output_schema.column_names)
    assert actual_names == expected_names
def test_fit_schema():
    """Check fit_schema on a multi-stage pipeline applied to every schema column."""
    input_schema = Schema(["x", "y", "id"])

    # Build the pipeline one stage at a time, in the same left-to-right order
    # as a single chained ``>>`` expression.
    node = ColumnSelector(input_schema.column_names)
    node = node >> ops.FillMissing()
    node = node >> ops.Clip(min_value=0)
    node = node >> ops.LogOp  # passed as the class itself, not an instance
    node = node >> ops.Normalize()
    node = node >> ops.Rename(postfix="_renamed")

    workflow = Workflow(node)
    workflow.fit_schema(input_schema)

    expected_names = ["x_renamed", "y_renamed", "id_renamed"]
    assert workflow.output_schema.column_names == expected_names