Пример #1
0
def test_selection_transform(df):
    selector = ColumnSelector(["x", "y"])
    op = SelectionOp(selector)

    result_df = op.transform(ColumnSelector(), df)

    assert (result_df.columns == ["x", "y"]).all()
Пример #2
0
def test_selection_output_column_names(df):
    selector = ColumnSelector(["x", "y"])
    op = SelectionOp(selector)

    result_selector = op.output_column_names(ColumnSelector())

    assert result_selector.names == ["x", "y"]
Пример #3
0
def test_selection_output_schema(df):
    selector = ColumnSelector(["x", "y"])
    schema = Schema([ColumnSchema(col) for col in df.columns])
    op = SelectionOp(selector)

    result_schema = op.compute_output_schema(schema, ColumnSelector())

    assert result_schema.column_names == ["x", "y"]
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)
def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"
                                   ]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == [
        "TE_x_cost_renamed", "TE_y_cost_renamed"
    ]
def test_fit_schema_works_with_addition_nodes():
    schema = Schema(["x", "y", "id"])

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + "y")
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y"]

    x_node = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    y_node = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")

    workflow = Workflow(x_node + y_node)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["x_renamed", "y_renamed"]
def test_fit_schema_works_with_raw_column_dependencies():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]
def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(
        ["x", "y"]) >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp
                     >> ops.Normalize() >> ops.Rename(postfix="_renamed"))

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
def test_fit_schema_works_with_grouped_node_inputs():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y",
                                   ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert sorted(workflow1.output_schema.column_names) == sorted(
        ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"])
def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(schema.column_names) >> ops.FillMissing()
                     >> ops.Clip(min_value=0) >> ops.LogOp >> ops.Normalize()
                     >> ops.Rename(postfix="_renamed"))

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == [
        "x_renamed", "y_renamed", "id_renamed"
    ]