Example #1
    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        """Given a set of schemas and a column selector for the input columns,
        returns a set of schemas for the transformed columns this operator will produce
        Parameters
        -----------
        input_schema: Schema
            The schemas of the columns to apply this operator to
        col_selector: ColumnSelector
            The column selector to apply to the input schema
        Returns
        -------
        Schema
            The schemas of the columns produced by this operator
        """
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)

        if col_selector.tags:
            tags_col_selector = ColumnSelector(tags=col_selector.tags)
            filtered_schema = input_schema.apply(tags_col_selector)
            col_selector += ColumnSelector(filtered_schema.column_names)

            # zero tags because already filtered
            col_selector._tags = []

        col_selector = self.output_column_names(col_selector)

        for column_name in col_selector.names:
            if column_name not in input_schema.column_schemas:
                input_schema += Schema([column_name])

        output_schema = Schema()
        for column_schema in input_schema.apply(col_selector):
            output_schema += Schema([self.transformed_schema(column_schema)])
        return output_schema
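Note on the snippet above: when the selector carries tags, those tags are first resolved against the input schema and the matching column names are folded back into the selector, after which the tags are cleared. A minimal self-contained sketch of that resolution step, using a plain dict in place of Schema and plain lists in place of ColumnSelector (hypothetical stand-ins, not the library classes):

def resolve_selector(column_tags, selected_names, selected_tags):
    # column_tags: {column name: set of tags}, playing the role of the input Schema
    # selected_names / selected_tags: the two halves of a ColumnSelector
    names = list(selected_names)
    for name, tags in column_tags.items():
        if tags & set(selected_tags) and name not in names:
            names.append(name)
    return names  # tags are now "zeroed": everything is expressed as explicit names


# "col1" is picked by name, "col2" is pulled in through its "b" tag.
print(resolve_selector({"col1": set(), "col2": {"b", "c", "d"}}, ["col1"], ["a", "b"]))
# -> ['col1', 'col2']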
Example #2
def test_column_schema_protobuf_domain_check(tmpdir):
    # create a schema
    schema1 = ColumnSchema(
        "col1",
        tags=[],
        properties={"domain": {"min": 0, "max": 10}},
        dtype=numpy.int64,
        _is_list=False,
    )
    schema2 = ColumnSchema(
        "col2",
        tags=[],
        properties={"domain": {"min": 0.0, "max": 10.0}},
        dtype=numpy.float64,
        _is_list=False,
    )
    column_schema_set = Schema([schema1, schema2])
    # write schema out
    schema_path = Path(tmpdir)
    saved_schema = column_schema_set.write(schema_path)
    # read schema back in
    loaded_schema = Schema.load(schema_path)
    # compare read to origin
    assert saved_schema == loaded_schema

    # load in protobuf file to tensorflow schema representation
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")

    assert """name: "col1"\n    min: 0\n    max: 10\n""" in str(proto_schema)
    assert """name: "col2"\n    min: 0.0\n    max: 10.0\n""" in str(proto_schema)
Example #3
def test_applying_selector_to_schema_selects_by_name_or_tags():
    schema1 = ColumnSchema("col1")
    schema2 = ColumnSchema("col2", tags=["b", "c", "d"])

    schema = Schema([schema1, schema2])
    selector = ColumnSelector(["col1"], tags=["a", "b"])
    result = schema.apply(selector)

    assert result.column_names == schema.column_names
Example #4
def _combine_schemas(elements):
    combined = Schema()
    for elem in elements:
        if isinstance(elem, Node):
            combined += elem.output_schema
        elif isinstance(elem, ColumnSelector):
            combined += Schema(elem.names)
        elif isinstance(elem, list):
            combined += _combine_schemas(elem)
    return combined
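A runnable approximation of the recursion above, with throwaway stand-ins (FakeNode, FakeSelector) in place of the real Node and ColumnSelector, and lists of names in place of Schema objects:

class FakeNode:
    """Stand-in for Node: exposes the column names its output schema would contain."""
    def __init__(self, names):
        self.output_names = names


class FakeSelector:
    """Stand-in for ColumnSelector: just a bag of explicit column names."""
    def __init__(self, names):
        self.names = names


def combine_names(elements):
    combined = []
    for elem in elements:
        if isinstance(elem, FakeNode):
            combined += elem.output_names
        elif isinstance(elem, FakeSelector):
            combined += elem.names
        elif isinstance(elem, list):
            combined += combine_names(elem)  # nested groups flatten recursively
    return combined


print(combine_names([FakeNode(["a", "b"]), [FakeSelector(["c"]), FakeNode(["d"])]]))
# -> ['a', 'b', 'c', 'd']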
Example #5
def test_applying_inverse_selector_to_schema_selects_relevant_columns():
    schema = Schema(["a", "b", "c", "d", "e"])
    selector = ColumnSelector(["a", "b"])
    result = schema.apply_inverse(selector)

    assert result == Schema(["c", "d", "e"])

    selector = None
    result = schema.apply_inverse(selector)

    assert result == schema
Example #6
    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)
        output_schema = Schema()
        for column_name in col_selector.names:
            column_schema = input_schema.column_schemas[column_name]
            output_schema += Schema([self.transformed_schema(column_schema)])
            if self.add_binary_cols:
                column_schema = column_schema.with_name(f"{column_name}_filled")
                output_schema += Schema([column_schema])
        return output_schema
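The effect of add_binary_cols in this variant is easiest to see on names alone: each selected column keeps its own schema and gains a companion "<name>_filled" indicator. A small sketch using plain strings, no library types:

def filled_output_names(selected_names, add_binary_cols):
    out = []
    for name in selected_names:
        out.append(name)                  # the transformed column itself
        if add_binary_cols:
            out.append(f"{name}_filled")  # binary flag marking rows that were imputed
    return out


print(filled_output_names(["x", "y"], add_binary_cols=True))
# -> ['x', 'x_filled', 'y', 'y_filled']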
Example #7
def test_dataset_schemas_can_be_added():
    ds1_schema = Schema([ColumnSchema("col1"), ColumnSchema("col2")])
    ds2_schema = Schema([ColumnSchema("col3"), ColumnSchema("col4")])

    result = ds1_schema + ds2_schema

    expected = Schema(
        [ColumnSchema("col1"), ColumnSchema("col2"), ColumnSchema("col3"), ColumnSchema("col4")]
    )

    assert result == expected
Example #8
def test_applying_selector_to_schema_selects_by_name():
    schema = Schema(["a", "b", "c", "d", "e"])
    selector = ColumnSelector(["a", "b"])
    result = schema.apply(selector)

    assert result == Schema(["a", "b"])

    selector = None
    result = schema.apply(selector)

    assert result == schema
Example #9
def test_column_schema_set_protobuf(tmpdir, props1, props2, tags1, tags2, d_type, list_type):
    # create a schema
    schema1 = ColumnSchema("col1", tags=tags1, properties=props1, dtype=d_type, _is_list=list_type)
    schema2 = ColumnSchema("col2", tags=tags2, properties=props2, dtype=d_type, _is_list=list_type)
    column_schema_set = Schema([schema1, schema2])
    # write schema out
    schema_path = Path(tmpdir)
    column_schema_set = column_schema_set.write(schema_path)
    # read schema back in
    target = Schema.load(schema_path)
    # compare read to origin
    assert column_schema_set == target
Example #10
def test_dataset_schema_constructor():
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["c", "d", "e"])

    expected = {schema1.name: schema1, schema2.name: schema2}

    ds_schema_dict = Schema(expected)
    ds_schema_list = Schema([schema1, schema2])

    assert ds_schema_dict.column_schemas == expected
    assert ds_schema_list.column_schemas == expected

    with pytest.raises(TypeError) as exception_info:
        Schema(12345)

    assert "column_schemas" in str(exception_info.value)
Example #11
def test_workflow_node_subtraction():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b", "c", "d"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["b"] >> Operator()

    output_node = node1 - ["c", "d"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 0
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = ["a", "b", "c", "d"] - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - ["c", "d"] - node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a"]
Example #12
def _remove_columns(workflow, to_remove):
    workflow = copy.deepcopy(workflow)

    for label in to_remove:
        if label in workflow.input_dtypes:
            del workflow.input_dtypes[label]

        if label in workflow.output_dtypes:
            del workflow.output_dtypes[label]

    # Rebuild the input schema from the columns that remain after removal
    new_schema = Schema(list(workflow.input_dtypes.keys()))

    # Re-fit the workflow to the altered input schema
    for node in iter_nodes([workflow.output_node]):
        node.input_schema = None
        node.output_schema = None

        if node.selector:
            for column in to_remove:
                if column in node.selector._names:
                    node.selector._names.remove(column)

                for subgroup in node.selector.subgroups:
                    if column in subgroup._names:
                        subgroup._names.remove(column)

    return workflow.fit_schema(new_schema)
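The selector pruning in the loop above can be illustrated without Workflow at all; the sketch below uses plain lists in place of the selector's _names and its subgroups (hypothetical stand-ins):

def prune_selector(names, subgroups, to_remove):
    # Drop removed columns both from the top-level name list and from every subgroup.
    names = [n for n in names if n not in to_remove]
    subgroups = [[n for n in group if n not in to_remove] for group in subgroups]
    return names, subgroups


print(prune_selector(["a", "b", "c"], [["b", "d"]], to_remove={"b"}))
# -> (['a', 'c'], [['d']])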
Example #13
    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)
        if col_selector.tags:
            tags_col_selector = ColumnSelector(tags=col_selector.tags)
            filtered_schema = input_schema.apply(tags_col_selector)
            col_selector += ColumnSelector(filtered_schema.column_names)

            # zero tags because already filtered
            col_selector._tags = []
        output_schema = Schema()
        for column_name in input_schema.column_schemas:
            new_names = self.output_column_names(ColumnSelector(column_name))
            column_schema = input_schema.column_schemas[column_name]
            for new_name in new_names.names:
                new_column_schema = column_schema.with_name(new_name)
                output_schema += Schema([self.transformed_schema(new_column_schema)])
        return output_schema
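The per-column loop above asks the operator for the output names of each input column individually, then clones that column's schema under every produced name. A sketch of the same expansion, with a hypothetical rename rule standing in for output_column_names:

def expand_per_column(input_columns, rename):
    # rename: callable mapping one input name to the list of names it produces
    output = []
    for name in input_columns:
        for new_name in rename(name):
            output.append(new_name)  # in the real code, column_schema.with_name(new_name)
    return output


print(expand_per_column(["a", "b"], rename=lambda name: [f"{name}_renamed"]))
# -> ['a_renamed', 'b_renamed']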
Example #14
    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)
        if col_selector.tags:
            tags_col_selector = ColumnSelector(tags=col_selector.tags)
            filtered_schema = input_schema.apply(tags_col_selector)
            col_selector += ColumnSelector(filtered_schema.column_names)

            # zero tags because already filtered
            col_selector._tags = []
        output_schema = Schema()
        for column_name in col_selector.names:
            column_schema = input_schema.column_schemas[column_name]
            output_schema += Schema([self.transformed_schema(column_schema)])
            if self.add_binary_cols:
                column_schema = column_schema.with_name(f"{column_name}_filled")
                output_schema += Schema([column_schema])
        return output_schema
Example #15
def test_addition_nodes_are_combined():
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()
    node4 = ["g", "h"] >> Operator()

    add_node = node1 + node2 + node3
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3}
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "c", "d", "e", "f"
    }

    add_node = node1 + "c" + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = "c" + node1 + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = node1 + "e" + node2
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert node2 in workflow.output_node.dependencies
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "e", "c", "d"
    }

    add_node1 = node1 + node2
    add_node2 = node3 + node4

    add_node = add_node1 + add_node2
    workflow = Workflow(add_node).fit_schema(schema)

    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3, node4}
    assert set(workflow.output_node.output_columns.names) == {
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
    }
Example #16
    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            if hasattr(self, "target"):
                col_selector = (
                    ColumnSelector(self.target) if isinstance(self.target, list) else self.target
                )
            else:
                col_selector = ColumnSelector(input_schema.column_names)
        if col_selector.tags:
            tags_col_selector = ColumnSelector(tags=col_selector.tags)
            filtered_schema = input_schema.apply(tags_col_selector)
            col_selector += ColumnSelector(filtered_schema.column_names)

            # zero tags because already filtered
            col_selector._tags = []
        new_col_selector = self.output_column_names(col_selector)
        new_list = []
        for name in col_selector.names:
            for new_name in new_col_selector.names:
                if name in new_name and new_name not in new_list:
                    new_list.append(new_name)

        base_cols_map = {}
        for new_col in new_list:
            base_cols_map[new_col] = []
            for old_col in input_schema.column_schemas:
                if old_col in new_col:
                    base_cols_map[new_col].append(old_col)

        col_selector = ColumnSelector(new_list)
        for column_name in col_selector.names:
            if column_name not in input_schema.column_schemas:
                # grab the first collision
                base_col_name = base_cols_map[column_name][0]
                base_col_schema = input_schema.column_schemas[base_col_name]
                input_schema += Schema([base_col_schema.with_name(column_name)])

        output_schema = Schema()
        for column_schema in input_schema.apply(col_selector):
            output_schema += Schema([self.transformed_schema(column_schema)])
        return output_schema
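The base_cols_map built above ties each generated column name back to the original columns whose names appear inside it, so a brand-new column (for example a cross-like name such as "TE_a_d") can borrow the schema of its first matching source. A standalone sketch of that substring mapping:

def map_generated_to_base(generated_names, original_names):
    # For every generated name, collect all original columns whose name occurs inside it.
    return {new: [old for old in original_names if old in new] for new in generated_names}


mapping = map_generated_to_base(["TE_a_d", "TE_b_d"], ["a", "b", "d"])
print(mapping)               # -> {'TE_a_d': ['a', 'd'], 'TE_b_d': ['b', 'd']}
print(mapping["TE_a_d"][0])  # "grab the first collision" -> 'a'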
Example #17
def test_workflow_node_addition():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()

    output_node = node1 + node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    output_node = node1 + "c"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    output_node = node1 + "c" + "d"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    output_node = node1 + node2 + "e"
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == [
        "a", "b", "c", "d", "e"
    ]

    output_node = node1 + node2 + node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.names == [
        "a", "b", "c", "d", "e", "f"
    ]

    # Addition with groups
    output_node = node1 + ["c", "d"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d"
    ]

    output_node = node1 + [node2, "e"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d", "e"
    ]

    output_node = node1 + [node2, node3]
    workflow = Workflow(output_node).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == [
        "a", "b", "c", "d", "e", "f"
    ]
Example #18
def test_grab_additional_input_columns(dataset, engine):
    schema = Schema(["x", "y"])
    node1 = ["x"] >> ops.FillMissing()
    node2 = node1 >> ops.Clip(min_value=0)

    add_node = node2 + ["y"]

    workflow = Workflow(add_node).fit_schema(schema)
    output_df = workflow.transform(dataset).to_ddf().compute()

    assert len(workflow.output_node.input_columns.names) == 2
    assert workflow.output_node.input_columns.names == ["x", "y"]

    assert len(workflow.output_node.output_columns.names) == 2
    assert workflow.output_node.output_columns.names == ["x", "y"]

    assert len(output_df.columns) == 2
    assert output_df.columns.tolist() == ["x", "y"]
Example #19
    def fit_schema(self, input_schema: Schema) -> "Workflow":
        schemaless_nodes = {
            node: _get_schemaless_nodes(node.parents_with_dependencies)
            for node in _get_schemaless_nodes([self.output_node])
        }

        while schemaless_nodes:
            # get all the Operators with no outstanding dependencies
            current_phase = [
                node for node, dependencies in schemaless_nodes.items() if not dependencies
            ]
            if not current_phase:
                # this shouldn't happen, but let's not loop forever just in case
                raise RuntimeError("failed to find dependency-free Operator to compute schema for")

            processed_nodes = []
            for node in current_phase:
                if not node.parents:
                    node.compute_schemas(input_schema)
                else:
                    combined_schema = sum(
                        [parent.output_schema for parent in node.parents if parent.output_schema],
                        Schema(),
                    )
                    # we want to update the input_schema with new values
                    # from combined schema
                    combined_schema = input_schema + combined_schema
                    node.compute_schemas(combined_schema)

                processed_nodes.append(node)

            # Remove all the operators we processed in this phase, and remove
            # from the dependencies of other ops too
            for schemaless_node in current_phase:
                schemaless_nodes.pop(schemaless_node)
            for dependencies in schemaless_nodes.values():
                dependencies.difference_update(current_phase)

        self.output_schema = self.output_node.output_schema

        return self
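fit_schema is essentially a phased topological pass: repeatedly take every node whose dependencies already have schemas, compute its schema, and unblock its children. A self-contained sketch of just that scheduling loop over plain dicts (no Workflow or Node objects):

def schedule_phases(dependencies):
    """dependencies: {node: set of nodes it waits on}. Returns the phases in order."""
    pending = {node: set(deps) for node, deps in dependencies.items()}
    phases = []
    while pending:
        ready = [node for node, deps in pending.items() if not deps]
        if not ready:
            # mirrors the RuntimeError guard above: a cycle would otherwise loop forever
            raise RuntimeError("failed to find a dependency-free node")
        phases.append(ready)
        for node in ready:
            pending.pop(node)
        for deps in pending.values():
            deps.difference_update(ready)
    return phases


print(schedule_phases({"input": set(), "fillna": {"input"}, "categorify": {"fillna"}}))
# -> [['input'], ['fillna'], ['categorify']]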
Example #20
def test_feature_column_utils():
    cols = [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                "vocab_1", ["a", "b", "c", "d"]),
            16,
        ),
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                "vocab_2", ["1", "2", "3", "4", "5"]),
            32,
        ),
    ]

    workflow, _ = nvtf.make_feature_column_workflow(cols, "target")

    schema = Schema(["vocab_1", "vocab_2", "target"])
    workflow.fit_schema(schema)
    assert workflow.output_node.output_schema.column_names == [
        "vocab_1", "vocab_2", "target"
    ]
Example #21
def test_compute_schemas():
    root_schema = Schema(["a", "b", "c", "d", "e"])

    node1 = ["a", "b"] >> Rename(postfix="_renamed")
    node1.parents[0].compute_schemas(root_schema)
    node1.compute_schemas(root_schema)

    assert node1.input_columns.names == ["a", "b"]
    assert node1.output_columns.names == ["a_renamed", "b_renamed"]

    node2 = node1 + "c"
    node2.dependencies[0].compute_schemas(root_schema)
    node2.compute_schemas(root_schema)

    assert node2.input_columns.names == ["a_renamed", "b_renamed", "c"]
    assert node2.output_columns.names == ["a_renamed", "b_renamed", "c"]

    node3 = node2["a_renamed"]
    node3.compute_schemas(root_schema)

    assert node3.input_columns.names == ["a_renamed"]
    assert node3.output_columns.names == ["a_renamed"]
Example #22
def test_input_output_column_names():
    schema = Schema(["a", "b", "c", "d", "e"])

    input_node = ["a", "b", "c"] >> FillMissing()
    workflow = Workflow(input_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    chained_node = input_node >> Categorify()
    workflow = Workflow(chained_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    selection_node = input_node[["b", "c"]]
    workflow = Workflow(selection_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["b", "c"]
    assert workflow.output_node.output_columns.names == ["b", "c"]

    addition_node = input_node + ["d"]
    workflow = Workflow(addition_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c", "d"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    rename_node = input_node >> Rename(postfix="_renamed")
    workflow = Workflow(rename_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == [
        "a_renamed", "b_renamed", "c_renamed"
    ]

    dependency_node = input_node >> TargetEncoding("d")
    workflow = Workflow(dependency_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == [
        "TE_a_d", "TE_b_d", "TE_c_d"
    ]
Example #23
def test_dataset_schema_select_by_tag():
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["b", "c", "d"])

    ds_schema = Schema([schema1, schema2])

    selected_schema1 = ds_schema.select_by_tag("a")
    selected_schema2 = ds_schema.select_by_tag("d")

    assert selected_schema1.column_schemas == {"col1": schema1}
    assert selected_schema2.column_schemas == {"col2": schema2}

    selected_schema_both = ds_schema.select_by_tag("c")
    selected_schema_neither = ds_schema.select_by_tag("e")
    selected_schema_multi = ds_schema.select_by_tag(["b", "c"])

    assert selected_schema_both.column_schemas == {"col1": schema1, "col2": schema2}
    assert selected_schema_neither.column_schemas == {}
    assert selected_schema_multi.column_schemas == {"col1": schema1, "col2": schema2}
Example #24
def test_dataset_schema_select_by_name():
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["b", "c", "d"])

    ds_schema = Schema([schema1, schema2])

    selected_schema1 = ds_schema.select_by_name("col1")
    selected_schema2 = ds_schema.select_by_name("col2")

    assert selected_schema1.column_schemas == {"col1": schema1}
    assert selected_schema2.column_schemas == {"col2": schema2}

    selected_schema_multi = ds_schema.select_by_name(["col1", "col2"])

    assert selected_schema_multi.column_schemas == {"col1": schema1, "col2": schema2}

    with pytest.raises(KeyError) as exception_info:
        ds_schema.select_by_name("col3")

    assert "col3" in str(exception_info.value)
Example #25
def test_schema_can_be_added_to_none():
    schema_set = Schema(["a", "b", "c"])

    assert (schema_set + None) == schema_set
    assert (None + schema_set) == schema_set
Example #26
def test_construct_schema_with_column_names():
    schema = Schema(["x", "y", "z"])
    expected = Schema([ColumnSchema("x"), ColumnSchema("y"), ColumnSchema("z")])

    assert schema == expected
Example #27
def test_dataset_schema_column_names():
    ds_schema = Schema(["x", "y", "z"])

    assert ds_schema.column_names == ["x", "y", "z"]