def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
    """Given a set of schemas and a column selector for the input columns,
    returns a set of schemas for the transformed columns this operator will produce

    Parameters
    ----------
    input_schema: Schema
        The schemas of the columns to apply this operator to
    col_selector: ColumnSelector
        The column selector to apply to the input schema

    Returns
    -------
    Schema
        The schemas of the columns produced by this operator
    """
    # An empty/None selector means "operate on every input column"
    if not col_selector:
        col_selector = ColumnSelector(input_schema.column_names)

    # Resolve tag-based selection into concrete column names so the rest of
    # the method only has to deal with names
    if col_selector.tags:
        tags_col_selector = ColumnSelector(tags=col_selector.tags)
        filtered_schema = input_schema.apply(tags_col_selector)
        col_selector += ColumnSelector(filtered_schema.column_names)

        # zero tags because already filtered
        col_selector._tags = []

    # Let the operator map input names to its output names (may rename/expand)
    col_selector = self.output_column_names(col_selector)

    # Any brand-new output name gets a placeholder entry in the input schema
    # so that input_schema.apply() below can select it
    for column_name in col_selector.names:
        if column_name not in input_schema.column_schemas:
            input_schema += Schema([column_name])

    output_schema = Schema()
    for column_schema in input_schema.apply(col_selector):
        output_schema += Schema([self.transformed_schema(column_schema)])

    return output_schema
def test_column_schema_protobuf_domain_check(tmpdir):
    """Round-trip a schema with int and float domain properties through the
    on-disk protobuf representation and check the domains survive."""
    # create a schema
    # FIX: `numpy.int` / `numpy.float` were deprecated aliases for the builtin
    # int/float (deprecated in NumPy 1.20, removed in 1.24) and raise
    # AttributeError on modern NumPy — use concrete dtypes instead.
    schema1 = ColumnSchema(
        "col1",
        tags=[],
        properties={"domain": {"min": 0, "max": 10}},
        dtype=numpy.int64,
        _is_list=False,
    )
    schema2 = ColumnSchema(
        "col2",
        tags=[],
        properties={"domain": {"min": 0.0, "max": 10.0}},
        dtype=numpy.float64,
        _is_list=False,
    )
    column_schema_set = Schema([schema1, schema2])

    # write schema out
    schema_path = Path(tmpdir)
    saved_schema = column_schema_set.write(schema_path)

    # read schema back in
    loaded_schema = Schema.load(schema_path)

    # compare read to origin
    assert saved_schema == loaded_schema

    # load in protobuf file to tensorflow schema representation
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")

    assert """name: "col1"\n min: 0\n max: 10\n""" in str(proto_schema)
    assert """name: "col2"\n min: 0.0\n max: 10.0\n""" in str(proto_schema)
def test_applying_selector_to_schema_selects_by_name_or_tags():
    """A selector mixing explicit names and tags matches the union of both."""
    untagged = ColumnSchema("col1")
    tagged = ColumnSchema("col2", tags=["b", "c", "d"])
    schema = Schema([untagged, tagged])

    # "col1" matches by name; "col2" matches via the shared tag "b"
    selector = ColumnSelector(["col1"], tags=["a", "b"])

    assert schema.apply(selector).column_names == schema.column_names
def _combine_schemas(elements):
    """Merge the schemas implied by a mixed list of nodes, selectors,
    and (possibly nested) lists into a single Schema."""
    merged = Schema()
    for element in elements:
        if isinstance(element, Node):
            merged += element.output_schema
        elif isinstance(element, ColumnSelector):
            # A bare selector contributes placeholder schemas for its names
            merged += Schema(element.names)
        elif isinstance(element, list):
            # Recurse into nested groups
            merged += _combine_schemas(element)
    return merged
def test_applying_inverse_selector_to_schema_selects_relevant_columns():
    """apply_inverse keeps the columns NOT named by the selector; None is a no-op."""
    schema = Schema(["a", "b", "c", "d", "e"])

    remaining = schema.apply_inverse(ColumnSelector(["a", "b"]))
    assert remaining == Schema(["c", "d", "e"])

    # A missing selector should leave the schema untouched
    untouched = schema.apply_inverse(None)
    assert untouched == schema
def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
    """Build the output schema: one transformed column per selected input, plus
    a companion "<name>_filled" column per input when add_binary_cols is set
    (presumably an indicator of which rows were filled — verify against the op)."""
    if not col_selector:
        # No selection provided: operate on every input column
        col_selector = ColumnSelector(input_schema.column_names)

    output_schema = Schema()
    for name in col_selector.names:
        base = input_schema.column_schemas[name]
        output_schema += Schema([self.transformed_schema(base)])
        if self.add_binary_cols:
            output_schema += Schema([base.with_name(f"{name}_filled")])
    return output_schema
def test_dataset_schemas_can_be_added():
    """Adding two Schemas concatenates their column schemas."""
    left = Schema([ColumnSchema("col1"), ColumnSchema("col2")])
    right = Schema([ColumnSchema("col3"), ColumnSchema("col4")])

    combined = left + right

    assert combined == Schema(
        [ColumnSchema("col1"), ColumnSchema("col2"), ColumnSchema("col3"), ColumnSchema("col4")]
    )
def test_applying_selector_to_schema_selects_by_name():
    """apply keeps only the named columns; a None selector returns the schema unchanged."""
    schema = Schema(["a", "b", "c", "d", "e"])

    # Named selection narrows the schema
    assert schema.apply(ColumnSelector(["a", "b"])) == Schema(["a", "b"])

    # A missing selector is the identity
    assert schema.apply(None) == schema
def test_column_schema_set_protobuf(tmpdir, props1, props2, tags1, tags2, d_type, list_type):
    """Round-trip a Schema through its protobuf on-disk representation."""
    first = ColumnSchema("col1", tags=tags1, properties=props1, dtype=d_type, _is_list=list_type)
    second = ColumnSchema("col2", tags=tags2, properties=props2, dtype=d_type, _is_list=list_type)

    # write schema out, then read it back in from the same directory
    schema_path = Path(tmpdir)
    written = Schema([first, second]).write(schema_path)
    loaded = Schema.load(schema_path)

    # compare read to origin
    assert written == loaded
def test_dataset_schema_constructor():
    """Schema accepts either a dict or a list of ColumnSchemas, and rejects other types."""
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["c", "d", "e"])
    expected = {schema1.name: schema1, schema2.name: schema2}

    # dict and list construction produce the same column_schemas mapping
    assert Schema(expected).column_schemas == expected
    assert Schema([schema1, schema2]).column_schemas == expected

    # Non-collection input is rejected with a message naming the argument
    with pytest.raises(TypeError) as exception_info:
        Schema(12345)
    assert "column_schemas" in str(exception_info.value)
def test_workflow_node_subtraction():
    """Subtracting lists or nodes removes their columns from a node's output."""
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    big = ["a", "b", "c", "d"] >> Operator()
    small = ["c", "d"] >> Operator()
    tiny = ["b"] >> Operator()

    # Subtracting a plain column list adds no dependency node
    diff = big - ["c", "d"]
    workflow = Workflow(diff).fit_schema(schema)
    assert len(diff.parents) == 1
    assert len(diff.dependencies) == 0
    assert workflow.output_node.output_columns.names == ["a", "b"]

    # Subtracting a node records it as a dependency
    diff = big - small
    workflow = Workflow(diff).fit_schema(schema)
    assert len(diff.parents) == 1
    assert len(diff.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    # A bare list on the left-hand side works too
    diff = ["a", "b", "c", "d"] - small
    workflow = Workflow(diff).fit_schema(schema)
    assert len(diff.parents) == 1
    assert len(diff.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    # Chained subtraction: a list, then a node
    diff = big - ["c", "d"] - tiny
    workflow = Workflow(diff).fit_schema(schema)
    assert len(diff.parents) == 1
    assert len(diff.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a"]
def _remove_columns(workflow, to_remove):
    """Return a deep copy of `workflow` re-fit with the columns in `to_remove` dropped."""
    workflow = copy.deepcopy(workflow)

    # Drop the removed columns from the recorded input/output dtypes
    for label in to_remove:
        workflow.input_dtypes.pop(label, None)
        workflow.output_dtypes.pop(label, None)

    # Work backwards to form an input schema from redacted columns
    new_schema = Schema(list(workflow.input_dtypes.keys()))

    # Invalidate every node's cached schemas and scrub removed columns from
    # its selector (including nested subgroups)
    for node in iter_nodes([workflow.output_node]):
        node.input_schema = None
        node.output_schema = None

        if node.selector:
            for column in to_remove:
                if column in node.selector._names:
                    node.selector._names.remove(column)
                for subgroup in node.selector.subgroups:
                    if column in subgroup._names:
                        subgroup._names.remove(column)

    # Re-fit the workflow to altered input schema
    return workflow.fit_schema(new_schema)
def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
    """Produce a transformed schema for every (possibly renamed) input column."""
    if not col_selector:
        col_selector = ColumnSelector(input_schema.column_names)

    if col_selector.tags:
        # Expand tag-based selection into explicit names, then clear the tags
        # so they are not applied a second time
        tagged = input_schema.apply(ColumnSelector(tags=col_selector.tags))
        col_selector += ColumnSelector(tagged.column_names)
        col_selector._tags = []

    output_schema = Schema()
    # NOTE(review): iterates every input column, not just the selected ones —
    # the selector above only normalizes tags; confirm this is intended
    for column_name in input_schema.column_schemas:
        column_schema = input_schema.column_schemas[column_name]
        renamed = self.output_column_names(ColumnSelector(column_name))
        for new_name in renamed.names:
            output_schema += Schema(
                [self.transformed_schema(column_schema.with_name(new_name))]
            )
    return output_schema
def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
    """Schema of the transformed columns, plus a "<name>_filled" column per input
    when add_binary_cols is set (presumably a filled-row indicator — verify)."""
    if not col_selector:
        col_selector = ColumnSelector(input_schema.column_names)

    if col_selector.tags:
        # Resolve tags into concrete column names up front
        tagged = input_schema.apply(ColumnSelector(tags=col_selector.tags))
        col_selector += ColumnSelector(tagged.column_names)

        # zero tags because already filtered
        col_selector._tags = []

    output_schema = Schema()
    for name in col_selector.names:
        base = input_schema.column_schemas[name]
        output_schema += Schema([self.transformed_schema(base)])
        if self.add_binary_cols:
            output_schema += Schema([base.with_name(f"{name}_filled")])
    return output_schema
def test_addition_nodes_are_combined():
    """Node addition keeps the first node as parent and folds the rest into dependencies."""
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()
    node4 = ["g", "h"] >> Operator()

    # Chained node addition
    combined = node1 + node2 + node3
    workflow = Workflow(combined).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d", "e", "f"}

    # Bare column names appended after a node
    combined = node1 + "c" + "d"
    workflow = Workflow(combined).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    # Bare column name on the left-hand side
    combined = "c" + node1 + "d"
    workflow = Workflow(combined).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    # Mixing a bare column and another node
    combined = node1 + "e" + node2
    workflow = Workflow(combined).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert node2 in workflow.output_node.dependencies
    assert set(workflow.output_node.output_columns.names) == {"a", "b", "e", "c", "d"}

    # Adding two addition-nodes flattens their dependencies
    combined = (node1 + node2) + (node3 + node4)
    workflow = Workflow(combined).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3, node4}
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "c", "d", "e", "f", "g", "h",
    }
def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
    """Compute the output schema for an operator whose output column names embed
    the input column names (each selected input name appears as a substring of
    the corresponding output name).

    New output columns inherit their schema from the first input column whose
    name occurs inside the new name.
    """
    if not col_selector:
        # Some ops carry an explicit `target`; prefer it over selecting everything
        if hasattr(self, "target"):
            col_selector = (
                ColumnSelector(self.target) if isinstance(self.target, list) else self.target
            )
        else:
            col_selector = ColumnSelector(input_schema.column_names)

    # Resolve tag-based selection into concrete column names
    if col_selector.tags:
        tags_col_selector = ColumnSelector(tags=col_selector.tags)
        filtered_schema = input_schema.apply(tags_col_selector)
        col_selector += ColumnSelector(filtered_schema.column_names)

        # zero tags because already filtered
        col_selector._tags = []

    new_col_selector = self.output_column_names(col_selector)

    # Keep only output names that contain a selected input name as a substring,
    # preserving first-seen order and de-duplicating
    new_list = []
    for name in col_selector.names:
        for new_name in new_col_selector.names:
            if name in new_name and new_name not in new_list:
                new_list.append(new_name)

    # Map each new output name to every input column whose name occurs inside it
    base_cols_map = {}
    for new_col in new_list:
        base_cols_map[new_col] = []
        for old_col in input_schema.column_schemas:
            if old_col in new_col:
                base_cols_map[new_col].append(old_col)

    col_selector = ColumnSelector(new_list)

    # Seed the input schema with entries for brand-new output names so that
    # input_schema.apply() below can select them
    for column_name in col_selector.names:
        if column_name not in input_schema.column_schemas:
            # grab the first collision
            base_col_name = base_cols_map[column_name][0]
            base_col_schema = input_schema.column_schemas[base_col_name]
            input_schema += Schema([base_col_schema.with_name(column_name)])

    output_schema = Schema()
    for column_schema in input_schema.apply(col_selector):
        output_schema += Schema([self.transformed_schema(column_schema)])

    return output_schema
def test_workflow_node_addition():
    """Adding nodes, bare column names, and groups concatenates output columns in order."""
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()

    # node + node
    result = node1 + node2
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    # node + single column name(s)
    result = node1 + "c"
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    result = node1 + "c" + "d"
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    # mixed chains of nodes and names
    result = node1 + node2 + "e"
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d", "e"]

    result = node1 + node2 + node3
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d", "e", "f"]

    # Addition with groups
    result = node1 + ["c", "d"]
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == ["a", "b", "c", "d"]

    result = node1 + [node2, "e"]
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == ["a", "b", "c", "d", "e"]

    result = node1 + [node2, node3]
    workflow = Workflow(result).fit_schema(schema)
    assert workflow.output_node.output_columns.grouped_names == ["a", "b", "c", "d", "e", "f"]
def test_grab_additional_input_columns(dataset, engine):
    """Columns appended via `+` flow through the workflow as passthrough inputs."""
    schema = Schema(["x", "y"])
    filled = ["x"] >> ops.FillMissing()
    clipped = filled >> ops.Clip(min_value=0)

    workflow = Workflow(clipped + ["y"]).fit_schema(schema)
    result_df = workflow.transform(dataset).to_ddf().compute()

    # Both the processed column and the passthrough appear on the node...
    assert len(workflow.output_node.input_columns.names) == 2
    assert workflow.output_node.input_columns.names == ["x", "y"]

    assert len(workflow.output_node.output_columns.names) == 2
    assert workflow.output_node.output_columns.names == ["x", "y"]

    # ...and in the transformed dataframe
    assert len(result_df.columns) == 2
    assert result_df.columns.tolist() == ["x", "y"]
def fit_schema(self, input_schema: Schema) -> "Workflow":
    """Propagate `input_schema` through the node graph, computing input/output
    schemas for every node in dependency order, and record the final output
    schema on the workflow. Returns self for chaining.
    """
    # Map each schema-less node to the set of its schema-less ancestors;
    # a node is ready to process once that set is empty
    schemaless_nodes = {
        node: _get_schemaless_nodes(node.parents_with_dependencies)
        for node in _get_schemaless_nodes([self.output_node])
    }

    while schemaless_nodes:
        # get all the Operators with no outstanding dependencies
        current_phase = [
            node for node, dependencies in schemaless_nodes.items() if not dependencies
        ]
        if not current_phase:
            # this shouldn't happen, but lets not infinite loop just in case
            raise RuntimeError("failed to find dependency-free Operator to compute schema for")

        processed_nodes = []
        for node in current_phase:
            if not node.parents:
                # Root nodes read straight from the workflow's input schema
                node.compute_schemas(input_schema)
            else:
                # Merge every fitted parent's output schema
                combined_schema = sum(
                    [parent.output_schema for parent in node.parents if parent.output_schema],
                    Schema(),
                )
                # we want to update the input_schema with new values
                # from combined schema
                combined_schema = input_schema + combined_schema
                node.compute_schemas(combined_schema)

            processed_nodes.append(node)

        # Remove all the operators we processed in this phase, and remove
        # from the dependencies of other ops too
        for schemaless_node in current_phase:
            schemaless_nodes.pop(schemaless_node)
        for dependencies in schemaless_nodes.values():
            dependencies.difference_update(current_phase)

    self.output_schema = self.output_node.output_schema

    return self
def test_feature_column_utils():
    """make_feature_column_workflow passes vocab and target columns straight through."""
    vocab_one = tf.feature_column.categorical_column_with_vocabulary_list(
        "vocab_1", ["a", "b", "c", "d"]
    )
    vocab_two = tf.feature_column.categorical_column_with_vocabulary_list(
        "vocab_2", ["1", "2", "3", "4", "5"]
    )
    cols = [
        tf.feature_column.embedding_column(vocab_one, 16),
        tf.feature_column.embedding_column(vocab_two, 32),
    ]

    workflow, _ = nvtf.make_feature_column_workflow(cols, "target")

    workflow.fit_schema(Schema(["vocab_1", "vocab_2", "target"]))
    assert workflow.output_node.output_schema.column_names == ["vocab_1", "vocab_2", "target"]
def test_compute_schemas():
    """compute_schemas tracks renames, additions, and selections on nodes."""
    root_schema = Schema(["a", "b", "c", "d", "e"])

    # Rename op: outputs pick up the postfix
    renamed = ["a", "b"] >> Rename(postfix="_renamed")
    renamed.parents[0].compute_schemas(root_schema)
    renamed.compute_schemas(root_schema)
    assert renamed.input_columns.names == ["a", "b"]
    assert renamed.output_columns.names == ["a_renamed", "b_renamed"]

    # Adding a raw column appends it untouched
    with_c = renamed + "c"
    with_c.dependencies[0].compute_schemas(root_schema)
    with_c.compute_schemas(root_schema)
    assert with_c.input_columns.names == ["a_renamed", "b_renamed", "c"]
    assert with_c.output_columns.names == ["a_renamed", "b_renamed", "c"]

    # Selection narrows both input and output columns
    selected = with_c["a_renamed"]
    selected.compute_schemas(root_schema)
    assert selected.input_columns.names == ["a_renamed"]
    assert selected.output_columns.names == ["a_renamed"]
def test_input_output_column_names():
    """input/output column names across chaining, selection, addition, rename, and deps."""
    schema = Schema(["a", "b", "c", "d", "e"])
    input_node = ["a", "b", "c"] >> FillMissing()

    fitted = Workflow(input_node).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["a", "b", "c"]
    assert fitted.output_node.output_columns.names == ["a", "b", "c"]

    # Chaining another op keeps the same column names
    fitted = Workflow(input_node >> Categorify()).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["a", "b", "c"]
    assert fitted.output_node.output_columns.names == ["a", "b", "c"]

    # Selection narrows the columns
    fitted = Workflow(input_node[["b", "c"]]).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["b", "c"]
    assert fitted.output_node.output_columns.names == ["b", "c"]

    # Addition brings in extra passthrough columns
    fitted = Workflow(input_node + ["d"]).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["a", "b", "c", "d"]
    assert fitted.output_node.output_columns.names == ["a", "b", "c", "d"]

    # Rename changes only the output names
    fitted = Workflow(input_node >> Rename(postfix="_renamed")).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["a", "b", "c"]
    assert fitted.output_node.output_columns.names == ["a_renamed", "b_renamed", "c_renamed"]

    # TargetEncoding produces TE_<col>_<target> output names
    fitted = Workflow(input_node >> TargetEncoding("d")).fit_schema(schema)
    assert fitted.output_node.input_columns.names == ["a", "b", "c"]
    assert fitted.output_node.output_columns.names == ["TE_a_d", "TE_b_d", "TE_c_d"]
def test_dataset_schema_select_by_tag():
    """select_by_tag matches single tags, shared tags, missing tags, and tag lists."""
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["b", "c", "d"])
    ds_schema = Schema([schema1, schema2])

    # Tags unique to one column select just that column
    assert ds_schema.select_by_tag("a").column_schemas == {"col1": schema1}
    assert ds_schema.select_by_tag("d").column_schemas == {"col2": schema2}

    # Shared tag selects both; unknown tag selects nothing
    assert ds_schema.select_by_tag("c").column_schemas == {"col1": schema1, "col2": schema2}
    assert ds_schema.select_by_tag("e").column_schemas == {}

    # A list of tags selects the union of matches
    assert ds_schema.select_by_tag(["b", "c"]).column_schemas == {
        "col1": schema1,
        "col2": schema2,
    }
def test_dataset_schema_select_by_name():
    """select_by_name handles single names, name lists, and unknown names."""
    schema1 = ColumnSchema("col1", tags=["a", "b", "c"])
    schema2 = ColumnSchema("col2", tags=["b", "c", "d"])
    ds_schema = Schema([schema1, schema2])

    # Single-name selection
    assert ds_schema.select_by_name("col1").column_schemas == {"col1": schema1}
    assert ds_schema.select_by_name("col2").column_schemas == {"col2": schema2}

    # List-of-names selection
    assert ds_schema.select_by_name(["col1", "col2"]).column_schemas == {
        "col1": schema1,
        "col2": schema2,
    }

    # Unknown names raise a KeyError naming the missing column
    with pytest.raises(KeyError) as exception_info:
        ds_schema.select_by_name("col3")
    assert "col3" in str(exception_info.value)
def test_schema_can_be_added_to_none():
    """Schema addition treats None as the identity on either side."""
    schema = Schema(["a", "b", "c"])

    assert schema + None == schema
    assert None + schema == schema
def test_construct_schema_with_column_names():
    """Bare column names expand into default ColumnSchemas."""
    from_names = Schema(["x", "y", "z"])
    from_schemas = Schema([ColumnSchema("x"), ColumnSchema("y"), ColumnSchema("z")])

    assert from_names == from_schemas
def test_dataset_schema_column_names():
    """column_names reflects construction order."""
    assert Schema(["x", "y", "z"]).column_names == ["x", "y", "z"]