def remove_semantic_tags(self, semantic_tags):
    """Strip the requested semantic tags from the named columns.

    The Woodwork typing information is updated in place. Including `index`
    or `time_index` among the tags will set the Woodwork index or time index
    to None for the DataFrame.

    Args:
        semantic_tags (dict[str -> str/list/set]): Maps each column name in
            the DataFrame to the tag or tags that should be taken off that
            column's semantic tags.
    """
    _check_semantic_tags(self.columns.keys(), semantic_tags)

    for column_name, requested in semantic_tags.items():
        standard_tags = self.logical_types[column_name].standard_tags
        removal_set = _convert_input_to_set(requested)
        # Snapshot the tags so the "index" check below sees the pre-removal state.
        tags_before = self.semantic_tags[column_name].copy()
        self.columns[column_name]._remove_semantic_tags(removal_set, column_name)

        # Standard tags are only put back when this call removed the index
        # tag from a column that uses standard tags.
        if not self.use_standard_tags[column_name]:
            continue
        if "index" not in tags_before:
            continue
        if "index" in self.columns[column_name].semantic_tags:
            continue

        # Reinsert every standard tag the caller did not explicitly remove.
        tags_to_restore = standard_tags.difference(removal_set)
        self.columns[column_name].semantic_tags = self.semantic_tags[
            column_name
        ].union(tags_to_restore)
def reset_semantic_tags(self, columns=None, retain_index_tags=False):
    """Restore the default semantic tags on the selected columns.

    The defaults are either an empty set or the set of standard tags for the
    column's logical type, controlled by the use_standard_tags property on
    the table. Column names may be given as a single string, a list of
    strings or a set of strings. When no columns are given, every column is
    reset.

    Args:
        columns (str/list/set, optional): The columns for which the semantic
            tags should be reset.
        retain_index_tags (bool, optional): If True, any index or time_index
            semantic tags set on the column are kept. If False, all semantic
            tags are cleared. Defaults to False.

    Raises:
        ColumnNotPresentError: If any requested column is not in the table.
    """
    columns = _convert_input_to_set(columns, "columns")

    unknown_columns = sorted(columns.difference(set(self.columns.keys())))
    if unknown_columns:
        raise ColumnNotPresentError(unknown_columns)

    # An empty selection means "reset everything".
    targets = columns if columns else self.columns.keys()

    for col_name in targets:
        # Read the tags before the reset so index tags can be re-applied.
        tags_before = self.semantic_tags[col_name]
        self.columns[col_name]._reset_semantic_tags()

        if not retain_index_tags:
            continue
        if "index" in tags_before:
            self._set_index_tags(col_name)
        if "time_index" in tags_before:
            self._set_time_index_tags(col_name)
def remove_semantic_tags(self, semantic_tags):
    """Build a new column that lacks the specified semantic tags.

    Args:
        semantic_tags (str/list/set): Semantic tag(s) to remove from the column.

    Returns:
        woodwork.DataColumn: DataColumn with specified tags removed.

    Raises:
        LookupError: If any requested tag is not currently on the column.
    """
    tags_to_remove = _convert_input_to_set(semantic_tags)

    invalid_tags = sorted(tags_to_remove.difference(self._semantic_tags))
    if invalid_tags:
        raise LookupError(
            f"Semantic tag(s) '{', '.join(invalid_tags)}' not present on column '{self.name}'"
        )

    removed_standard_tags = sorted(
        tags_to_remove.intersection(self._logical_type.standard_tags)
    )
    if removed_standard_tags and self.use_standard_tags:
        message = StandardTagsRemovalWarning().get_warning_message(
            removed_standard_tags, self.name
        )
        warnings.warn(message, StandardTagsRemovalWarning)

    remaining_tags = self._semantic_tags.difference(tags_to_remove)
    # The new column is built with use_standard_tags=False, exactly as the
    # remaining tag set is passed through.
    return DataColumn(
        series=self._series,
        logical_type=self.logical_type,
        semantic_tags=remaining_tags,
        use_standard_tags=False,
    )
def set_semantic_tags(self, semantic_tags, retain_index_tags=True):
    """Return a new DataColumn whose semantic tags are replaced by the given ones.

    Args:
        semantic_tags (str/list/set): New semantic tag(s) to set for column
        retain_index_tags (bool, optional): If True, any 'index' or
            'time_index' tags on the column will be retained. If False, all
            tags will be replaced. Defaults to True.

    Returns:
        woodwork.DataColumn: DataColumn with specified semantic tags.
    """
    semantic_tags = _convert_input_to_set(semantic_tags)
    _validate_tags(semantic_tags)

    # Record index status from the current column before building the new one.
    had_index = 'index' in self._semantic_tags
    had_time_index = 'time_index' in self._semantic_tags

    replacement = DataColumn(
        series=self._series,
        logical_type=self.logical_type,
        semantic_tags=semantic_tags,
        use_standard_tags=self.use_standard_tags,
    )
    if replacement.use_standard_tags:
        replacement._semantic_tags = replacement._semantic_tags.union(
            replacement._logical_type.standard_tags
        )

    if retain_index_tags:
        if had_index:
            replacement._set_as_index()
        if had_time_index:
            replacement._set_as_time_index()
    return replacement
def _get_column_tags(self, semantic_tags, validate):
    """Convert the supplied tags to a set, adding standard tags when enabled.

    Args:
        semantic_tags (str/list/set): Tag(s) to convert.
        validate (bool): Passed through to the set conversion as its
            ``validate`` argument.

    Returns:
        set: The converted tags, unioned with the logical type's standard
        tags when ``use_standard_tags`` is set.

    Raises:
        ValueError: If ``use_standard_tags`` is set but ``logical_type`` is None.
    """
    tags = _convert_input_to_set(
        semantic_tags, error_language="semantic_tags", validate=validate
    )
    if not self.use_standard_tags:
        return tags
    if self.logical_type is None:
        raise ValueError("Cannot use standard tags when logical_type is None")
    return tags.union(self.logical_type.standard_tags)
def _set_semantic_tags(self, semantic_tags):
    """Overwrite the current semantic tags with the supplied ones.

    When use_standard_tags is set to True, the logical type's standard tags
    are added as well.

    Args:
        semantic_tags (str/list/set): New semantic tag(s) to set
    """
    replacement = _convert_input_to_set(semantic_tags)
    self.semantic_tags = (
        replacement.union(self.logical_type.standard_tags)
        if self.use_standard_tags
        else replacement
    )
def add_semantic_tags(self, semantic_tags):
    """Attach extra semantic tags to columns, updating the Woodwork typing information.

    Previously set tags are retained.

    Args:
        semantic_tags (dict[str -> str/list/set]): A dictionary mapping the
            columns in the DataFrame to the tags that should be added to the
            column's semantic tags
    """
    _check_semantic_tags(self.columns.keys(), semantic_tags)

    for column_name, requested in semantic_tags.items():
        additions = _convert_input_to_set(requested)
        # Check the requested tags with the index-tag validator before adding.
        _validate_not_setting_index_tags(additions, column_name)
        column = self.columns[column_name]
        column._add_semantic_tags(additions, column_name)
def test_validation_methods_called(mock_validate_input_type, mock_validate_strings):
    """Check that the ``validate`` flag decides whether the validators get called."""
    # Neither validator may have been called before any conversion happens.
    assert not mock_validate_input_type.called
    assert not mock_validate_strings.called

    # Each scenario converts with validate=False and asserts the watched
    # validator is still uncalled, then converts with validate=True and
    # asserts it has been called. Inputs are rebuilt for every call.
    scenarios = (
        (lambda: "test_tag", mock_validate_input_type),
        (lambda: ["test_tag", "tag2"], mock_validate_strings),
    )
    for make_input, watched_validator in scenarios:
        _convert_input_to_set(make_input(), validate=False)
        assert not watched_validator.called

        _convert_input_to_set(make_input(), validate=True)
        assert watched_validator.called
def _set_semantic_tags(semantic_tags, standard_tags, use_standard_tags):
    """Build the replacement tag set for a column.

    The supplied tags are converted to a set. When use_standard_tags is True
    the standard tags are added to it as well.

    Args:
        semantic_tags (str/list/set): New semantic tag(s) to set
        standard_tags (set): Set of standard tags for the column logical type
        use_standard_tags (bool): If True, include the standard tags in the result

    Returns:
        set: The new semantic tags.
    """
    replacement = _convert_input_to_set(semantic_tags)
    return replacement.union(standard_tags) if use_standard_tags else replacement
def test_convert_input_to_set():
    """Exercise the TypeError paths and the str/list/set conversions of _convert_input_to_set."""
    # (expected error message, positional arguments) for inputs that must be rejected.
    rejected = [
        ("semantic_tags must be a string, set or list", (int,)),
        ("test_text must be a string, set or list",
         ({'index': {}, 'time_index': {}}, 'test_text')),
        ("include parameter must contain only strings",
         (['index', 1], 'include parameter')),
    ]
    for error_message, call_args in rejected:
        with pytest.raises(TypeError, match=error_message):
            _convert_input_to_set(*call_args)

    # (positional arguments, expected set) for inputs that must be accepted.
    accepted = [
        (('index', 'include parameter'), {'index'}),
        ((['index', 'numeric', 'category'],), {'index', 'numeric', 'category'}),
        (({'index', 'numeric', 'category'}, 'include parameter'),
         {'index', 'numeric', 'category'}),
    ]
    for call_args, expected in accepted:
        assert _convert_input_to_set(*call_args) == expected
def _add_semantic_tags(self, new_tags, name):
    """Merge the specified semantic tags into the current set of tags.

    A DuplicateTagsWarning is issued for any tag that is already present.

    Args:
        new_tags (str/list/set): The new tags to add
        name (str): Name of the column to use in warning
    """
    incoming = _convert_input_to_set(new_tags)

    already_present = sorted(self.semantic_tags.intersection(incoming))
    if already_present:
        message = DuplicateTagsWarning().get_warning_message(already_present, name)
        warnings.warn(message, DuplicateTagsWarning)

    self.semantic_tags = self.semantic_tags.union(incoming)
def __init__(self,
             series,
             logical_type=None,
             semantic_tags=None,
             use_standard_tags=True,
             name=None,
             description=None,
             metadata=None):
    """Create a DataColumn.

    Args:
        series (pd.Series or dd.Series or numpy.ndarray or pd.api.extensions.ExtensionArray):
            Series containing the data associated with the column.
        logical_type (LogicalType, optional): The logical type that should be
            assigned to the column. If no value is provided, the LogicalType
            for the series will be inferred.
        semantic_tags (str or list or set, optional): Semantic tags to assign
            to the column. Defaults to an empty set if not specified. There
            are two options for specifying the semantic tags:
            (str) If only one semantic tag is being set, a single string can
            be passed.
            (list or set) If multiple tags are being set, a list or set of
            strings can be passed.
        use_standard_tags (bool, optional): If True, will add standard
            semantic tags to columns based on the inferred or specified
            logical type for the column. Defaults to True.
        name (str, optional): Name of DataColumn. Will overwrite Series name,
            if it exists.
        description (str, optional): Optional text describing the contents of
            the column
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the column.

    Raises:
        TypeError: If ``description`` is given and is not a string, or if
            ``metadata`` is given and is not a dictionary.
    """
    self._assigned_name = name
    self._set_series(series)
    self.use_standard_tags = use_standard_tags
    self._logical_type = self._parse_logical_type(logical_type)

    semantic_tags = _convert_input_to_set(semantic_tags)
    _validate_tags(semantic_tags)
    if use_standard_tags:
        semantic_tags = semantic_tags.union(self.logical_type.standard_tags)
    self._semantic_tags = semantic_tags
    self._update_dtype()

    # Compare against None rather than relying on truthiness, so falsy values
    # of the wrong type (0, [], False, ...) are rejected instead of slipping
    # past the type check.
    if description is not None and not isinstance(description, str):
        raise TypeError("Column description must be a string")
    self.description = description

    if metadata is not None and not isinstance(metadata, dict):
        raise TypeError("Column metadata must be a dictionary")
    # None and an empty dict both end up as a fresh empty dict.
    self.metadata = metadata or {}
def test_convert_input_to_set():
    """Verify _convert_input_to_set rejects bad input and converts str/list/set input."""

    def assert_type_error(error_message, *call_args):
        # The call must fail with a TypeError whose text matches error_message.
        with pytest.raises(TypeError, match=error_message):
            _convert_input_to_set(*call_args)

    assert_type_error("semantic_tags must be a string, set or list", int)
    assert_type_error(
        "test_text must be a string, set or list",
        {"index": {}, "time_index": {}},
        "test_text",
    )
    assert_type_error(
        "include parameter must contain only strings",
        ["index", 1],
        "include parameter",
    )

    expected_multi = {"index", "numeric", "category"}

    from_single = _convert_input_to_set("index", "include parameter")
    assert from_single == {"index"}

    from_list = _convert_input_to_set(["index", "numeric", "category"])
    assert from_list == expected_multi

    from_set = _convert_input_to_set(
        {"index", "numeric", "category"}, "include parameter"
    )
    assert from_set == expected_multi
def add_semantic_tags(self, semantic_tags):
    """Return a new DataColumn carrying the current tags plus the specified ones.

    A DuplicateTagsWarning is issued for any tag already on the column.

    Args:
        semantic_tags (str/list/set): New semantic tag(s) to add to the column

    Returns:
        woodwork.DataColumn: DataColumn with specified semantic tags added.
    """
    additions = _convert_input_to_set(semantic_tags)
    _validate_tags(additions)

    overlap = sorted(self._semantic_tags.intersection(additions))
    if overlap:
        message = DuplicateTagsWarning().get_warning_message(overlap, self.name)
        warnings.warn(message, DuplicateTagsWarning)

    combined_tags = self._semantic_tags.union(additions)
    return DataColumn(
        series=self._series,
        logical_type=self.logical_type,
        semantic_tags=combined_tags,
        use_standard_tags=self.use_standard_tags,
    )
def _create_columns(
    self,
    column_names,
    logical_types,
    semantic_tags,
    use_standard_tags,
    column_descriptions,
    column_origins,
    column_metadata,
    validate,
):
    """Build a dictionary mapping each column name to a new ColumnSchema
    holding that column's typing information."""

    def build_schema(name):
        # Assemble the ColumnSchema for a single column name.
        tags = _convert_input_to_set(
            (semantic_tags or {}).get(name),
            error_language=f"semantic_tags for {name}",
            validate=validate,
        )
        if validate:
            _validate_not_setting_index_tags(tags, name)

        description = (column_descriptions or {}).get(name)
        # A string origin applies to every column; otherwise it is looked up per name.
        if isinstance(column_origins, str):
            origin = column_origins
        else:
            origin = (column_origins or {}).get(name)
        metadata = (column_metadata or {}).get(name)

        return ColumnSchema(
            logical_type=logical_types.get(name),
            semantic_tags=tags,
            use_standard_tags=use_standard_tags.get(name),
            description=description,
            origin=origin,
            metadata=metadata,
            validate=validate,
        )

    return {name: build_schema(name) for name in column_names}
def _remove_semantic_tags(self, tags_to_remove, name):
    """Remove the specified semantic tags from the current set of tags.

    Args:
        tags_to_remove (str/list/set): The tags to remove
        name (str): Name of the column to use in warning

    Raises:
        LookupError: If any of the tags to remove is not currently present.
    """
    tags_to_remove = _convert_input_to_set(tags_to_remove)

    invalid_tags = sorted(tags_to_remove.difference(self.semantic_tags))
    if invalid_tags:
        raise LookupError(
            f"Semantic tag(s) '{', '.join(invalid_tags)}' not present on column '{name}'"
        )

    if self.use_standard_tags:
        removed_standard_tags = sorted(
            tags_to_remove.intersection(self.logical_type.standard_tags)
        )
        if removed_standard_tags:
            # The negated flag is passed to the message builder; inside this
            # branch use_standard_tags is truthy, so the value is False.
            message = StandardTagsChangedWarning().get_warning_message(
                not self.use_standard_tags, name
            )
            warnings.warn(message, StandardTagsChangedWarning)

    self.semantic_tags = self.semantic_tags.difference(tags_to_remove)
def _remove_semantic_tags(tags_to_remove, current_tags, name, standard_tags,
                          use_standard_tags):
    """Return the current tags with the specified semantic tags removed.

    Args:
        tags_to_remove (str/list/set): The tags to remove
        current_tags (set): Current set of semantic tags
        name (str): Name of the column to use in warning
        standard_tags (set): Set of standard tags for the column logical type
        use_standard_tags (bool): If True, warn if user attempts to remove a
            standard tag

    Returns:
        set: ``current_tags`` without the removed tags.

    Raises:
        LookupError: If any of the tags to remove is not in ``current_tags``.
    """
    tags_to_remove = _convert_input_to_set(tags_to_remove)

    invalid_tags = sorted(tags_to_remove.difference(current_tags))
    if invalid_tags:
        raise LookupError(
            f"Semantic tag(s) '{', '.join(invalid_tags)}' not present on column '{name}'"
        )

    # Computed unconditionally, before the use_standard_tags flag is consulted.
    removed_standard_tags = sorted(tags_to_remove.intersection(standard_tags))
    if removed_standard_tags and use_standard_tags:
        message = StandardTagsChangedWarning().get_warning_message(
            not use_standard_tags, name
        )
        warnings.warn(message, StandardTagsChangedWarning)

    return current_tags.difference(tags_to_remove)