def _update_dtype(self):
    """Bring the underlying series in line with the column's LogicalType.

    Ordinal instances validate the series against themselves. LatLong columns
    are reformatted with ``_reformat_to_latlong`` (via pandas with
    ``use_list=True`` for Koalas). Afterwards, when the series dtype string
    differs from the logical type's ``pandas_dtype``, the series is converted:
    Datetime columns go through the matching ``to_datetime`` function, all
    other columns through ``astype``.

    Raises:
        TypeError: If the conversion raises ``TypeError`` or ``ValueError``.
    """
    if isinstance(self.logical_type, Ordinal):
        self.logical_type._validate_data(self._series)
    elif _get_ltype_class(self.logical_type) == LatLong:
        # Reformat LatLong columns to be a length two tuple (or list for Koalas) of floats
        if dd and isinstance(self._series, dd.Series):
            original_name = self._series.name
            latlong_meta = (self._series, tuple([float, float]))
            self._series = self._series.apply(_reformat_to_latlong, meta=latlong_meta)
            # Restore the name after the apply
            self._series.name = original_name
        elif ks and isinstance(self._series, ks.Series):
            as_pandas = self._series.to_pandas()
            reformatted = as_pandas.apply(_reformat_to_latlong, use_list=True)
            self._series = ks.from_pandas(reformatted)
        else:
            self._series = self._series.apply(_reformat_to_latlong)

    # Nothing left to do when the physical dtype already matches
    if self.logical_type.pandas_dtype == str(self._series.dtype):
        return

    try:
        if _get_ltype_class(self.logical_type) == Datetime:
            dt_format = self.logical_type.datetime_format
            if dd and isinstance(self._series, dd.Series):
                original_name = self._series.name
                self._series = dd.to_datetime(self._series, format=dt_format)
                self._series.name = original_name
            elif ks and isinstance(self._series, ks.Series):
                converted = ks.to_datetime(self._series.to_numpy(), format=dt_format)
                self._series = ks.Series(converted, name=self._series.name)
            else:
                self._series = pd.to_datetime(self._series, format=dt_format)
        else:
            # Koalas series use the logical type's backup dtype when one is set
            if ks and isinstance(self._series, ks.Series) and self.logical_type.backup_dtype:
                target_dtype = self.logical_type.backup_dtype
            else:
                target_dtype = self.logical_type.pandas_dtype
            self._series = self._series.astype(target_dtype)
    except (TypeError, ValueError):
        error_msg = (
            f'Error converting datatype for column {self.name} from type {str(self._series.dtype)} '
            f'to type {self.logical_type.pandas_dtype}. Please confirm the underlying data is consistent with '
            f'logical type {self.logical_type}.'
        )
        raise TypeError(error_msg)
def _replace_nans_for_mutual_info(schema, data): """Replace NaN values in the dataframe so that mutual information can be calculated Args: schema (woodwork.TableSchema): Woodwork typing info for the data data (pd.DataFrame): dataframe to use for calculating mutual information Returns: pd.DataFrame: data with nans replaced with either mean or mode """ for column_name in data.columns[data.isnull().any()]: column = schema.columns[column_name] series = data[column_name] if column.is_numeric or column.is_datetime: mean = series.mean() if isinstance(mean, float) and not _get_ltype_class( column.logical_type) == Double: data[column_name] = series.astype('float') data[column_name] = series.fillna(mean) elif column.is_categorical or column.is_boolean: mode = _get_mode(series) data[column_name] = series.fillna(mode) return data
def _validate_logical_type(logical_type):
    """Check that ``logical_type`` is registered and, if Ordinal, instantiated.

    Raises:
        TypeError: If the logical type's class is not in the type system's
            registered types, or if the Ordinal class itself (rather than an
            Ordinal instance) was supplied.
    """
    ltype_class = _get_ltype_class(logical_type)

    is_registered = ltype_class in ww.type_system.registered_types
    if not is_registered:
        raise TypeError(f'logical_type {logical_type} is not a registered LogicalType.')

    # Ordinal must be passed as an instance
    is_bare_ordinal_class = ltype_class == Ordinal and not isinstance(logical_type, Ordinal)
    if is_bare_ordinal_class:
        raise TypeError("Must use an Ordinal instance with order values defined")
def write_table_data(datatable, path, format='csv', **kwargs):
    '''Write underlying datatable data to disk or S3 path.

    Args:
        datatable (DataTable) : Instance of :class:`.DataTable`.
        path (str) : Location on disk to write datatable data.
        format (str) : Format to use for writing datatable data. Defaults to csv.
        kwargs (keywords) : Additional keyword arguments to pass as keywords arguments to the underlying serialization method.
            For csv, the ``compression``, ``index``, ``sep`` and ``encoding`` keys are read explicitly.

    Returns:
        loading_info (dict) : Information on storage location and format of datatable data.

    Raises:
        ValueError: If a non-pandas DataFrame is pickled, or the format is not handled.
    '''
    format = format.lower()
    df = datatable.to_dataframe()
    dt_name = datatable.name or 'data'

    # Dask csv output uses a ``*`` placeholder in the file name
    writes_dask_csv = dd and isinstance(df, dd.DataFrame) and format == 'csv'
    if writes_dask_csv:
        basename = f"{dt_name}-*.{format}"
    else:
        basename = '.'.join([dt_name, format])
    location = os.path.join('data', basename)
    file = os.path.join(path, location)

    if format == 'csv':
        compression = kwargs['compression']
        if ks and isinstance(df, ks.DataFrame):
            # For Koalas, cast object columns and the compression value to str
            # on a copy of the data
            df = df.copy()
            object_columns = list(df.select_dtypes('object').columns)
            df[object_columns] = df[object_columns].astype(str)
            compression = str(compression)
        df.to_csv(
            file,
            index=kwargs['index'],
            sep=kwargs['sep'],
            encoding=kwargs['encoding'],
            compression=compression,
        )
    elif format == 'pickle':
        # Dask and Koalas currently do not support to_pickle
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
            )
        df.to_pickle(file, **kwargs)
    elif format == 'parquet':
        # Latlong columns in pandas and Dask DataFrames contain tuples, which raises
        # an error in parquet format.
        df = df.copy()
        latlong_columns = []
        for col_name, col in datatable.columns.items():
            if _get_ltype_class(col.logical_type) == ww.logical_types.LatLong:
                latlong_columns.append(col_name)
        df[latlong_columns] = df[latlong_columns].astype(str)
        df.to_parquet(file, **kwargs)
    else:
        template = 'must be one of the following formats: {}'
        raise ValueError(template.format(', '.join(FORMATS)))

    return {'location': location, 'type': format, 'params': kwargs}
def typing_info_to_dict(dataframe):
    """Build the serializable description of a Woodwork table.

    The description holds per-column typing information (including the
    category values of categorical columns) plus table-level and loading
    information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    if dd and isinstance(dataframe, dd.DataFrame):
        # Need to determine the category info for Dask so it can be saved below
        category_cols = []
        for colname, col in dataframe.ww._schema.columns.items():
            if col.is_categorical:
                category_cols.append(colname)
        dataframe = dataframe.ww.categorize(columns=category_cols)

    ordered_columns = dataframe.columns

    column_typing_info = []
    for col_name, col in dataframe.ww.columns.items():
        physical_dtype = dataframe[col_name].dtype
        # Store categorical values so they can be recreated if they are modified during serialization
        if str(physical_dtype) == 'category':
            cat_values = physical_dtype.categories.to_list()
        else:
            cat_values = None
        column_typing_info.append({
            'name': col_name,
            'ordinal': ordered_columns.get_loc(col_name),
            'use_standard_tags': col.use_standard_tags,
            'logical_type': {
                'parameters': _get_specified_ltype_params(col.logical_type),
                'type': str(_get_ltype_class(col.logical_type)),
            },
            'physical_type': {
                'type': str(physical_dtype),
                'cat_values': cat_values,
            },
            'semantic_tags': sorted(col.semantic_tags),
            'description': col.description,
            'metadata': col.metadata,
        })

    if dd and isinstance(dataframe, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(dataframe, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    description = {
        'schema_version': SCHEMA_VERSION,
        'name': dataframe.ww.name,
        'index': dataframe.ww.index,
        'time_index': dataframe.ww.time_index,
        'column_typing_info': column_typing_info,
        'loading_info': {'table_type': table_type},
        'table_metadata': dataframe.ww.metadata,
    }
    return description
def _check_time_index(column_names, time_index, logical_type):
    """Validate that ``time_index`` names an existing Datetime or numeric column.

    Raises:
        ColumnNotPresentError: If ``time_index`` is not in ``column_names``.
        TypeError: If the logical type's class is neither Datetime nor tagged
            ``numeric`` in its standard tags.
    """
    column_missing = time_index not in column_names
    if column_missing:
        raise ColumnNotPresentError(
            f"Specified time index column `{time_index}` not found in TableSchema"
        )

    ltype_class = _get_ltype_class(logical_type)
    is_datetime = ltype_class == ww.logical_types.Datetime
    # Standard tags are only consulted when the type is not Datetime
    if not is_datetime and "numeric" not in ltype_class.standard_tags:
        raise TypeError("Time index column must be a Datetime or numeric column.")
def _check_time_index(column_names, time_index, logical_type): if time_index not in column_names: raise LookupError( f'Specified time index column `{time_index}` not found in TableSchema' ) ltype_class = _get_ltype_class(logical_type) if not (ltype_class == ww.logical_types.Datetime or 'numeric' in ltype_class.standard_tags): raise TypeError( 'Time index column must be a Datetime or numeric column.')
def clean_latlong(dataframe):
    """Cast LatLong columns to strings ahead of parquet, arrow or feather serialization.

    Serializing while latlong tuples are present results in an error. When the
    dataframe has no LatLong columns it is returned as is; otherwise a
    Woodwork copy with the LatLong columns cast to ``str`` is returned.
    """
    latlong_columns = []
    for col_name, col in dataframe.ww.columns.items():
        if _get_ltype_class(col.logical_type) == LatLong:
            latlong_columns.append(col_name)

    if not latlong_columns:
        return dataframe

    cleaned = dataframe.ww.copy()
    cleaned[latlong_columns] = cleaned[latlong_columns].astype(str)
    return cleaned
def _parse_logical_type(self, logical_type):
    """Resolve the logical type to use for this column.

    A falsy ``logical_type`` triggers inference from the underlying series.
    Strings are converted through the type system. The result must be a
    registered type, and Ordinal must be supplied as an instance.

    Raises:
        TypeError: If the Ordinal class is given uninstantiated, or the
            logical type's class is not registered in the type system.
    """
    if not logical_type:
        return ww.type_system.infer_logical_type(self._series)

    if isinstance(logical_type, str):
        logical_type = ww.type_system.str_to_logical_type(logical_type)

    ltype_class = _get_ltype_class(logical_type)
    if ltype_class == Ordinal and not isinstance(logical_type, Ordinal):
        raise TypeError("Must use an Ordinal instance with order values defined")

    if ltype_class not in ww.type_system.registered_types:
        raise TypeError(f"Invalid logical type specified for '{self.name}'")
    return logical_type
def typing_info_to_dict(dataframe):
    """Build the serializable description of a Woodwork table.

    The description holds typing information for each column together with
    table-level and loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    ordered_columns = dataframe.columns

    column_typing_info = []
    for col_name, col in dataframe.ww.columns.items():
        logical_type_info = {
            'parameters': _get_specified_ltype_params(col.logical_type),
            'type': str(_get_ltype_class(col.logical_type)),
        }
        physical_type_info = {'type': str(dataframe[col_name].dtype)}
        column_typing_info.append({
            'name': col_name,
            'ordinal': ordered_columns.get_loc(col_name),
            'use_standard_tags': col.use_standard_tags,
            'logical_type': logical_type_info,
            'physical_type': physical_type_info,
            'semantic_tags': sorted(col.semantic_tags),
            'description': col.description,
            'metadata': col.metadata,
        })

    if dd and isinstance(dataframe, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(dataframe, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    description = {
        'schema_version': SCHEMA_VERSION,
        'name': dataframe.ww.name,
        'index': dataframe.ww.index,
        'time_index': dataframe.ww.time_index,
        'column_typing_info': column_typing_info,
        'loading_info': {'table_type': table_type},
        'table_metadata': dataframe.ww.metadata,
    }
    return description
def datatable_to_description(datatable):
    '''Build the description for a DataTable.

    The description holds typing information for each column together with
    table-level and loading information.

    Args:
        datatable (DataTable): DataTable to describe.

    Returns:
        dict: Description of the DataTable.
    '''
    df = datatable.to_dataframe()
    ordered_columns = df.columns

    column_metadata = []
    for col in datatable.columns.values():
        logical_type_info = {
            'parameters': _get_specified_ltype_params(col.logical_type),
            'type': str(_get_ltype_class(col.logical_type)),
        }
        column_metadata.append({
            'name': col.name,
            'ordinal': ordered_columns.get_loc(col.name),
            'logical_type': logical_type_info,
            'physical_type': {'type': str(col.dtype)},
            'semantic_tags': sorted(col.semantic_tags),
            'description': col.description,
            'metadata': col.metadata,
        })

    if dd and isinstance(df, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(df, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    description = {
        'schema_version': SCHEMA_VERSION,
        'name': datatable.name,
        'index': datatable.index,
        'time_index': datatable.time_index,
        'column_metadata': column_metadata,
        'loading_info': {'table_type': table_type},
        'table_metadata': datatable.metadata,
    }
    return description
def _check_logical_types(column_names, logical_types, require_all_cols=True): if not isinstance(logical_types, dict): raise TypeError("logical_types must be a dictionary") cols_in_ltypes = set(logical_types.keys()) cols_in_schema = set(column_names) cols_not_found_in_schema = cols_in_ltypes.difference(cols_in_schema) if cols_not_found_in_schema: raise ColumnNotPresentError( "logical_types contains columns that are not present in " f"TableSchema: {sorted(list(cols_not_found_in_schema))}") cols_not_found_in_ltypes = cols_in_schema.difference(cols_in_ltypes) if cols_not_found_in_ltypes and require_all_cols: raise ColumnNotPresentError( f"logical_types is missing columns that are present in " f"TableSchema: {sorted(list(cols_not_found_in_ltypes))}") for col_name, logical_type in logical_types.items(): if _get_ltype_class( logical_type) not in ww.type_system.registered_types: raise TypeError("Logical Types must be of the LogicalType class " "and registered in Woodwork's type system. " f"{logical_type} does not meet that criteria.")
def _validate_accessor_params(dataframe, index, make_index, time_index,
                              logical_types, schema, use_standard_tags):
    """Run the validation checks for the accessor's initialization parameters.

    Column name uniqueness and ``use_standard_tags`` are always checked. When
    a schema is supplied only the schema is checked further; otherwise the
    index, logical types and time index are each checked when specified.
    """
    _check_unique_column_names(dataframe)
    _check_use_standard_tags(use_standard_tags)

    if schema is not None:
        # We ignore the remaining parameters if a schema is passed
        _check_schema(dataframe, schema)
        return

    if index is not None or make_index:
        _check_index(dataframe, index, make_index)
    if logical_types:
        _check_logical_types(dataframe.columns, logical_types)
    if time_index is None:
        return

    datetime_format = None
    logical_type = None
    if logical_types is not None and time_index in logical_types:
        logical_type = logical_types[time_index]
        # Only a Datetime logical type supplies a datetime format
        if _get_ltype_class(logical_type) == Datetime:
            datetime_format = logical_type.datetime_format

    _check_time_index(
        dataframe,
        time_index,
        datetime_format=datetime_format,
        logical_type=logical_type,
    )
def is_boolean(self):
    """Whether the ColumnSchema is a Boolean column"""
    # Both the Boolean and BooleanNullable logical types count as boolean
    boolean_ltypes = (Boolean, BooleanNullable)
    return _get_ltype_class(self.logical_type) in boolean_ltypes
def _filter_cols(self, include=None, exclude=None, col_names=False):
    """Return list of columns filtered with any of: semantic tags, LogicalTypes, column names

    Args:
        include (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to
            filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be
            included in the returned list of columns.
        exclude (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to
            filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be
            excluded from the returned list of columns.
        col_names (bool): Specifies whether to filter columns by name. Defaults to False.

    Returns:
        List[str] of column names that fit into filter.

    Raises:
        TypeError: If a selector is an instantiated or unregistered LogicalType, or is not hashable.
    """
    assert not (include and exclude), "Cannot specify both include and exclude"

    # Normalize a single selector into a one-element list
    if include and not isinstance(include, list):
        include = [include]
    elif exclude and not isinstance(exclude, list):
        exclude = [exclude]

    if include is not None:
        selectors = include
    elif exclude is not None:
        selectors = exclude

    schema_ltypes = {type(col.logical_type) for col in self.columns.values()}
    schema_tags = set()
    for col in self.columns.values():
        schema_tags.update(col.semantic_tags)

    matched_ltypes = set()
    matched_tags = set()
    matched_names = set()

    registered_types = ww.type_system.registered_types
    for selector in selectors:
        # Determine if the selector is a registered, uninstantiated LogicalType
        if isinstance(selector, str):
            # Convert possible string to LogicalType - unregistered LogicalTypes return None
            candidate = ww.type_system.str_to_logical_type(selector, raise_error=False)
        else:
            candidate = selector
        # Get the class - unregistered LogicalTypes return LogicalTypeMetaClass
        candidate_class = _get_ltype_class(candidate)

        if candidate_class in registered_types:
            if candidate not in registered_types:
                raise TypeError(
                    f"Invalid selector used in include: {candidate} cannot be instantiated"
                )
            if candidate in schema_ltypes:
                matched_ltypes.add(candidate)
        elif candidate_class == ww.logical_types.LogicalType.__class__:
            raise TypeError(
                f"Specified LogicalType selector {candidate} is not registered in Woodwork's type system."
            )

        # Hashability as a proxy for whether a selector is possibly a semantic tag or column name
        if not isinstance(selector, Hashable):
            raise TypeError(
                f"Invalid selector used in include: {selector} must be a "
                "string, uninstantiated and registered LogicalType, or valid column name"
            )
        # Determine if the selector is a semantic tag
        if selector in schema_tags:
            matched_tags.add(selector)
        # Determine if the selector is a column name
        if col_names and selector in self.columns:
            matched_names.add(selector)

    selected = []
    already_selected = set()
    for col_name, col in self.columns.items():
        is_match = (
            type(col.logical_type) in matched_ltypes
            or col.semantic_tags.intersection(matched_tags)
            or col_name in matched_names
        )
        # Keep matches when including, non-matches when excluding
        keep = (include is not None and is_match) or (exclude is not None and not is_match)
        if keep and col_name not in already_selected:
            selected.append(col_name)
            already_selected.add(col_name)

    return selected
def _is_col_boolean(col):
    """Return whether the column's logical type class is Boolean."""
    ltype_class = _get_ltype_class(col.logical_type)
    return ltype_class == Boolean
def _is_col_datetime(col):
    """Return whether the column's logical type class is Datetime."""
    ltype_class = _get_ltype_class(col.logical_type)
    return ltype_class == Datetime
def _get_describe_dict(dataframe, include=None):
    """Calculates statistics for data contained in a DataFrame using Woodwork typing information.

    Dask data is computed and Koalas data is converted to pandas before the
    statistics are calculated. Columns tagged ``index`` are skipped.

    Args:
        dataframe (pd.DataFrame): DataFrame to be described with Woodwork typing information initialized
        include (list[str or LogicalType], optional): filter for what columns to include in the
            statistics returned. Can be a list of column names, semantic tags, logical types, or a list
            combining any of the three. It follows the most broad specification. Favors logical types
            then semantic tag then column name. If no matching columns are found, an empty DataFrame
            will be returned.

    Returns:
        dict[str -> dict]: A dictionary with a key for each column in the data or for each column
        matching the logical types, semantic tags or column names specified in ``include``, paired
        with a value containing a dictionary containing relevant statistics for that column.
    """
    agg_stats_to_calculate = {
        'category': ["count", "nunique"],
        'numeric': ["count", "max", "min", "nunique", "mean", "std"],
        Datetime: ["count", "max", "min", "nunique", "mean"],
    }

    if include is None:
        cols_to_include = dataframe.ww.columns.items()
    else:
        filtered_cols = dataframe.ww._filter_cols(include, col_names=True)
        cols_to_include = [
            (name, col) for name, col in dataframe.ww.columns.items()
            if name in filtered_cols
        ]

    is_koalas = ks and isinstance(dataframe, ks.DataFrame)
    if dd and isinstance(dataframe, dd.DataFrame):
        df = dataframe.compute()
    elif is_koalas:
        df = dataframe.to_pandas()

        # Any LatLong columns will be using lists, which we must convert
        # back to tuples so we can calculate the mode, which requires hashable values
        def _as_tuple(latlong):
            return tuple(latlong) if latlong else latlong

        latlong_columns = [
            col_name for col_name, col in dataframe.ww.columns.items()
            if _get_ltype_class(col.logical_type) == LatLong
        ]
        df[latlong_columns] = df[latlong_columns].applymap(_as_tuple)
    else:
        df = dataframe

    results = {}
    for column_name, column in cols_to_include:
        if 'index' in column.semantic_tags:
            continue
        series = df[column_name]

        # Calculate Aggregation Stats
        if _is_col_categorical(column):
            agg_stats = agg_stats_to_calculate['category']
        elif _is_col_numeric(column):
            agg_stats = agg_stats_to_calculate['numeric']
        elif _is_col_datetime(column):
            agg_stats = agg_stats_to_calculate[Datetime]
        else:
            agg_stats = ["count"]
        values = series.agg(agg_stats).to_dict()

        # Calculate other specific stats based on logical type or semantic tags
        if _is_col_boolean(column):
            counts = series.value_counts()
            values["num_false"] = counts.get(False, 0)
            values["num_true"] = counts.get(True, 0)
        elif _is_col_numeric(column):
            first, second, third = series.quantile([0.25, 0.5, 0.75]).tolist()
            values["first_quartile"] = first
            values["second_quartile"] = second
            values["third_quartile"] = third

        mode = _get_mode(series)
        # The format of the mode should match its format in the DataFrame
        # (latlong_columns is only defined, and only consulted, for Koalas input)
        if is_koalas and series.name in latlong_columns:
            mode = list(mode)

        values["nan_count"] = series.isna().sum()
        values["mode"] = mode
        values["physical_type"] = series.dtype
        values["logical_type"] = column.logical_type
        values["semantic_tags"] = column.semantic_tags
        results[column_name] = values

    return results
def typing_info_to_dict(dataframe):
    """Build the serializable description of a Woodwork table.

    The description holds per-column typing information (for categorical
    columns also the category values and their dtype) plus table-level and
    loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    if _is_dask_dataframe(dataframe):
        # Need to determine the category info for Dask so it can be saved below
        category_cols = []
        for colname, col in dataframe.ww._schema.columns.items():
            if col.is_categorical:
                category_cols.append(colname)
        dataframe = dataframe.ww.categorize(columns=category_cols)

    ordered_columns = dataframe.columns

    def _get_physical_type_dict(column):
        # Category details are only recorded for categorical columns
        dtype_name = str(column.dtype)
        if dtype_name != "category":
            return {"type": dtype_name}
        categories = column.dtype.categories
        return {
            "type": dtype_name,
            "cat_values": categories.to_list(),
            "cat_dtype": str(categories.dtype),
        }

    column_typing_info = []
    for col_name, col in dataframe.ww.columns.items():
        column_typing_info.append({
            "name": col_name,
            "ordinal": ordered_columns.get_loc(col_name),
            "use_standard_tags": col.use_standard_tags,
            "logical_type": {
                "parameters": _get_specified_ltype_params(col.logical_type),
                "type": str(_get_ltype_class(col.logical_type)),
            },
            "physical_type": _get_physical_type_dict(dataframe[col_name]),
            "semantic_tags": sorted(col.semantic_tags),
            "description": col.description,
            "origin": col.origin,
            "metadata": col.metadata,
        })

    if _is_dask_dataframe(dataframe):
        table_type = "dask"
    elif _is_spark_dataframe(dataframe):
        table_type = "spark"
    else:
        table_type = "pandas"

    description = {
        "schema_version": SCHEMA_VERSION,
        "name": dataframe.ww.name,
        "index": dataframe.ww.index,
        "time_index": dataframe.ww.time_index,
        "column_typing_info": column_typing_info,
        "loading_info": {"table_type": table_type},
        "table_metadata": dataframe.ww.metadata,
    }
    return description
def is_datetime(self):
    """Whether the ColumnSchema is a Datetime column"""
    ltype_class = _get_ltype_class(self.logical_type)
    return ltype_class == Datetime
def _get_mutual_information_dict(dataframe, num_bins=10, nrows=None, include_index=False):
    """Calculates mutual information between all pairs of columns in the DataFrame that support mutual information.
    Logical Types that support mutual information are as follows:  Boolean, Categorical, CountryCode, Datetime, Double,
    Integer, Ordinal, PostalCode, and SubRegionCode

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate mutual information.
        num_bins (int): Determines number of bins to use for converting
            numeric features into categorical.
        nrows (int): The number of rows to sample for when determining mutual info.
            If specified, samples the desired number of rows from the data.
            Defaults to using all rows.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for mutual information calculations.
            If False, the index column will not have mutual information calculated for it.
            Defaults to False.

    Returns:
        list(dict): A list containing dictionaries that have keys `column_1`,
        `column_2`, and `mutual_info` that is sorted in descending order by mutual info.
        Mutual information values are between 0 (no mutual information) and 1
        (perfect dependency).
    """
    valid_types = get_valid_mi_types()
    valid_columns = [col_name for col_name, col in dataframe.ww.columns.items()
                     if _get_ltype_class(col.logical_type) in valid_types]

    # The index is only in valid_columns when its logical type supports mutual
    # information, so check membership first; list.remove raises ValueError
    # for a missing element.
    index = dataframe.ww.index
    if not include_index and index is not None and index in valid_columns:
        valid_columns.remove(index)

    data = dataframe.loc[:, valid_columns]
    if dd and isinstance(data, dd.DataFrame):
        data = data.compute()
    if ks and isinstance(dataframe, ks.DataFrame):
        data = data.to_pandas()

    # cut off data if necessary
    if nrows is not None and nrows < data.shape[0]:
        data = data.sample(nrows)

    # remove fully null columns
    not_null_cols = data.columns[data.notnull().any()]
    if set(not_null_cols) != set(valid_columns):
        data = data.loc[:, not_null_cols]

    data = _replace_nans_for_mutual_info(dataframe.ww.schema, data)
    data = _make_categorical_for_mutual_info(dataframe.ww.schema, data, num_bins)

    # calculate mutual info for all pairs of distinct columns; a column is never
    # paired with itself because that score would always be 1
    mutual_info = []
    col_names = data.columns.to_list()
    for i, a_col in enumerate(col_names):
        for b_col in col_names[i + 1:]:
            mi_score = normalized_mutual_info_score(data[a_col], data[b_col])
            mutual_info.append(
                {"column_1": a_col, "column_2": b_col, "mutual_info": mi_score}
            )
    mutual_info.sort(key=lambda mi: mi['mutual_info'], reverse=True)
    return mutual_info