def merge_cats_encoding(self, ser, cats):
    # `ser` and `cats` are both series; map the integer category codes in
    # `ser` back to the category names stored in `cats`
    offs = None
    if _is_list_dtype(ser.dtype) or _is_list_dtype(ser):
        ser, offs = _pull_apart_list(ser)
    ser = _make_df({"vals": ser})
    cats = _make_df({"names": cats})
    cats["vals"] = cats.index
    ser = ser.merge(cats, on=["vals"], how="left")
    return ser["names"], offs

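# Minimal standalone sketch (plain pandas; the names here are illustrative and
# not part of this module) of the merge pattern above: integer category codes
# are left-joined against a lookup frame whose index carries the code for each
# category name.
def _example_merge_cats_encoding():
    import pandas as pd

    codes = pd.Series([0, 2, 1, 0])
    lookup = pd.DataFrame({"names": ["apple", "banana", "cherry"]})
    lookup["vals"] = lookup.index
    merged = pd.DataFrame({"vals": codes}).merge(lookup, on=["vals"], how="left")
    return merged["names"]  # -> apple, cherry, banana, apple
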
def _separate_list_columns(self, gdf):
    lists, scalars = [], []
    for col in gdf.columns:
        if _is_list_dtype(gdf[col]):
            lists.append(col)
        else:
            scalars.append(col)
    return scalars, lists

def _separate_list_columns(self, gdf):
    lists, scalars = [], []
    for col in gdf.columns:
        if _is_list_dtype(gdf[col]):
            lists.append(col)
        else:
            scalars.append(col)
    return _get_embedding_order(scalars), _get_embedding_order(lists)

def fit(self, columns: ColumnNames, ddf: dd.DataFrame):
    # User passed in a list of column groups. We need to figure out
    # if this list contains any multi-column groups, and if there
    # are any (obvious) problems with these groups
    columns_uniq = list(set(flatten(columns, container=tuple)))
    columns_all = list(flatten(columns, container=tuple))
    if sorted(columns_all) != sorted(columns_uniq) and self.encode_type == "joint":
        # If we are doing "joint" encoding, there must be a unique mapping
        # between input column names and column groups. Otherwise, more
        # than one unique-value table could be used to encode the same
        # column.
        raise ValueError("Same column name included in multiple groups.")

    for group in columns:
        if isinstance(group, tuple) and len(group) > 1:
            # For multi-column groups, we concatenate column names
            # to get the "group" name.
            name = _make_name(*group, sep=self.name_sep)
            for col in group:
                self.storage_name[col] = name

    # Check metadata type to reset on_host and cat_cache if the
    # underlying ddf is already a pandas-backed collection
    if isinstance(ddf._meta, pd.DataFrame):
        self.on_host = False
        # Cannot use "device" caching if the data is pandas-backed
        self.cat_cache = "host" if self.cat_cache == "device" else self.cat_cache
        if self.search_sorted:
            # Pandas' searchsorted only works with Series.
            # For now, it is safest to disallow this option.
            self.search_sorted = False
            warnings.warn("Cannot use `search_sorted=True` for pandas-backed data.")

    # convert tuples to lists
    columns = [list(c) if isinstance(c, tuple) else c for c in columns]
    dsk, key = _category_stats(
        ddf,
        columns,
        [],
        [],
        self.out_path,
        self.freq_threshold,
        self.tree_width,
        self.on_host,
        concat_groups=self.encode_type == "joint",
        name_sep=self.name_sep,
        max_size=self.max_size,
        num_buckets=self.num_buckets,
    )
    # TODO: we can't check the dtypes on the ddf here since they are incorrect
    # for cudf's list type, so we check the partitions instead. Fix this.
    return Delayed(key, dsk), ddf.map_partitions(lambda df: _is_list_dtype(df))

def get_embedding_sizes(source, output_dtypes=None):
    """Returns a dictionary of embedding sizes from a workflow or column_group

    Parameters
    ----------
    source : Workflow or ColumnGroup
        Either a nvtabular Workflow or ColumnGroup object that we should use to find
        embedding sizes
    output_dtypes : dict, optional
        Optional dictionary of column_name:dtype. If passing a workflow object, dtypes
        will be read from the workflow. This is used to figure out which columns are
        multihot-categorical, which are split out by this function. If passed a
        column_group and this parameter isn't set, multihot columns won't be returned
        separately.
    """
    # TODO: do we need to distinguish multihot columns here? (if so, why?)
    # have to lazy import Workflow to avoid circular import errors
    from nvtabular.workflow import Workflow

    if isinstance(source, Workflow):
        queue = [source.column_group]
        output_dtypes = output_dtypes or source.output_dtypes
    else:
        # passed in a column group
        queue = [source]
        output_dtypes = output_dtypes or {}

    output = {}
    multihot_columns = set()
    while queue:
        current = queue.pop()
        if current.op and hasattr(current.op, "get_embedding_sizes"):
            output.update(current.op.get_embedding_sizes(current.columns))
        elif not current.op:
            # only follow parents if it's not an operator node (which could
            # transform the meaning of get_embedding_sizes)
            queue.extend(current.parents)

    for column in output:
        dtype = output_dtypes.get(column)
        if dtype and _is_list_dtype(dtype):
            # multi-hot, so remove from output and add to multihot
            multihot_columns.add(column)

    # TODO: returning different return types like this (based off the presence
    # of multihot features) is pretty janky. Fix this.
    if not multihot_columns:
        return output

    single_hots = {k: v for k, v in output.items() if k not in multihot_columns}
    multi_hots = {k: v for k, v in output.items() if k in multihot_columns}
    return single_hots, multi_hots

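# Usage sketch (the `workflow` argument is a hypothetical fitted Workflow):
# because get_embedding_sizes returns either a single dict or a
# (single_hot, multi_hot) tuple, callers should handle both shapes.
def _example_get_embedding_sizes(workflow):
    sizes = get_embedding_sizes(workflow)
    if isinstance(sizes, tuple):
        single_hots, multi_hots = sizes  # multi-hot columns were detected
    else:
        single_hots, multi_hots = sizes, {}  # every column is single-hot
    return single_hots, multi_hots
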
def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
    for name in col_selector.names:
        column = df[name]
        if _is_list_dtype(column):
            # Flatten the list values, apply log1p, then re-wrap the result
            # with the original offsets
            transformed = np.log(_flatten_list_column_values(column).astype(np.float32) + 1)
            df[name] = _encode_list_column(column, transformed)
        else:
            df[name] = np.log(column.astype(np.float32) + 1)
    return df

def _cudf_to_array(df, cpu=True):
    output = {}
    for name in df.columns:
        col = df[name]
        if _is_list_dtype(col.dtype):
            offsets = col._column.offsets.values_host if cpu else col._column.offsets.values
            values = col.list.leaves.values_host if cpu else col.list.leaves.values
            output[name] = (values, offsets)
        else:
            output[name] = col.values_host if cpu else col.values
    return output

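# Sketch of the (values, offsets) layout produced above (illustrative data,
# not taken from the original code): row i of a list column is
# values[offsets[i]:offsets[i + 1]].
def _example_list_column_layout():
    import numpy as np

    values = np.array([1, 2, 3, 4, 5])
    offsets = np.array([0, 2, 3, 5])
    rows = [values[offsets[i] : offsets[i + 1]] for i in range(len(offsets) - 1)]
    return rows  # -> [array([1, 2]), array([3]), array([4, 5])]
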
def _chunkwise_moments(df):
    vals = {name: type(df)() for name in ["count", "sum", "squaredsum"]}
    for name in df.columns:
        column = df[name]
        if _is_list_dtype(column):
            column = _flatten_list_column_values(column)
        vals["count"][name] = [column.count()]
        vals["sum"][name] = [column.sum().astype("float64")]
        vals["squaredsum"][name] = [column.astype("float64").pow(2).sum()]
    # NOTE: Perhaps we should convert to pandas here
    # (since we know the results should be small)?
    return vals

def _add_model_param(column, dtype, paramclass, params, dims=None):
    dims = dims if dims is not None else [-1, 1]
    if _is_list_dtype(dtype):
        # A list column is represented by two tensors: the flattened values
        # and the number of values ("nnzs") in each row
        params.append(
            paramclass(
                name=column + "__values",
                data_type=_convert_dtype(dtype.element_type),
                dims=dims,
            )
        )
        params.append(
            paramclass(
                name=column + "__nnzs",
                data_type=model_config.TYPE_INT64,
                dims=dims,
            )
        )
    else:
        params.append(paramclass(name=column, data_type=_convert_dtype(dtype), dims=dims))

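# Usage sketch (hypothetical column name and dtype; `paramclass` is whatever
# Triton config class the caller builds inputs/outputs with): a scalar int64
# column adds a single parameter, while a list column named "clicks" would
# instead add "clicks__values" and "clicks__nnzs".
def _example_add_model_param(paramclass):
    import numpy as np

    params = []
    _add_model_param("user_id", np.dtype("int64"), paramclass, params)
    return params
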
def get_row_size(self, row, cats_rep):
    """
    row = cudf.DataFrame consisting of a single row
    """
    size = 0
    for col in row.columns:
        if _is_list_dtype(row[col].dtype):
            # Find the correct cats_rep entry (which holds the max list
            # length) by scanning through all of them for the column name
            tar = self.find_target_rep(col, cats_rep)
            # fall back to a max list length of 1 if no entry is found
            val = tar.multi_max if tar else 1
            size = size + row[col]._column.elements.dtype.itemsize * val
        else:
            size = size + row[col].dtype.itemsize
    return size

def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame) -> Any:
    stats = {}
    for col in col_selector.names:
        series = ddf[col]
        if _is_list_dtype(series.compute()):
            stats[col] = stats[col] if col in stats else {}
            stats[col]["value_count"] = (
                {} if "value_count" not in stats[col] else stats[col]["value_count"]
            )
            offs = _pull_apart_list(series.compute())[1]
            lh, rh = offs[1:], offs[:-1]
            rh = rh.reset_index(drop=True)
            lh = lh.reset_index(drop=True)
            deltas = lh - rh
            # must be a plain python int, otherwise protobuf fails
            stats[col]["value_count"]["min"] = int(deltas.min())
            stats[col]["value_count"]["max"] = int(deltas.max())
    return stats

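# Worked sketch (illustrative offsets, not from the original code) of the
# delta computation above: consecutive offsets differ by the length of each
# list row, so the min/max of the deltas are the min/max list lengths.
def _example_value_count_stats():
    import pandas as pd

    offs = pd.Series([0, 2, 3, 6])  # three rows, of lengths 2, 1, 3
    lh = offs[1:].reset_index(drop=True)
    rh = offs[:-1].reset_index(drop=True)
    deltas = lh - rh  # -> [2, 1, 3]
    return int(deltas.min()), int(deltas.max())  # -> (1, 3)
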
def initialize(self, args):
    # Arg parsing
    workflow_path = os.path.join(
        args["model_repository"], str(args["model_version"]), "workflow"
    )
    model_device = args["model_instance_kind"]

    # Workflow instantiation
    self.workflow = nvtabular.Workflow.load(workflow_path)
    # cats and conts (which duplicates tags)
    column_types = get_column_types(workflow_path)

    # Config loading and parsing
    self.model_config = json.loads(args["model_config"])
    model_framework = self.model_config["parameters"]["output_model"]["string_value"]

    # Dtype parsing
    input_dtypes = self.workflow.input_dtypes.items()
    self.input_dtypes, self.input_multihots = _parse_input_dtypes(input_dtypes)

    self.output_dtypes = dict()
    for name, dtype in self.workflow.output_dtypes.items():
        if not _is_list_dtype(dtype):
            self._set_output_dtype(name)
        else:
            # List outputs are split into an nnzs tensor and a values tensor
            self._set_output_dtype(name + "__nnzs")
            self._set_output_dtype(name + "__values")

    if model_framework == "hugectr":
        runner_class = HugeCTRWorkflowRunner
    elif model_framework == "pytorch":
        runner_class = PyTorchWorkflowRunner
    else:
        runner_class = TensorflowWorkflowRunner

    self.runner = runner_class(
        self.workflow, column_types, self.output_dtypes, self.model_config, model_device
    )

def convert_df_to_triton_input(column_names, batch, input_class=grpcclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = []
    for name, col in columns:
        if _is_list_dtype(col):
            if isinstance(col, pd.Series):
                raise ValueError("this function doesn't support CPU list values yet")
            inputs.append(
                _convert_column_to_triton_input(
                    col._column.offsets.values_host.astype("int64"),
                    name + "__nnzs",
                    input_class,
                )
            )
            inputs.append(
                _convert_column_to_triton_input(
                    col.list.leaves.values_host.astype("int64"),
                    name + "__values",
                    input_class,
                )
            )
        else:
            values = col.values if isinstance(col, pd.Series) else col.values_host
            inputs.append(_convert_column_to_triton_input(values, name, input_class))
    return inputs

def _parse_input_dtypes(dtypes):
    input_dtypes = {col: dtype for col, dtype in dtypes if not _is_list_dtype(dtype)}
    input_multihots = {col: dtype for col, dtype in dtypes if _is_list_dtype(dtype)}
    return input_dtypes, input_multihots

def _is_list_col(column_group, df):
    has_lists = any(_is_list_dtype(df[col]) for col in column_group)
    if has_lists and len(column_group) != 1:
        raise ValueError("Can't categorical encode multiple list columns")
    return has_lists