def unique(self) -> ParentType:
    """
    Returns the unique elements in each list.
    The ordering of elements is not guaranteed.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
    >>> s
    0    [1.0, 1.0, 2.0, nan, nan]
    1                         None
    2                   [4.0, 4.0]
    3                           []
    dtype: list
    >>> s.list.unique()  # Order of list elements is not guaranteed
    0              [1.0, 2.0, nan]
    1                         None
    2                        [4.0]
    3                           []
    dtype: list
    """
    if is_list_dtype(self._column.children[1].dtype):
        raise NotImplementedError("Nested lists unique is not supported.")

    return self._return_or_inplace(
        drop_list_duplicates(self._column, nulls_equal=True, nans_all_equal=True)
    )
def __init__(self, column, parent=None):
    if not is_list_dtype(column.dtype):
        raise AttributeError(
            "Can only use .list accessor with a 'list' dtype"
        )
    self._column = column
    self._parent = parent
def _is_list_dtype(s):
    """Check if Series contains list elements"""
    if isinstance(s, pd.Series):
        if not len(s):  # pylint: disable=len-as-condition
            return False
        return pd.api.types.is_list_like(s.values[0])
    else:
        return is_list_dtype(s)
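# A quick standalone check of the pandas fallback used above -- a minimal
# sketch that needs only pandas, not cudf:
import pandas as pd

print(pd.api.types.is_list_like([1, 2]))  # True: a real sequence
print(pd.api.types.is_list_like("abc"))   # False: strings don't count as list-like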
def _separate_list_columns(self, gdf):
    lists, scalars = [], []
    for col in gdf.columns:
        if is_list_dtype(gdf[col]):
            lists.append(col)
        else:
            scalars.append(col)
    return _get_embedding_order(scalars), _get_embedding_order(lists)
def fit_finalize(self, dask_stats):
    dtypes = dask_stats[1]
    self.mh_columns = [
        col for col, dtype in zip(dtypes.index, dtypes) if is_list_dtype(dtype)
    ]
    categories = dask_stats[0]
    for col in categories:
        self.categories[col] = categories[col]
def initialize(self, args):
    workflow_path = os.path.join(
        args["model_repository"], str(args["model_version"]), "workflow"
    )
    self.workflow = nvtabular.Workflow.load(workflow_path)
    self.model_config = json.loads(args["model_config"])
    self.output_model = self.model_config["parameters"]["output_model"]["string_value"]

    # recurse over all column groups, initializing operators for inference pipeline
    self._initialize_ops(self.workflow.column_group)

    self.input_dtypes = {
        col: dtype
        for col, dtype in self.workflow.input_dtypes.items()
        if not is_list_dtype(dtype)
    }
    self.input_multihots = {
        col: dtype
        for col, dtype in self.workflow.input_dtypes.items()
        if is_list_dtype(dtype)
    }

    self.output_dtypes = dict()
    for name, dtype in self.workflow.output_dtypes.items():
        if not is_list_dtype(dtype):
            self._set_output_dtype(name)
        else:
            # pytorch + hugectr don't support multihot output features at inference
            if self.output_model in {"hugectr", "pytorch"}:
                raise ValueError(
                    f"{self.output_model} doesn't yet support multihot features"
                )
            self._set_output_dtype(name + "__nnzs")
            self._set_output_dtype(name + "__values")

    if self.output_model == "hugectr":
        self.column_types = get_column_types(workflow_path)
        self.offsets = get_hugectr_offsets(workflow_path)
        if self.offsets is None and "cats" in self.column_types:
            raise Exception(
                "slot_size_array.json could not be found to read the slot sizes"
            )
    else:
        self.column_types = self.offsets = None
def execute(self, requests: List[InferenceRequest]) -> List[InferenceResponse]:
    """Transforms the input batches by running them through an NVTabular
    workflow.transform function.
    """
    responses = []
    for request in requests:
        # create a cudf DataFrame from the triton request
        input_df = cudf.DataFrame(
            {
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            }
        )

        for name, dtype in self.input_multihots.items():
            values = as_column(
                _convert_tensor(get_input_tensor_by_name(request, name + "__values"))
            )
            nnzs = as_column(
                _convert_tensor(get_input_tensor_by_name(request, name + "__nnzs"))
            )
            input_df[name] = build_column(
                None, dtype=dtype, size=nnzs.size - 1, children=(nnzs, values)
            )

        # use our NVTabular workflow to transform the dataframe
        output_df = nvtabular.workflow._transform_partition(
            input_df, [self.workflow.column_group]
        )

        # convert back to a triton response
        output_tensors = []
        for name in output_df.columns:
            col = output_df[name]
            if is_list_dtype(col.dtype):
                # convert list values to match TF dataloader
                values = col.list.leaves.values_host.astype(
                    self.output_dtypes[name + "__values"]
                )
                values = values.reshape(len(values), 1)
                output_tensors.append(Tensor(name + "__values", values))

                offsets = col._column.offsets.values_host.astype(
                    self.output_dtypes[name + "__nnzs"]
                )
                nnzs = offsets[1:] - offsets[:-1]
                nnzs = nnzs.reshape(len(nnzs), 1)
                output_tensors.append(Tensor(name + "__nnzs", nnzs))
            else:
                d = col.values_host.astype(self.output_dtypes[name])
                d = d.reshape(len(d), 1)
                output_tensors.append(Tensor(name, d))

        responses.append(InferenceResponse(output_tensors))

    return responses
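# The "__values"/"__nnzs" convention above comes straight from cudf's list
# layout: a flat values buffer plus an offsets buffer. A minimal sketch of
# that representation (assumes a working cudf install):
import cudf

s = cudf.Series([[1, 2], [3], [4, 5, 6]])
values = s.list.leaves.values_host       # array([1, 2, 3, 4, 5, 6])
offsets = s._column.offsets.values_host  # array([0, 2, 3, 6])
nnzs = offsets[1:] - offsets[:-1]        # per-row lengths: array([2, 1, 3])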
def sort_values(
    self,
    ascending: bool = True,
    inplace: bool = False,
    kind: str = "quicksort",
    na_position: str = "last",
    ignore_index: bool = False,
) -> ParentType:
    """
    Sort each list by the values.

    Sort the lists in ascending or descending order by some criterion.

    Parameters
    ----------
    ascending : bool, default True
        If True, sort values in ascending order, otherwise descending.
    na_position : {'first', 'last'}, default 'last'
        'first' puts nulls at the beginning, 'last' puts nulls at the end.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    Series or Index with each list sorted

    Notes
    -----
    Difference from pandas:
      * Not supporting: `inplace`, `kind`

    Examples
    --------
    >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
    >>> s.list.sort_values(ascending=True, na_position="last")
    0    [2.0, 4.0, 9.0, nan]
    1         [2.0, 8.0, 8.0]
    2              [1.0, 2.0]
    dtype: list
    """
    if inplace:
        raise NotImplementedError("`inplace` not currently implemented.")
    if kind != "quicksort":
        raise NotImplementedError("`kind` not currently implemented.")
    if na_position not in {"first", "last"}:
        raise ValueError(f"Unknown `na_position` value {na_position}")
    if is_list_dtype(self._column.children[1].dtype):
        raise NotImplementedError("Nested lists sort is not supported.")

    return self._return_or_inplace(
        sort_lists(self._column, ascending, na_position),
        retain_index=not ignore_index,
    )
def merge_cats_encoding(self, ser, cats):
    # ser and cats are both series
    # set cats to dfs
    offs = None
    if is_list_dtype(ser.dtype):
        offs = ser._column.offsets
        ser = ser.list.leaves
    ser = cudf.DataFrame({"vals": ser})
    cats = cudf.DataFrame({"names": cats})
    cats["vals"] = cats.index
    ser = ser.merge(cats, on=["vals"], how="left")
    return ser["names"], offs
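# Toy illustration of the left-join lookup above (a sketch assuming cudf;
# note that cudf merges don't guarantee row order, which callers must handle):
import cudf

codes = cudf.DataFrame({"vals": [2, 0, 2]})
cats = cudf.DataFrame({"names": ["apple", "pear", "plum"]})
cats["vals"] = cats.index
merged = codes.merge(cats, on=["vals"], how="left")
# merged["names"] now carries the category label for each original code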
def transform(self, columns: ColumnNames, gdf: cudf.DataFrame) -> cudf.DataFrame:
    if isinstance(self.num_buckets, int):
        num_buckets = {name: self.num_buckets for name in columns}
    else:
        num_buckets = self.num_buckets

    for col, nb in num_buckets.items():
        if is_list_dtype(gdf[col].dtype):
            gdf[col] = _encode_list_column(
                gdf[col], gdf[col].list.leaves.hash_values() % nb
            )
        else:
            gdf[col] = gdf[col].hash_values() % nb
    return gdf
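# Minimal sketch of the scalar branch above on a toy frame (assumes cudf;
# the column name and bucket count are made up for illustration):
import cudf

gdf = cudf.DataFrame({"user_id": ["a", "b", "a", "c"]})
gdf["user_id"] = gdf["user_id"].hash_values() % 10  # bucket ids in [0, 10)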
def _add_model_param(column, dtype, paramclass, params, dims=[-1, 1]):
    if is_list_dtype(dtype):
        params.append(
            paramclass(
                name=column + "__values",
                data_type=_convert_dtype(dtype.element_type),
                dims=dims,
            )
        )
        params.append(
            paramclass(
                name=column + "__nnzs",
                data_type=model_config.TYPE_INT64,
                dims=dims,
            )
        )
    else:
        params.append(
            paramclass(name=column, data_type=_convert_dtype(dtype), dims=dims)
        )
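# Toy illustration of the two-tensor convention the list branch above emits,
# using plain dicts in place of the Triton protobuf messages (a sketch; the
# column name "genres" and the data_type strings are illustrative only):
params = []
for suffix, data_type in [("__values", "TYPE_INT64"), ("__nnzs", "TYPE_INT64")]:
    params.append({"name": "genres" + suffix, "data_type": data_type, "dims": [-1, 1]})
print([p["name"] for p in params])  # ['genres__values', 'genres__nnzs']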
def initialize(self, args):
    workflow_path = os.path.join(
        args["model_repository"], str(args["model_version"]), "workflow"
    )
    self.workflow = nvtabular.Workflow.load(workflow_path)
    self.model_config = json.loads(args["model_config"])

    self.input_dtypes = {
        col: dtype
        for col, dtype in self.workflow.input_dtypes.items()
        if not is_list_dtype(dtype)
    }
    self.input_multihots = {
        col: dtype
        for col, dtype in self.workflow.input_dtypes.items()
        if is_list_dtype(dtype)
    }

    self.output_dtypes = dict()
    for name, dtype in self.workflow.output_dtypes.items():
        if not is_list_dtype(dtype):
            self._set_output_dtype(name)
        else:
            self._set_output_dtype(name + "__nnzs")
            self._set_output_dtype(name + "__values")
def get_row_size(self, row, cats_rep):
    """
    row = cudf.DataFrame comprising a single row
    """
    size = 0
    for col in row.columns:
        if is_list_dtype(row[col].dtype):
            # second from last position is max list length;
            # find the correct cats_rep by scanning through all for the column name
            tar = self.find_target_rep(col, cats_rep)
            # else use the default of 1
            val = tar.multi_max if tar else 1
            size = size + row[col]._column.elements.dtype.itemsize * val
        else:
            size = size + row[col].dtype.itemsize
    return size
def _hash_bucket(gdf, num_buckets, col, encode_type="joint"):
    if encode_type == "joint":
        nb = num_buckets[col[0]]
        if is_list_dtype(gdf[col[0]].dtype):
            encoded = gdf[col[0]].list.leaves.hash_values() % nb
        else:
            encoded = gdf[col[0]].hash_values() % nb
    elif encode_type == "combo":
        if len(col) > 1:
            name = _make_name(*tuple(col), sep="_")
        else:
            name = col[0]
        nb = num_buckets[name]
        val = 0
        for column in col:
            # or however we want to do this aggregation
            val ^= gdf[column].hash_values()
        val = val % nb
        encoded = val
    return encoded
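# Sketch of the "combo" XOR aggregation above on a toy frame (assumes cudf;
# the bucket count 8 is made up -- the real one comes from num_buckets):
import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
val = gdf["a"].hash_values() ^ gdf["b"].hash_values()
encoded = val % 8  # one joint bucket id per ("a", "b") pair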
def add_data(self, gdf):
    # Populate column indexes
    if not self.col_idx:
        for i, x in enumerate(gdf.columns.values):
            self.col_idx[str(x)] = i

    # list columns in cudf don't currently support chunked writing in parquet.
    # hack around this by just writing a single file with this partition
    # this restriction can be removed once cudf supports chunked writing
    # in parquet
    if any(is_list_dtype(gdf[col].dtype) for col in gdf.columns):
        self._write_table(0, gdf, True)
        return

    # Generate `ind` array to map each row to an output file.
    # This approach is certainly more optimized for shuffling
    # than it is for non-shuffling, but using a single code
    # path is probably worth the (possible) minor overhead.
    nrows = gdf.shape[0]
    typ = np.min_scalar_type(nrows * 2)
    if self.shuffle:
        ind = cp.random.choice(cp.arange(self.num_out_files, dtype=typ), nrows)
    else:
        ind = cp.arange(nrows, dtype=typ)
        cp.floor_divide(ind, math.ceil(nrows / self.num_out_files), out=ind)

    for x, group in enumerate(
        gdf.scatter_by_map(ind, map_size=self.num_out_files, keep_index=False)
    ):
        self.num_samples[x] += len(group)
        if self.num_threads > 1:
            self.queue.put((x, group))
        else:
            self._write_table(x, group)

    # wait for all writes to finish before exiting
    # (so that we aren't using memory)
    if self.num_threads > 1:
        self.queue.join()
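# Quick standalone check of the non-shuffled file-assignment math above
# (pure cupy arithmetic; the row/file counts are chosen just for illustration):
import math
import cupy as cp

nrows, num_out_files = 10, 3
ind = cp.arange(nrows, dtype="int64")
cp.floor_divide(ind, math.ceil(nrows / num_out_files), out=ind)
print(ind)  # [0 0 0 0 1 1 1 1 2 2]: contiguous row ranges per output file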
def convert_df_to_triton_input(column_names, batch, input_class=grpcclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = []
    for i, (name, col) in enumerate(columns):
        if is_list_dtype(col):
            inputs.append(
                _convert_column_to_triton_input(
                    col._column.offsets.values_host.astype("int64"),
                    name + "__nnzs",
                    input_class,
                )
            )
            inputs.append(
                _convert_column_to_triton_input(
                    col.list.leaves.values_host.astype("int64"),
                    name + "__values",
                    input_class,
                )
            )
        else:
            inputs.append(
                _convert_column_to_triton_input(col.values_host, name, input_class)
            )
    return inputs
def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None):
    cat_names = target_columns
    if isinstance(self.num_buckets, int):
        num_buckets = {name: self.num_buckets for name in cat_names}
    else:
        num_buckets = self.num_buckets

    new_gdf = cudf.DataFrame()
    for col, nb in num_buckets.items():
        new_col = f"{col}_{self._id}"
        if is_list_dtype(gdf[col].dtype):
            encoded = _encode_list_column(
                gdf[col], gdf[col].list.leaves.hash_values() % nb
            )
        else:
            encoded = gdf[col].hash_values() % nb
        new_gdf[new_col] = encoded
    return new_gdf
def __init__(self, parent: ParentType):
    if not is_list_dtype(parent.dtype):
        raise AttributeError(
            "Can only use .list accessor with a 'list' dtype"
        )
    super().__init__(parent=parent)
def _is_list_col(column_group, df):
    has_lists = any(is_list_dtype(df[col]) for col in column_group)
    if has_lists and len(column_group) != 1:
        raise ValueError("Can't categorical encode multiple list columns")
    return has_lists
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument into numerical types.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types
    errors : {'raise', 'ignore', 'coerce'}, default 'raise'
        Policy to handle errors during parsing.

        * 'raise' will notify the user of all errors encountered.
        * 'ignore' will skip errors and return ``arg``.
        * 'coerce' will leave invalid values as nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If set, will try to down-convert the dtype of the parsed results
        to the smallest possible type. For each `downcast` type, this
        method will determine the smallest possible dtype from the
        following sets:

        * {'integer', 'signed'}: all integer types greater or equal to
          `np.int8`
        * {'unsigned'}: all unsigned types greater or equal to `np.uint8`
        * {'float'}: all floating types greater or equal to `np.float32`

        Note that downcast behavior is decoupled from parsing. Errors
        encountered during downcast are raised regardless of the
        ``errors`` parameter.

    Returns
    -------
    Series or ndarray
        Depending on the input: if a series is passed in, a series is
        returned, otherwise an ndarray

    Notes
    -----
    An important difference from pandas is that this function does not
    accept mixed numeric/non-numeric type sequences, for example
    ``[1, 'a']``. A ``TypeError`` will be raised when such input is
    received, regardless of the ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """
    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    if not can_convert_to_column(arg) or (hasattr(arg, "ndim") and arg.ndim > 1):
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        col = col.as_numerical_column(np.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if is_numerical_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            try:
                col = _convert_str_col(
                    col._get_decategorized_column(), errors, downcast
                )
            except ValueError as e:
                if errors == "ignore":
                    return arg
                else:
                    raise e
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError as e:
            if errors == "ignore":
                return arg
            else:
                raise e
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif is_numerical_dtype(dtype):
        pass
    else:
        raise ValueError("Unrecognized datatype")

    # str->float conversion may require lower precision
    if col.dtype == np.dtype("f"):
        col = col.as_numerical_column("d")

    if downcast:
        downcast_type_map = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
        }
        float_types = list(np.typecodes["Float"])
        idx = float_types.index(np.dtype(np.float32).char)
        downcast_type_map["float"] = float_types[idx:]

        type_set = downcast_type_map[downcast]

        for t in type_set:
            downcast_dtype = np.dtype(t)
            if downcast_dtype.itemsize <= col.dtype.itemsize:
                if col.can_cast_safely(downcast_dtype):
                    col = libcudf.unary.cast(col, downcast_dtype)
                    break

    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)
    else:
        col = col.fillna(col.default_na_value())
        return col.values
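# Standalone check of the downcast candidate sets built above -- numpy
# typecodes are ordered smallest-first, so the loop tries the narrowest
# safe dtype first (pure numpy, no cudf needed):
import numpy as np

float_types = list(np.typecodes["Float"])           # ['e', 'f', 'd', 'g']
idx = float_types.index(np.dtype(np.float32).char)  # skip half precision
print(float_types[idx:])                            # ['f', 'd', 'g']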