Example #1
    def unique(self) -> ParentType:
        """
        Returns the unique elements in each list.
        The ordering of elements is not guaranteed.

        Returns
        -------
        Series or Index

        Examples
        --------
        >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
        >>> s
        0    [1.0, 1.0, 2.0, nan, nan]
        1                         None
        2                   [4.0, 4.0]
        3                           []
        dtype: list
        >>> s.list.unique() # Order of list elements is not guaranteed
        0              [1.0, 2.0, nan]
        1                         None
        2                        [4.0]
        3                           []
        dtype: list
        """

        if is_list_dtype(self._column.children[1].dtype):
            raise NotImplementedError("Nested lists unique is not supported.")

        return self._return_or_inplace(
            drop_list_duplicates(self._column,
                                 nulls_equal=True,
                                 nans_all_equal=True))
Example #2
    def __init__(self, column, parent=None):
        if not is_list_dtype(column.dtype):
            raise AttributeError(
                "Can only use .list accessor with a 'list' dtype"
            )
        self._column = column
        self._parent = parent
Example #3
def _is_list_dtype(s):
    """Check if Series contains list elements"""
    if isinstance(s, pd.Series):
        if not len(s):  # pylint: disable=len-as-condition
            return False
        return pd.api.types.is_list_like(s.values[0])
    else:
        return is_list_dtype(s)
Example #4
def _is_list_dtype(s):
    """Check if Series contains list elements"""
    if isinstance(s, pd.Series):
        if not len(s):
            return False
        return pd.api.types.is_list_like(s.values[0])
    else:
        return is_list_dtype(s)
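The pandas branch of this helper only inspects the first element, which is why the empty-Series case has to return False early. A minimal pandas-only sketch (the sample Series below are made up) illustrates that behavior:

import pandas as pd

def _is_list_dtype_pandas_only(s):
    # Stand-in for the pandas branch above: an empty Series has no first
    # element to inspect, so it is treated as non-list.
    if not len(s):
        return False
    return pd.api.types.is_list_like(s.values[0])

print(_is_list_dtype_pandas_only(pd.Series([[1, 2], [3]])))     # True
print(_is_list_dtype_pandas_only(pd.Series([1, 2, 3])))         # False
print(_is_list_dtype_pandas_only(pd.Series([], dtype=object)))  # False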
Example #5
    def _separate_list_columns(self, gdf):
        lists, scalars = [], []
        for col in gdf.columns:
            if is_list_dtype(gdf[col]):
                lists.append(col)
            else:
                scalars.append(col)
        return _get_embedding_order(scalars), _get_embedding_order(lists)
Example #6
    def fit_finalize(self, dask_stats):
        dtypes = dask_stats[1]
        self.mh_columns = [
            col for col, dtype in zip(dtypes.index, dtypes)
            if is_list_dtype(dtype)
        ]
        categories = dask_stats[0]
        for col in categories:
            self.categories[col] = categories[col]
Example #7
    def initialize(self, args):
        workflow_path = os.path.join(args["model_repository"],
                                     str(args["model_version"]), "workflow")
        self.workflow = nvtabular.Workflow.load(workflow_path)
        self.model_config = json.loads(args["model_config"])
        self.output_model = self.model_config["parameters"]["output_model"][
            "string_value"]

        # recurse over all column groups, initializing operators for inference pipeline
        self._initialize_ops(self.workflow.column_group)

        self.input_dtypes = {
            col: dtype
            for col, dtype in self.workflow.input_dtypes.items()
            if not is_list_dtype(dtype)
        }
        self.input_multihots = {
            col: dtype
            for col, dtype in self.workflow.input_dtypes.items()
            if is_list_dtype(dtype)
        }

        self.output_dtypes = dict()
        for name, dtype in self.workflow.output_dtypes.items():
            if not is_list_dtype(dtype):
                self._set_output_dtype(name)
            else:
                # pytorch + hugectr don't support multihot output features at inference
                if self.output_model in {"hugectr", "pytorch"}:
                    raise ValueError(
                        f"{self.output_model} doesn't yet support multihot features"
                    )
                self._set_output_dtype(name + "__nnzs")
                self._set_output_dtype(name + "__values")

        if self.output_model == "hugectr":
            self.column_types = get_column_types(workflow_path)
            self.offsets = get_hugectr_offsets(workflow_path)
            if self.offsets is None and "cats" in self.column_types:
                raise Exception(
                    "slot_size_array.json could not be found to read the slot sizes"
                )
        else:
            self.column_types = self.offsets = None
Example #8
    def execute(self,
                requests: List[InferenceRequest]) -> List[InferenceResponse]:
        """Transforms the input batches by running through a NVTabular workflow.transform
        function.
        """
        responses = []
        for request in requests:
            # create a cudf DataFrame from the triton request
            input_df = cudf.DataFrame({
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            })

            for name, dtype in self.input_multihots.items():
                values = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__values")))
                nnzs = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__nnzs")))
                input_df[name] = build_column(None,
                                              dtype=dtype,
                                              size=nnzs.size - 1,
                                              children=(nnzs, values))

            # use our NVTabular workflow to transform the dataframe
            output_df = nvtabular.workflow._transform_partition(
                input_df, [self.workflow.column_group])

            # convert back to a triton response
            output_tensors = []
            for name in output_df.columns:
                col = output_df[name]
                if is_list_dtype(col.dtype):
                    # convert list values to match TF dataloader
                    values = col.list.leaves.values_host.astype(
                        self.output_dtypes[name + "__values"])
                    values = values.reshape(len(values), 1)
                    output_tensors.append(Tensor(name + "__values", values))

                    offsets = col._column.offsets.values_host.astype(
                        self.output_dtypes[name + "__nnzs"])
                    nnzs = offsets[1:] - offsets[:-1]
                    nnzs = nnzs.reshape(len(nnzs), 1)
                    output_tensors.append(Tensor(name + "__nnzs", nnzs))
                else:
                    d = col.values_host.astype(self.output_dtypes[name])
                    d = d.reshape(len(d), 1)
                    output_tensors.append(Tensor(name, d))

            responses.append(InferenceResponse(output_tensors))

        return responses
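The list-column handling above can be pictured with plain NumPy: a ragged column travels as one flat "__values" array plus per-row lengths in "__nnzs", which are the differences of the cudf offsets. A small sketch with made-up data:

import numpy as np

# Ragged column [[1, 2, 3], [4], [5, 6]] in offsets/values form.
values = np.array([1, 2, 3, 4, 5, 6], dtype="int64")
offsets = np.array([0, 3, 4, 6], dtype="int64")

nnzs = offsets[1:] - offsets[:-1]            # per-row lengths: [3, 1, 2]
values_out = values.reshape(len(values), 1)  # shape (6, 1), as sent to Triton
nnzs_out = nnzs.reshape(len(nnzs), 1)        # shape (3, 1), as sent to Triton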
Example #9
    def sort_values(
        self,
        ascending: bool = True,
        inplace: bool = False,
        kind: str = "quicksort",
        na_position: str = "last",
        ignore_index: bool = False,
    ) -> ParentType:
        """
        Sort each list by the values.

        Sort the lists in ascending or descending order by some criterion.

        Parameters
        ----------
        ascending : bool, default True
            If True, sort values in ascending order, otherwise descending.
        na_position : {'first', 'last'}, default 'last'
            'first' puts nulls at the beginning, 'last' puts nulls at the end.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, ..., n - 1.

        Returns
        -------
        Series or Index with each list sorted

        Notes
        -----
        Difference from pandas:
          * Not supporting: `inplace`, `kind`

        Examples
        --------
        >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
        >>> s.list.sort_values(ascending=True, na_position="last")
        0    [2.0, 4.0, 9.0, nan]
        1         [2.0, 8.0, 8.0]
        2              [1.0, 2.0]
        dtype: list
        """
        if inplace:
            raise NotImplementedError("`inplace` not currently implemented.")
        if kind != "quicksort":
            raise NotImplementedError("`kind` not currently implemented.")
        if na_position not in {"first", "last"}:
            raise ValueError(f"Unknown `na_position` value {na_position}")
        if is_list_dtype(self._column.children[1].dtype):
            raise NotImplementedError("Nested lists sort is not supported.")

        return self._return_or_inplace(
            sort_lists(self._column, ascending, na_position),
            retain_index=not ignore_index,
        )
Example #10
    def merge_cats_encoding(self, ser, cats):
        # ser and cats are both Series
        # join the encoded codes back to their category names
        offs = None
        if is_list_dtype(ser.dtype):
            offs = ser._column.offsets
            ser = ser.list.leaves
        ser = cudf.DataFrame({"vals": ser})
        cats = cudf.DataFrame({"names": cats})
        cats["vals"] = cats.index
        ser = ser.merge(cats, on=["vals"], how="left")
        return ser["names"], offs
Example #11
    def transform(self, columns: ColumnNames,
                  gdf: cudf.DataFrame) -> cudf.DataFrame:
        if isinstance(self.num_buckets, int):
            num_buckets = {name: self.num_buckets for name in columns}
        else:
            num_buckets = self.num_buckets

        for col, nb in num_buckets.items():
            if is_list_dtype(gdf[col].dtype):
                gdf[col] = _encode_list_column(
                    gdf[col], gdf[col].list.leaves.hash_values() % nb)
            else:
                gdf[col] = gdf[col].hash_values() % nb
        return gdf
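Hash bucketing itself is just a modulo over per-row hashes. A pandas sketch with made-up data, using pd.util.hash_pandas_object as a stand-in for cudf's hash_values and an arbitrary bucket count:

import pandas as pd

s = pd.Series(["apple", "banana", "cherry", "apple"])
nb = 8
buckets = pd.util.hash_pandas_object(s, index=False) % nb
# Equal inputs land in the same bucket; all values fall in [0, nb).
print(buckets.tolist())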
Example #12
def _add_model_param(column, dtype, paramclass, params, dims=[-1, 1]):
    if is_list_dtype(dtype):
        params.append(
            paramclass(name=column + "__values",
                       data_type=_convert_dtype(dtype.element_type),
                       dims=dims))
        params.append(
            paramclass(name=column + "__nnzs",
                       data_type=model_config.TYPE_INT64,
                       dims=dims))
    else:
        params.append(
            paramclass(name=column, data_type=_convert_dtype(dtype),
                       dims=dims))
Example #13
    def initialize(self, args):
        workflow_path = os.path.join(args["model_repository"],
                                     str(args["model_version"]), "workflow")
        self.workflow = nvtabular.Workflow.load(workflow_path)
        self.model_config = json.loads(args["model_config"])

        self.input_dtypes = {
            col: dtype
            for col, dtype in self.workflow.input_dtypes.items()
            if not is_list_dtype(dtype)
        }
        self.input_multihots = {
            col: dtype
            for col, dtype in self.workflow.input_dtypes.items()
            if is_list_dtype(dtype)
        }

        self.output_dtypes = dict()
        for name, dtype in self.workflow.output_dtypes.items():
            if not is_list_dtype(dtype):
                self._set_output_dtype(name)
            else:
                self._set_output_dtype(name + "__nnzs")
                self._set_output_dtype(name + "__values")
Example #14
    def get_row_size(self, row, cats_rep):
        """
        row = cudf.DataFrame consisting of a single row
        """
        size = 0
        for col in row.columns:
            if is_list_dtype(row[col].dtype):
                # find the matching cats_rep entry for this column; its
                # multi_max holds the maximum list length, defaulting to 1
                # when no entry matches
                tar = self.find_target_rep(col, cats_rep)
                val = tar.multi_max if tar else 1
                size = size + row[col]._column.elements.dtype.itemsize * val
            else:
                size = size + row[col].dtype.itemsize
        return size
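The size arithmetic above can be checked by hand: a scalar column contributes its dtype's itemsize per row, while a list column contributes the leaf itemsize times the assumed maximum list length (multi_max). A made-up NumPy example:

import numpy as np

scalar_bytes = np.dtype("float64").itemsize   # 8 bytes per row
list_bytes = np.dtype("int32").itemsize * 5   # int32 leaves, multi_max of 5 -> 20 bytes
print(scalar_bytes + list_bytes)              # 28 bytes estimated for this row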
Example #15
def _hash_bucket(gdf, num_buckets, col, encode_type="joint"):
    if encode_type == "joint":
        nb = num_buckets[col[0]]
        if is_list_dtype(gdf[col[0]].dtype):
            encoded = gdf[col[0]].list.leaves.hash_values() % nb
        else:
            encoded = gdf[col[0]].hash_values() % nb
    elif encode_type == "combo":
        if len(col) > 1:
            name = _make_name(*tuple(col), sep="_")
        else:
            name = col[0]
        nb = num_buckets[name]
        val = 0
        for column in col:
            val ^= gdf[column].hash_values()  # or however we want to do this aggregation
        val = val % nb
        encoded = val
    return encoded
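For the "combo" branch, one bucket id per row is produced from several columns by XOR-ing their per-column hashes before taking the modulo. A pandas sketch with made-up data (pandas hashing stands in for cudf's hash_values):

import pandas as pd

gdf = pd.DataFrame({"user": ["a", "b", "a"], "item": ["x", "x", "y"]})
val = 0
for column in ["user", "item"]:
    val ^= pd.util.hash_pandas_object(gdf[column], index=False)
encoded = val % 16
print(encoded.tolist())   # one bucket id per row, derived from both columns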
Example #16
    def add_data(self, gdf):
        # Populate columns idxs
        if not self.col_idx:
            for i, x in enumerate(gdf.columns.values):
                self.col_idx[str(x)] = i

        # list columns in cudf don't currently support chunked writing in parquet.
        # hack around this by just writing a single file with this partition
        # this restriction can be removed once cudf supports chunked writing
        # in parquet
        if any(is_list_dtype(gdf[col].dtype) for col in gdf.columns):
            self._write_table(0, gdf, True)
            return

        # Generate `ind` array to map each row to an output file.
        # This approach is certainly more optimized for shuffling
        # than it is for non-shuffling, but using a single code
        # path is probably worth the (possible) minor overhead.
        nrows = gdf.shape[0]
        typ = np.min_scalar_type(nrows * 2)
        if self.shuffle:
            ind = cp.random.choice(cp.arange(self.num_out_files, dtype=typ),
                                   nrows)
        else:
            ind = cp.arange(nrows, dtype=typ)
            cp.floor_divide(ind,
                            math.ceil(nrows / self.num_out_files),
                            out=ind)
        for x, group in enumerate(
                gdf.scatter_by_map(ind,
                                   map_size=self.num_out_files,
                                   keep_index=False)):
            self.num_samples[x] += len(group)
            if self.num_threads > 1:
                self.queue.put((x, group))
            else:
                self._write_table(x, group)

        # wait for all writes to finish before exiting
        # (so that queued partitions aren't left buffered in memory)
        if self.num_threads > 1:
            self.queue.join()
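The non-shuffling row-to-file mapping above is simply contiguous chunking by floor division; a NumPy sketch with made-up sizes shows the resulting ind array:

import math
import numpy as np

nrows, num_out_files = 10, 3
ind = np.arange(nrows)
np.floor_divide(ind, math.ceil(nrows / num_out_files), out=ind)
print(ind)   # [0 0 0 0 1 1 1 1 2 2] -> rows grouped into 3 output files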
Example #17
def convert_df_to_triton_input(column_names,
                               batch,
                               input_class=grpcclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = []
    for i, (name, col) in enumerate(columns):
        if is_list_dtype(col):
            inputs.append(
                _convert_column_to_triton_input(
                    col._column.offsets.values_host.astype("int64"),
                    name + "__nnzs", input_class))
            inputs.append(
                _convert_column_to_triton_input(
                    col.list.leaves.values_host.astype("int64"),
                    name + "__values", input_class))
        else:
            inputs.append(
                _convert_column_to_triton_input(col.values_host, name,
                                                input_class))
    return inputs
Example #18
    def op_logic(self,
                 gdf: cudf.DataFrame,
                 target_columns: list,
                 stats_context=None):
        cat_names = target_columns
        if isinstance(self.num_buckets, int):
            num_buckets = {name: self.num_buckets for name in cat_names}
        else:
            num_buckets = self.num_buckets

        new_gdf = cudf.DataFrame()
        for col, nb in num_buckets.items():
            new_col = f"{col}_{self._id}"
            if is_list_dtype(gdf[col].dtype):
                encoded = _encode_list_column(
                    gdf[col], gdf[col].list.leaves.hash_values() % nb)
            else:
                encoded = gdf[col].hash_values() % nb

            new_gdf[new_col] = encoded
        return new_gdf
Example #19
    def __init__(self, parent: ParentType):
        if not is_list_dtype(parent.dtype):
            raise AttributeError(
                "Can only use .list accessor with a 'list' dtype")
        super().__init__(parent=parent)
Example #20
def _is_list_col(column_group, df):
    has_lists = any(is_list_dtype(df[col]) for col in column_group)
    if has_lists and len(column_group) != 1:
        raise ValueError("Can't categorical encode multiple list columns")
    return has_lists
Example #21
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument into numerical types.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types
    errors : {'raise', 'ignore', 'coerce'}, default 'raise'
        Policy to handle errors during parsing.

        * 'raise' raises an exception for any error encountered.
        * 'ignore' skips errors and returns ``arg`` unchanged.
        * 'coerce' leaves invalid values as nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If set, will try to down-convert the datatype of the
        parsed results to the smallest possible type. For each `downcast`
        type, this method will determine the smallest possible
        dtype from the following sets:

        * {'integer', 'signed'}: all integer types greater or equal to
          `np.int8`
        * {'unsigned'}: all unsigned types greater or equal to `np.uint8`
        * {'float'}: all floating types greater or equal to `np.float32`

        Note that downcast behavior is decoupled from parsing. Errors
        encountered during downcast are raised regardless of the ``errors``
        parameter.

    Returns
    -------
    Series or ndarray
        If a Series is passed in, a Series is returned; otherwise an
        ndarray is returned.

    Notes
    -----
    An important difference from pandas is that this function does not accept
    mixed numeric/non-numeric type sequences, for example ``[1, 'a']``.
    A ``TypeError`` will be raised when such input is received, regardless of
    the ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """

    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    if not can_convert_to_column(arg) or (hasattr(arg, "ndim")
                                          and arg.ndim > 1):
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        col = col.as_numerical_column(np.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if is_numerical_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            try:
                col = _convert_str_col(col._get_decategorized_column(), errors,
                                       downcast)
            except ValueError as e:
                if errors == "ignore":
                    return arg
                else:
                    raise e
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError as e:
            if errors == "ignore":
                return arg
            else:
                raise e
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif is_numerical_dtype(dtype):
        pass
    else:
        raise ValueError("Unrecognized datatype")

    # str->float conversion may require lower precision
    if col.dtype == np.dtype("f"):
        col = col.as_numerical_column("d")

    if downcast:
        downcast_type_map = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
        }
        float_types = list(np.typecodes["Float"])
        idx = float_types.index(np.dtype(np.float32).char)
        downcast_type_map["float"] = float_types[idx:]

        type_set = downcast_type_map[downcast]

        for t in type_set:
            downcast_dtype = np.dtype(t)
            if downcast_dtype.itemsize <= col.dtype.itemsize:
                if col.can_cast_safely(downcast_dtype):
                    col = libcudf.unary.cast(col, downcast_dtype)
                    break

    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)
    else:
        col = col.fillna(col.default_na_value())
        return col.values
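The downcast loop walks np.typecodes from the narrowest dtype to the widest and stops at the first one the data fits in. A NumPy-only sketch of that search (using value ranges instead of cudf's can_cast_safely):

import numpy as np

values = np.array([1, 2, 3000], dtype="int64")
for t in np.typecodes["Integer"]:          # int8, int16, int32, ...
    info = np.iinfo(np.dtype(t))
    if info.min <= values.min() and values.max() <= info.max:
        print(np.dtype(t))                 # int16, as in the docstring example
        break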