Example #1
def _encode_list_column(original, encoded, dtype=None):
    """Convert `encoded` to be a list column with the
    same offsets as `original`
    """
    if isinstance(original, pd.Series):
        # Pandas version (not very efficient)
        offset = 0
        new_data = []
        for val in original.values:
            size = len(val)
            new_data.append(
                np.array(encoded[offset:offset + size], dtype=dtype))
            offset += size
        return pd.Series(new_data)
    else:
        # CuDF version
        encoded = as_column(encoded)
        if dtype:
            encoded = encoded.astype(dtype, copy=False)
        list_dtype = cudf.core.dtypes.ListDtype(
            encoded.dtype if dtype is None else dtype)
        return build_column(
            None,
            dtype=list_dtype,
            size=original.size,
            children=(original._column.offsets, encoded),
        )
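A minimal usage sketch for the pandas branch (the inputs below are illustrative, not from the source): the flat `encoded` array is split back into rows matching the sizes of `original`.

original = pd.Series([["a", "b"], ["c", "d", "e"]])   # row sizes 2 and 3
encoded = np.array([1, 2, 3, 4, 5])
result = _encode_list_column(original, encoded, dtype="int64")
# result: pd.Series([array([1, 2]), array([3, 4, 5])])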
Example #2
def _encode_list_column(original, encoded):
    # reuse the original list column's offsets with `encoded` as the new leaf values
    encoded = as_column(encoded)
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
        size=original.size,
        children=(original._column.offsets, encoded),
    )
Example #3
    def as_timedelta_column(self, dtype, **kwargs):
        # reinterpret the underlying int64 ticks under the timedelta dtype
        return build_column(
            data=self.astype("int64").base_data,
            dtype=dtype,
            mask=self.base_mask,
            offset=self.offset,
            size=self.size,
        )
Example #4
def _build_cudf_list_column(new_elements, new_offsets):
    if not HAS_GPU:
        return []
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
        size=new_offsets.size - 1,
        children=(as_column(new_offsets), as_column(new_elements)),
    )
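For reference, the offsets/elements pair passed as `children` encodes a ragged column: all list values sit in one flat array, and consecutive offsets mark each row's boundaries. A plain NumPy illustration of the layout (values are made up):

import numpy as np

offsets = np.array([0, 2, 5])
elements = np.array([10, 11, 20, 21, 22])
rows = [elements[offsets[i]:offsets[i + 1]] for i in range(offsets.size - 1)]
# rows == [array([10, 11]), array([20, 21, 22])]; note size = offsets.size - 1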
Example #5
File: datetime.py Project: zivzone/cudf
    def as_numerical(self):
        from cudf.core.column import build_column

        return build_column(
            data=self.base_data,
            dtype=np.int64,
            mask=self.base_mask,
            offset=self.offset,
            size=self.size,
        )
Example #6
    def as_datetime_column(self, dtype, **kwargs):
        from cudf.core.column import build_column

        return build_column(
            data=self.astype("int64").base_data,
            dtype=dtype,
            mask=self.base_mask,
            offset=self.offset,
            size=self.size,
        )
Example #7
    def as_timedelta_column(self, dtype: Dtype,
                            **kwargs) -> "cudf.core.column.TimeDeltaColumn":
        return cast(
            "cudf.core.column.TimeDeltaColumn",
            build_column(
                data=self.astype("int64").base_data,
                dtype=dtype,
                mask=self.base_mask,
                offset=self.offset,
                size=self.size,
            ),
        )
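Examples #3, #5, #6, and #7 all rely on the same trick: datetime and timedelta columns store their values as 64-bit integer ticks, so casting to int64 and rebuilding under the target dtype relabels the same buffer rather than converting values. A rough NumPy analogue of the idea (not the cuDF internals):

import numpy as np

ticks = np.array([0, 86_400_000_000_000], dtype="int64")  # nanosecond ticks
dates = ticks.view("datetime64[ns]")  # same memory, new dtype
# dates: ['1970-01-01T00:00:00', '1970-01-02T00:00:00']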
Example #8
    def execute(self,
                requests: List[InferenceRequest]) -> List[InferenceResponse]:
        """Transforms the input batches by running through a NVTabular workflow.transform
        function.
        """
        responses = []
        for request in requests:
            # create a cudf DataFrame from the triton request
            input_df = cudf.DataFrame({
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            })

            for name, dtype in self.input_multihots.items():
                values = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__values")))
                nnzs = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__nnzs")))
                input_df[name] = build_column(None,
                                              dtype=dtype,
                                              size=nnzs.size - 1,
                                              children=(nnzs, values))

            # use our NVTabular workflow to transform the dataframe
            output_df = nvtabular.workflow._transform_partition(
                input_df, [self.workflow.column_group])

            # convert back to a triton response
            output_tensors = []
            for name in output_df.columns:
                col = output_df[name]
                if is_list_dtype(col.dtype):
                    # convert list values to match TF dataloader
                    values = col.list.leaves.values_host.astype(
                        self.output_dtypes[name + "__values"])
                    values = values.reshape(len(values), 1)
                    output_tensors.append(Tensor(name + "__values", values))

                    offsets = col._column.offsets.values_host.astype(
                        self.output_dtypes[name + "__nnzs"])
                    nnzs = offsets[1:] - offsets[:-1]
                    nnzs = nnzs.reshape(len(nnzs), 1)
                    output_tensors.append(Tensor(name + "__nnzs", nnzs))
                else:
                    d = col.values_host.astype(self.output_dtypes[name])
                    d = d.reshape(len(d), 1)
                    output_tensors.append(Tensor(name, d))

            responses.append(InferenceResponse(output_tensors))

        return responses
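The `__values`/`__nnzs` convention above carries a ragged batch as one flat values tensor plus per-row counts; counts and offsets convert into each other by differencing and cumulative summation, as the output loop shows. A small NumPy illustration (values are made up):

import numpy as np

offsets = np.array([0, 2, 5], dtype="int64")
nnzs = offsets[1:] - offsets[:-1]                    # row lengths: [2, 3]
roundtrip = np.concatenate([[0], np.cumsum(nnzs)])   # back to [0, 2, 5]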
Example #9
    def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
        if isinstance(dtype, CategoricalDtype):
            # reattach categorical metadata, reusing this column's data as codes
            return column.build_categorical_column(
                categories=dtype.categories._values,
                codes=build_column(self.base_data, dtype=self.dtype),
                mask=self.base_mask,
                ordered=dtype.ordered,
                size=self.size,
                offset=self.offset,
                null_count=self.null_count,
            )

        return self
Example #10
File: utils.py Project: rongou/cudf
def _fillna_natwise(col):
    # Filling with np.datetime64("NaT") means the null mask can be
    # dropped: replace_nulls writes the NaT sentinel into the data at
    # every "<NA>" position, so the column is rebuilt without a mask.
    nat = cudf._lib.scalar._create_proxy_nat_scalar(col.dtype)
    result = cudf._lib.replace.replace_nulls(col, nat)
    return column.build_column(
        data=result.base_data,
        dtype=result.dtype,
        size=result.size,
        offset=result.offset,
        children=result.base_children,
    )
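A hedged sketch of the intended effect (the series below is illustrative): after `_fillna_natwise`, the rebuilt column carries no null mask, and positions that were `<NA>` now hold the NaT sentinel in the data.

s = cudf.Series(["2001-01-01", None], dtype="datetime64[ns]")
filled = _fillna_natwise(s._column)
# filled has no mask; the second position holds NaT instead of <NA>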
Example #11
    def create_multihot_col(self, offsets, data):
        """
        offsets = cudf series with offset values for list data
        data = cudf series with the list data flattened to 1-d
        """
        offs = as_column(offsets, dtype="int32")
        encoded = as_column(data)
        col = build_column(
            None,
            size=offs.size - 1,
            dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
            children=(offs, encoded),
        )
        return cudf.Series(col)
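A usage sketch (illustrative values; `obj` stands in for whatever instance defines the method): three offsets describe two rows, so the flat data becomes a two-row list Series.

offsets = cudf.Series([0, 2, 5])
data = cudf.Series([1, 2, 3, 4, 5])
col = obj.create_multihot_col(offsets, data)  # obj is hypothetical
# col: cudf.Series of lists [[1, 2], [3, 4, 5]]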
Example #12
    def transform(self, columns: ColumnNames,
                  df: DataFrameType) -> DataFrameType:
        on_cpu = isinstance(df, pd.DataFrame)
        ret = pd.DataFrame() if on_cpu else cudf.DataFrame()
        for col in columns:
            # handle CPU via normal python slicing (not very efficient)
            if on_cpu:
                ret[col] = [row[self.start:self.end] for row in df[col]]
            else:
                # grab the offsets and flat elements arrays backing the list column
                c = df[col]._column
                offsets = c.offsets.values
                elements = c.elements.values

                # figure out the size of each row after slicing start/end
                new_offsets = cp.zeros(offsets.size, dtype=offsets.dtype)
                threads = 32
                blocks = (offsets.size + threads - 1) // threads

                # calculate new row offsets after slicing
                _calculate_row_sizes[blocks, threads](self.start, self.end,
                                                      offsets, new_offsets)
                new_offsets = cp.cumsum(new_offsets).astype(offsets.dtype)

                # create a new array for the sliced elements
                new_elements = cp.zeros(new_offsets[-1].item(),
                                        dtype=elements.dtype)
                if new_elements.size:
                    _slice_rows[blocks, threads](self.start, offsets, elements,
                                                 new_offsets, new_elements)

                # build up a list column with the sliced values
                ret[col] = build_column(
                    None,
                    dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
                    size=new_offsets.size - 1,
                    children=(as_column(new_offsets), as_column(new_elements)),
                )

        return ret
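A CPU analogue of what the two CUDA kernels compute (simplified: plain NumPy, ignoring any negative start/end handling the real kernels may include): `_calculate_row_sizes` derives each row's post-slice length, and `_slice_rows` copies the kept elements.

import numpy as np

start, end = 0, 2
offsets = np.array([0, 3, 7])
elements = np.arange(7)

sizes = [min(end, offsets[i + 1] - offsets[i]) - start
         for i in range(offsets.size - 1)]
new_offsets = np.concatenate([[0], np.cumsum(sizes)])       # [0, 2, 4]
new_elements = np.concatenate(
    [elements[offsets[i] + start:offsets[i] + start + sizes[i]]
     for i in range(offsets.size - 1)])                      # [0, 1, 3, 4]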
Example #13
def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
    buf, dtype = buffer_and_dtype
    device_id = cp.asarray(cudfcol.data).device.id
    assert buf.__dlpack_device__() == (2, device_id)
    col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
                                protocol_dtype_to_cupy_dtype(dtype))
    # Check that non-null values are equal, since nulls are represented
    # by sentinel values in the buffer.
    # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But
    # currently only non-null values are compared, null positions are
    # unchecked.
    non_null_idxs = ~cudf.Series(cudfcol).isna()
    assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])

    if dtype[0] != _DtypeKind.BOOL:
        array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get()
        col_array = cp.asarray(cudfcol.data_array_view).get()
        assert_eq(
            array_from_dlpack[non_null_idxs.to_numpy()].flatten(),
            col_array[non_null_idxs.to_numpy()].flatten(),
        )
    else:
        pytest.raises(TypeError, buf.__dlpack__)
Example #14
    def as_datetime_column(self, dtype, **kwargs):
        from cudf.core.column import build_column

        return build_column(data=self.astype("int64").data,
                            dtype=dtype,
                            mask=self.mask)
Example #15
    def as_numerical(self):
        from cudf.core.column import build_column

        return build_column(data=self.data, dtype=np.int64, mask=self.mask)