Exemplo n.º 1
0
def test_dataframe_setitem_scaler_bool_inconsistency():
    """Boolean-list ``__setitem__`` with a frame value must agree with pandas.

    The same assignment is performed on a pandas frame and on the
    ``DataFrame`` under test, and the two results are compared.
    """
    mask = (True, False, True)

    # Reference result from pandas.
    expect = pd.DataFrame({"a": [1, 2, 3]})
    expect[list(mask)] = pd.DataFrame({"a": [-1, -2]})

    # Same operation on the frame under test.
    got = DataFrame({"a": [1, 2, 3]})
    got[list(mask)] = DataFrame({"a": [-1, -2]})

    assert_eq(expect, got)
Exemplo n.º 2
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a ``(rows, columns)`` tuple key against ``self._df``.

        The columns named by ``arg[1]`` are gathered first, then the rows
        named by ``arg[0]``.  A one-row result gets its index rebuilt from
        the row key, and the result is handed to ``_downcast_to_series``
        when ``_can_downcast_to_series`` says so.
        """
        from cudf.core.dataframe import Series, DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        source = self._df
        col_key = arg[1]
        multi_columns = isinstance(source.columns, MultiIndex)

        # Step 1: pick out the requested columns.
        if multi_columns:
            columns_df = source.columns._get_column_major(source, col_key)
            if isinstance(columns_df, Series):
                # The column lookup already produced a Series; done.
                return columns_df
        else:
            selected = self._get_column_selection(col_key)
            columns_df = DataFrame(index=source.index)
            for position, name in enumerate(selected):
                columns_df.insert(position, name, source[name])

        # Step 2: pick out the requested rows.
        row_key = arg[0]
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, row_key)

        row_is_slice = isinstance(row_key, slice)
        if multi_columns:
            if row_is_slice:
                start, stop, step = row_key.indices(len(columns_df))
                df = columns_df.take(arange(start, stop, step))
            else:
                df = columns_df.take(row_key)
        else:
            df = DataFrame()
            for name in columns_df.columns:
                # Wrap in Series() because .loc may hand back a scalar.
                df[name] = Series(columns_df[name].loc[row_key])
            df.columns = columns_df.columns

        # Step 3: a single-row result gets its index rebuilt from the key.
        if df.shape[0] == 1:
            if row_is_slice:
                first = row_key.start
                new_index = as_index(
                    source.index[0] if first is None else first
                )
            else:
                row_selection = column.as_column(row_key)
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    new_index = source.index.take(row_selection)
                else:
                    new_index = as_index(row_selection)
            df.index = new_index

        # Step 4: collapse to a Series when appropriate.
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Exemplo n.º 3
0
def from_dlpack(pycapsule_obj):
    """Build a cuDF object from a DLPack tensor.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The input is a PyCapsule wrapping a pointer to a DLPack tensor. The
    tensor's data is deep copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        PyCapsule object encapsulating the input DLPack tensor pointer.

    Returns
    -------
    A cuDF Series when the converted result has exactly one column,
    otherwise a cuDF DataFrame (i.e. 1D vs. 2D input tensors).
    """

    converted = libdlpack.from_dlpack(pycapsule_obj)

    # Anything other than a single column is returned as a DataFrame.
    if converted._num_columns != 1:
        return DataFrame(data=converted._data)
    return Series(converted._data[0])
Exemplo n.º 4
0
def test_series_setitem_index():
    """Assigning an indexed Series to a column must agree with pandas.

    The assigned Series carries its index in reverse order relative to the
    frame; dtypes are not compared.
    """
    new_values = [12, 11, 10]
    new_labels = [3, 2, 1]

    # Reference result from pandas.
    expect = pd.DataFrame(
        data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3]
    )
    expect["b"] = pd.Series(data=list(new_values), index=list(new_labels))

    # Same operation on the frame under test.
    got = DataFrame(data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3])
    got["b"] = Series(data=list(new_values), index=list(new_labels))

    assert_eq(expect, got, check_dtype=False)
Exemplo n.º 5
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a positional ``(rows, columns)`` tuple key on ``self._df``.

        Columns are selected with ``arg[1]`` first, then rows with
        ``arg[0]``.  The result is downcast to a Series when
        ``_can_downcast_to_series`` allows it.  An entirely empty result
        from a slice row key gets a ``RangeIndex`` built from the slice's
        start and stop (the step is not applied).
        """
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Column gather, driven by the second element of the key.
        col_key = arg[1]
        columns_df = self._get_column_selection(col_key)
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Row gather, driven by the first element of the key.
        row_key = arg[0]
        row_is_slice = isinstance(row_key, slice)

        if isinstance(columns_df.index, MultiIndex):
            if row_is_slice:
                df = columns_df[row_key]
            else:
                df = columns_df.index._get_row_major(columns_df, row_key)
            scalar_lookup = (
                len(df) == 1
                and len(columns_df) >= 1
                and not row_is_slice
                and not isinstance(col_key, slice)
            )
            if scalar_lookup:
                # Pandas yields a numpy scalar for this kind of lookup
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df

        df = DataFrame()
        for position, col in enumerate(columns_df._columns):
            # Series() guards against the column lookup returning a scalar
            df[position] = Series(col[row_key])

        df.index = as_index(columns_df.index[row_key])
        df.columns = columns_df.columns

        # Iloc Step 3:
        # A single-row result has its index rebuilt from the source frame.
        if df.shape[0] == 1:
            df.index = as_index(self._df.index[row_key])

        # Iloc Step 4:
        # Collapse to a Series when appropriate.
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        if row_is_slice and df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.core.index import RangeIndex

            start, stop, _ = row_key.indices(len(self._df))
            df._index = RangeIndex(start, stop)
        return df
Exemplo n.º 6
0
def from_dlpack(pycapsule_obj):
    """Build a cuDF object from a DLPack tensor.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The input is a PyCapsule wrapping a pointer to a DLPack tensor. The
    tensor's data is deep copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        PyCapsule object encapsulating the input DLPack tensor pointer.

    Returns
    -------
    A cuDF Series when exactly one column is produced, otherwise a cuDF
    DataFrame whose columns are labelled by position (i.e. 1D vs. 2D input
    tensors).

    Raises
    ------
    ValueError
        If the underlying conversion reports an empty dataset.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        # Only the "empty dataset" error is translated; the rest propagate.
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size"
        )

    cols = []
    for idx, valid in enumerate(valids):
        # A falsy validity entry means the column carries no mask.
        mask = Buffer(valid) if valid else None
        data = Buffer(res[idx])
        cols.append(
            column.build_column(data, dtype=res[idx].dtype, mask=mask)
        )

    if len(cols) == 1:
        return Series(cols[0])

    df = DataFrame()
    for position, col in enumerate(cols):
        df[position] = col
    return df
Exemplo n.º 7
0
def test_cummin(dtype, nelem):
    """``cummin`` on a Series and on a frame column must agree with pandas."""
    # int8 input is generated with low=-2, high=2 to keep data in range
    rand_kwargs = {"low": -2, "high": 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared to fewer decimal places than the other dtypes
    decimal = 4 if dtype == np.float32 else 6

    def check(got, expect):
        np.testing.assert_array_almost_equal(
            got.to_array(), expect, decimal=decimal
        )

    # standalone series
    gs = Series(data)
    ps = pd.Series(data)
    check(gs.cummin(), ps.cummin())

    # series living inside a dataframe (named series)
    gdf = DataFrame()
    gdf["a"] = Series(data)
    pdf = pd.DataFrame()
    pdf["a"] = pd.Series(data)
    check(gdf.a.cummin(), pdf.a.cummin())
Exemplo n.º 8
0
def test_dataframe_setitem_scaler_keyerror():
    """Scalar assignment through a list naming an absent column raises."""
    gdf = DataFrame({"a": [1, 2, 3]})
    missing_columns = ["x"]
    with pytest.raises(KeyError):
        gdf[missing_columns] = 0
Exemplo n.º 9
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a label-based key against ``self._df``.

        ``arg`` is normally a ``(rows, columns)`` tuple.  A non-tuple ``arg``
        (including a cudf or pandas ``MultiIndex``) is only handled on the
        path where the frame's index is a ``MultiIndex``.

        Returns
        -------
        The selected rows/columns as a DataFrame, or whatever
        ``_downcast_to_series`` returns when ``_can_downcast_to_series``
        approves the result.

        Raises
        ------
        KeyError
            If the label lookup (inner join) finds none of the requested
            row labels.
        """
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        # For a tuple key the columns come from arg[1] and the original
        # index is re-attached; otherwise the whole frame is used.
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            # Every path in this branch returns directly; steps 3 and 4
            # below are not applied to MultiIndex-indexed frames.
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            # NOTE(review): arg[0] is indexed unconditionally from here on,
            # so this branch presumably expects `arg` to be a tuple even
            # though step 1 tolerates a non-tuple -- confirm against callers.
            if isinstance(arg[0], slice):
                # The helper yields either a slice or a boolean mask.
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is possibility of having duplicates.
                    # Join would get all the duplicates. So, converting it to
                    # an array kind.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    # Empty row selection: empty frame, index preserved.
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    # Label lookup via an inner join: a uniquely named
                    # helper column records each requested label's position
                    # so the joined rows can be sorted back into request
                    # order before the helper column is dropped.
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # as join is not assigning any names to index,
                    # update it over here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        # A single-row result has its index rebuilt from the row key.
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    # Open-ended slice: use the frame's first index label.
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Exemplo n.º 10
0
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a cudf ``DataFrame``.

    Parameters
    ----------
    tdf : TDataFrame
        Result handle; this function reads its ``df_handle``, ``sm_handle``
        and ``sm_size`` attributes.

    Returns
    -------
    df : cudf.core.dataframe.DataFrame
        Frame populated from the IPC buffer, with ``set_tdf`` and
        ``get_tdf`` bound as instance methods and ``set_tdf(tdf)`` already
        called on it.
    """

    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    # Open the GPU IPC buffer referenced by the result's device handle.
    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    # The Arrow schema is read from a separate shared-memory segment.
    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to
    # deserialize on the C++ side is not
    # exposed on the pyarrow side, so we need to
    # handle this on our own.
    dict_memo = {}

    try:
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []

        # For each dictionary-typed field, read one batch as its dictionary
        # values and retype the field to the dictionary's index type.
        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))

        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded
        # columns
        pass

    # Wrap the IPC buffer as a 1-D device array of bytes for the reader.
    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )

    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    # Bind the module-level set_tdf/get_tdf functions to this instance only.
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            # Recombine the indices with the dictionary values saved above.
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    # The return value is not used afterwards (hence the noqa).
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa

    return df
Exemplo n.º 11
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a positional ``(rows, columns)`` tuple key on ``self._df``.

        Columns are gathered with ``arg[1]`` first, then rows with
        ``arg[0]``.  Depending on the key and on whether the columns or the
        index are a ``MultiIndex``, the result is a DataFrame, a Series, or
        a single element.
        """
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.column import column_empty
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        # (`columns` is only consumed by the non-MultiIndex branch below.)
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (len(columns_df) == 0 and len(columns_df.columns) == 0
                    and not isinstance(arg[0], slice)):
                # Nothing selected and a non-slice row key: return an empty
                # float64 Series named after the row key.
                result = Series(column_empty(0, dtype="float64"), name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                # Build a new frame column by column, keeping the index.
                columns_df = DataFrame()
                for i, col in enumerate(columns):
                    columns_df.insert(i, col, self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # Every path in this branch returns directly.
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    # Open-ended slice starts at position 0.
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (isinstance(arg[0], slice)
                                        or isinstance(arg[1], slice)):
                    # Neither key is a slice: return the first element of
                    # the first column.
                    return list(df._data.values())[0][0]
                elif df.shape[1] > 1:
                    # Several columns: downcast, indexed by the columns.
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    if len(df._data) == 0:
                        # No columns at all: empty float64 Series.
                        return Series(
                            column_empty(0, dtype="float64"),
                            index=df.columns,
                            name=arg[0],
                        )
                    else:
                        # First column, re-indexed by the column labels and
                        # named after the row key.
                        result_series = df[df.columns[0]]
                        result_series.index = df.columns
                        result_series.name = arg[0]
                        return result_series
                else:
                    return df[df.columns[0]]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.core.index import RangeIndex

            # NOTE(review): arg[0] is used as a slice here without an
            # isinstance check; presumably only slice keys reach this point
            # with a fully empty result -- TODO confirm.
            # The slice's step is computed but not applied to the index.
            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Exemplo n.º 12
0
def where(
    frame: Union[Series, Index, DataFrame],
    cond: Any,
    other: Any = None,
    inplace: bool = False,
) -> Optional[Union[Frame]]:
    """
    Replace values where the condition is False.

    Parameters
    ----------
    frame : Series, Index or DataFrame
        The object whose values are kept or replaced.
    cond : bool Series/DataFrame, array-like
        Where cond is True, keep the original value.
        Where False, replace with corresponding value from other.
        Callables are not supported.
    other: scalar, list of scalars, Series/DataFrame
        Entries where cond is False are replaced with
        corresponding value from other. Callables are not
        supported. Default is None.

        DataFrame expects only Scalar or array like with scalars or
        dataframe with same dimension as frame.

        Series expects only scalar or series like with same length
    inplace : bool, default False
        Whether to perform the operation in place on the data.

    Returns
    -------
    Same type as caller
        The value returned by ``frame._mimic_inplace``; presumably ``None``
        when ``inplace=True`` (the return annotation is Optional) --
        confirm against ``_mimic_inplace``.

    Raises
    ------
    ValueError
        If ``cond`` does not match the shape/length of ``frame``, or if the
        number of replacement columns differs from the number of columns
        in a DataFrame ``frame``.
    NotImplementedError
        If ``frame`` is not a DataFrame but ``other`` is a DataFrame.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
    >>> df.where(df % 2 == 0, [-1, -1])
       A  B
    0 -1 -1
    1  4 -1
    2 -1  8

    >>> ser = cudf.Series([4, 3, 2, 1, 0])
    >>> ser.where(ser > 2, 10)
    0     4
    1     3
    2    10
    3    10
    4    10
    dtype: int64
    >>> ser.where(ser > 2)
    0       4
    1       3
    2    <NA>
    3    <NA>
    4    <NA>
    dtype: int64
    """

    if isinstance(frame, DataFrame):
        # Normalize `cond` into a DataFrame.
        if hasattr(cond, "__cuda_array_interface__"):
            cond = DataFrame(cond,
                             columns=frame._column_names,
                             index=frame.index)
        elif (hasattr(cond, "__array_interface__")
              and cond.__array_interface__["shape"] != frame.shape):
            raise ValueError("conditional must be same shape as self")
        elif not isinstance(cond, DataFrame):
            cond = frame.from_pandas(pd.DataFrame(cond))

        common_cols = set(frame._column_names).intersection(
            set(cond._column_names))
        if len(common_cols) > 0:
            # If `frame` and `cond` are having unequal index,
            # then re-index `cond`.
            if not frame.index.equals(cond.index):
                cond = cond.reindex(frame.index)
        else:
            if cond.shape != frame.shape:
                raise ValueError(
                    """Array conditional must be same shape as self""")
            # Setting `frame` column names to `cond`
            # as `cond` has no column names.
            cond.columns = frame.columns

        (
            source_df,
            others,
        ) = _normalize_columns_and_scalars_type(frame, other)
        if isinstance(other, Frame):
            # Work with the replacement's columns, addressed by position.
            others = others._data.columns

        out_df = DataFrame(index=frame.index)
        if len(frame._columns) != len(others):
            raise ValueError(
                """Replacement list length or number of dataframe columns
                should be equal to Number of columns of dataframe""")
        for i, column_name in enumerate(frame._column_names):
            input_col = source_df._data[column_name]
            other_column = others[i]
            if column_name in cond._data:
                if isinstance(input_col, cudf.core.column.CategoricalColumn):
                    # Categorical columns are processed via their codes;
                    # the replacement is converted to codes as well.
                    if cudf.utils.dtypes.is_scalar(other_column):
                        try:
                            other_column = input_col._encode(other_column)
                        except ValueError:
                            # When other is not present in categories,
                            # fill with Null.
                            other_column = None
                        other_column = cudf.Scalar(other_column,
                                                   dtype=input_col.codes.dtype)
                    elif isinstance(other_column,
                                    cudf.core.column.CategoricalColumn):
                        other_column = other_column.codes
                    input_col = input_col.codes

                result = cudf._lib.copying.copy_if_else(
                    input_col, other_column, cond._data[column_name])

                if isinstance(
                        frame._data[column_name],
                        cudf.core.column.CategoricalColumn,
                ):
                    # Rebuild a categorical column from the resulting codes
                    # using the original categories and ordering.
                    result = cudf.core.column.build_categorical_column(
                        categories=frame._data[column_name].categories,
                        codes=cudf.core.column.as_column(result.base_data,
                                                         dtype=result.dtype),
                        mask=result.base_mask,
                        size=result.size,
                        offset=result.offset,
                        ordered=frame._data[column_name].ordered,
                    )
            else:
                # `cond` has no such column: the column gets an all-null
                # mask.
                out_mask = cudf._lib.null_mask.create_null_mask(
                    len(input_col),
                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
                )
                result = input_col.set_mask(out_mask)
            out_df[column_name] = frame[column_name].__class__(result)

        return frame._mimic_inplace(out_df, inplace=inplace)

    else:
        # Non-DataFrame path (Series or Index per the signature).
        if isinstance(other, DataFrame):
            raise NotImplementedError(
                "cannot align with a higher dimensional Frame")
        input_col = frame._data[frame.name]
        cond = cudf.core.column.as_column(cond)
        if len(cond) != len(frame):
            raise ValueError(
                """Array conditional must be same shape as self""")

        # `input_col` is rebound here to the normalized column.
        (
            input_col,
            other,
        ) = _normalize_columns_and_scalars_type(frame, other, inplace)

        if isinstance(input_col, cudf.core.column.CategoricalColumn):
            # Categorical columns are processed via their codes; the
            # replacement is converted to codes as well.
            if cudf.utils.dtypes.is_scalar(other):
                try:
                    other = input_col._encode(other)
                except ValueError:
                    # When other is not present in categories,
                    # fill with Null.
                    other = None
                other = cudf.Scalar(other, dtype=input_col.codes.dtype)
            elif isinstance(other, cudf.core.column.CategoricalColumn):
                other = other.codes

            input_col = input_col.codes

        result = cudf._lib.copying.copy_if_else(input_col, other, cond)

        if isinstance(frame._data[frame.name],
                      cudf.core.column.CategoricalColumn):
            # Rebuild a categorical column from the resulting codes using
            # the original categories and ordering.
            result = cudf.core.column.build_categorical_column(
                categories=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).categories,
                codes=cudf.core.column.as_column(result.base_data,
                                                 dtype=result.dtype),
                mask=result.base_mask,
                size=result.size,
                offset=result.offset,
                ordered=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).ordered,
            )

        # Wrap the result column back into the caller's container type.
        if isinstance(frame, Index):
            result = Index(result, name=frame.name)
        else:
            result = frame._copy_construct(data=result)

        return frame._mimic_inplace(result, inplace=inplace)