示例#1
0
文件: indexing.py 项目: zeichuan/cudf
 def _can_downcast_to_series(self, df, arg):
     """
     Decide whether the result of a loc/iloc operation should be
     returned as a Series instead of a DataFrame.

     A Series input is never downcast. A single-row result is downcast
     when its column dtypes are all numeric or all identical; a
     single-column result is downcast unless the column selector is a
     slice paired with a non-scalar row selector.
     """
     if isinstance(df, cudf.Series):
         return False

     nrows, ncols = df.shape

     if nrows == 1:
         # a sliced row selector only collapses when a single column
         # was requested
         if isinstance(arg[0], slice) and not is_scalar(arg[1]):
             return False
         column_dtypes = df.dtypes.values.tolist()
         first_dtype = column_dtypes[0]
         homogeneous = column_dtypes.count(first_dtype) == len(column_dtypes)
         numeric_only = all(
             map(pd.api.types.is_numeric_dtype, column_dtypes)
         )
         if numeric_only or homogeneous:
             return True

     if ncols != 1:
         return False

     # a sliced column selector only collapses when a single row
     # was requested
     return not (isinstance(arg[1], slice) and not is_scalar(arg[0]))
示例#2
0
文件: indexing.py 项目: zeichuan/cudf
    def _loc_to_iloc(self, arg):
        """
        Translate a label-based ``.loc`` argument into the equivalent
        positional argument.

        Parameters
        ----------
        arg : scalar, slice, or array-like
            Array-likes (list, ndarray, pandas Series, range, Index,
            DeviceNDArray) are first wrapped in a Series.

        Returns
        -------
        The boolean Series itself for a boolean mask, the result of
        ``indices_from_labels`` for any other Series, the first element
        of ``find_label_range(arg, None)`` for a scalar, or a slice of
        positions for a slice of labels.

        Raises
        ------
        NotImplementedError
            If ``arg`` is none of the supported label types.
        """
        from cudf.dataframe.series import Series
        from cudf.dataframe.index import Index

        if isinstance(
            arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
        ):
            if len(arg) == 0:
                # give the empty selection an explicit int32 dtype
                arg = Series(np.array([], dtype="int32"))
            else:
                arg = Series(arg)

        if isinstance(arg, Series):
            # ``np.bool`` was only an alias of the builtin ``bool`` and has
            # been removed from NumPy (1.24); comparing against ``np.bool_``
            # matches the same dtype without touching the removed alias.
            if arg.dtype == np.bool_:
                return arg
            return indices_from_labels(self._sr, arg)

        if is_scalar(arg):
            return self._sr.index.find_label_range(arg, None)[0]

        if isinstance(arg, slice):
            start_index, stop_index = self._sr.index.find_label_range(
                arg.start, arg.stop
            )
            return slice(start_index, stop_index, arg.step)

        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )
示例#3
0
文件: indexing.py 项目: zeichuan/cudf
    def _downcast_to_series(self, df, arg):
        """
        Collapse a single-row or single-column DataFrame into a Series,
        following Pandas indexing rules.

        Raises ValueError when the frame has neither exactly one row nor
        exactly one column.
        """
        nrows, ncols = df.shape
        single_row = nrows == 1
        single_col = ncols == 1

        if not (single_row or single_col):
            raise ValueError("Cannot downcast DataFrame selection to Series")

        if single_row and single_col:
            # 1x1 frame: the row selector decides which way to collapse
            take_column = not is_scalar(arg[0])
        else:
            take_column = single_col

        if take_column:
            return df[df.columns[0]]

        # a row becomes a Series by normalizing dtypes and transposing
        transposed = _normalize_dtypes(df).T
        return transposed[transposed.columns[0]]
示例#4
0
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype

        ``key`` may be a slice (stride 1 only) or anything accepted by
        ``columnops.as_column``; a boolean key is treated as a mask and
        must have the same length as this column.

        A scalar ``value`` is broadcast to the number of selected
        elements; ``None`` produces a masked empty column of that size.

        Raises
        ------
        NotImplementedError
            If a slice key has a stride other than 1.
        ValueError
            If a boolean mask differs in length from this column, or if
            ``value`` does not have as many elements as the selection.
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        # Work out how many elements the key selects.
        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            # NOTE(review): abs() yields a positive count even when the
            # resolved stop precedes the start, a case in which a plain
            # Python slice selects nothing -- confirm this is intended.
            nelem = abs(key_stop - key_start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                # turn the mask into positions by indexing an arange
                # column with it
                key = columnops.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        # Broadcast a scalar value to the size of the selection.
        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                # fill a buffer with the encoded form of the scalar
                # NOTE(review): the buffer is int8 and ``ordered`` is
                # hard-coded to False -- confirm both match this column.
                data = rmm.device_array(nelem, dtype="int8")
                fill_value(data, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(data),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        # Coerce whatever was passed to a column of this column's dtype.
        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)

        # Slice keys use a range copy; every other key uses a scatter.
        if isinstance(key, slice):
            out = cpp_copying.apply_copy_range(self, value, key_start,
                                               key_stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        # Adopt the data and mask of the result, then refresh the null count.
        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
示例#5
0
文件: indexing.py 项目: zeichuan/cudf
 def _get_column_selection(self, arg):
     """
     Resolve ``arg`` against the columns of the wrapped DataFrame.

     MultiIndex columns delegate to ``_get_column_major``. Otherwise
     ``arg`` indexes the columns directly, and a scalar ``arg`` yields
     a one-element list.
     """
     columns = self._df.columns
     if isinstance(columns, cudf.MultiIndex):
         return columns._get_column_major(self._df, arg)

     wrap_in_list = is_scalar(arg)
     picked = columns[arg]
     return [picked] if wrap_in_list else picked
示例#6
0
    def fillna(self, fill_value, inplace=False):
        """
        Replace nulls in this column with ``fill_value``.

        A scalar ``fill_value`` is converted to ``np.datetime64`` in this
        column's time unit; anything else is converted with
        ``columnops.as_column`` (``nan_as_null=False``). The replaced
        result has its mask removed and is handed to ``_mimic_inplace``
        together with ``inplace``.
        """
        if not is_scalar(fill_value):
            fill_value = columnops.as_column(fill_value, nan_as_null=False)
        else:
            fill_value = np.datetime64(fill_value, self.time_unit)

        filled = cpp_replace.apply_replace_nulls(self, fill_value)
        unmasked = filled.replace(mask=None)
        return self._mimic_inplace(unmasked, inplace)
示例#7
0
文件: groupby.py 项目: zeichuan/cudf
 def __getitem__(self, arg):
     """
     Select one or more columns from the grouped object.

     A scalar ``arg`` is forwarded to ``__getattr__``. Any other ``arg``
     is turned into a list of column names, and a new groupby is built
     over that column subset using the existing keys as named Series.
     """
     if is_scalar(arg):
         return self.__getattr__(arg)

     arg = list(arg)
     grouping = self._groupby
     by_list = [
         cudf.Series(column, name=name)
         for name, column in zip(grouping.key_names, grouping.key_columns)
     ]
     subset = self._df[arg]
     return subset.groupby(
         by_list,
         as_index=self._groupby.as_index,
         sort=self._groupby.sort,
     )
示例#8
0
文件: groupby.py 项目: zeichuan/cudf
 def key_from_by(self, by):
     """
     Return the (key_name, key_column) pair for one *by* argument.

     A scalar *by* names a column of ``self.obj``. Anything else is
     wrapped in a Series, which must be as long as ``self.obj``;
     otherwise NotImplementedError is raised.
     """
     if is_scalar(by):
         return by, self.obj[by]._column

     by_series = cudf.Series(by)
     if len(by_series) != len(self.obj):
         raise NotImplementedError(
             "cuDF does not support arbitrary series index lengths "
             "for groupby")
     return by_series.name, by_series._column
示例#9
0
文件: indexing.py 项目: zeichuan/cudf
    def _get_column_selection(self, arg):
        """
        Resolve a label-based column selector into column names.

        A scalar becomes a one-element list. A slice is expanded to the
        columns from its start label through its stop label (inclusive),
        with missing bounds defaulting to the first and last column.
        Any other selector is returned unchanged.
        """
        if is_scalar(arg):
            return [arg]

        if not isinstance(arg, slice):
            return arg

        columns = self._df.columns
        first_label = arg.start
        if first_label is None:
            first_label = columns[0]
        last_label = arg.stop
        if last_label is None:
            last_label = columns[-1]

        selected = []
        collecting = False
        for name in columns:
            # start collecting at the first label and keep going
            collecting = collecting or name == first_label
            if collecting:
                selected.append(name)
            # the stop label ends the scan whether or not collection began
            if name == last_label:
                break
        return selected