# NOTE(review): the three functions below that take ``self`` appear to be
# methods of a column class whose header lies outside this view (presumably
# the ``DatetimeColumn`` registered at the end of this block -- confirm).
# Re-indent them under that class when restoring the file layout.
def unordered_compare(self, cmpop, rhs):
    """Compare this column against ``rhs`` with an unordered comparison.

    Parameters
    ----------
    cmpop : key of ``_unordered_impl``
        Selects the low-level operator passed on to ``binop``.
    rhs : column
        Right-hand operand.

    Returns
    -------
    The column produced by ``binop`` with a boolean output dtype.
    """
    # ``np.bool_`` is the NumPy boolean scalar type; the bare ``np.bool``
    # alias was deprecated in NumPy 1.20 and removed in 1.24.
    return binop(self, rhs, op=_unordered_impl[cmpop], out_dtype=np.bool_)


def to_pandas(self, index):
    """Return the column as a ``pandas.Series`` labelled by ``index``.

    The values come from ``self.to_array()`` cast to ``self.dtype``.
    """
    return pd.Series(self.to_array().astype(self.dtype), index=index)


def to_arrow(self):
    """Return the column as a ``pyarrow.Array`` built from host buffers.

    The null mask (when ``self.has_null_mask`` is true) and the data are
    copied to the host; the data is reinterpreted as ``int64`` before being
    wrapped in an Arrow buffer.
    """
    mask = None
    if self.has_null_mask:
        mask = pa.py_buffer(self.nullmask.mem.copy_to_host())
    data = pa.py_buffer(self.data.mem.copy_to_host().view('int64'))
    pa_dtype = _gdf.np_to_pa_dtype(self.dtype)
    return pa.Array.from_buffers(
        type=pa_dtype,
        length=len(self),
        buffers=[mask, data],
        null_count=self.null_count,
    )


def binop(lhs, rhs, op, out_dtype):
    """Apply the binary operator ``op`` to ``lhs`` and ``rhs``.

    Parameters
    ----------
    lhs, rhs : column
        Operands; the output is masked when either has a null mask.
    op
        Operator identifier forwarded to ``_gdf.apply_binaryop``.
    out_dtype
        dtype of the freshly allocated output column.

    Returns
    -------
    The output column with its ``null_count`` set to the value returned by
    ``_gdf.apply_binaryop``.
    """
    nvtx_range_push("PYGDF_BINARY_OP", "orange")
    try:
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(lhs, dtype=out_dtype, masked=masked)
        null_count = _gdf.apply_binaryop(op, lhs, rhs, out)
        return out.replace(null_count=null_count)
    finally:
        # Always close the profiling range, even when the allocation or the
        # operator call raises, so push/pop stay balanced.
        nvtx_range_pop()


register_distributed_serializer(DatetimeColumn)
Returns ------- result : subclass of Index - CategoricalIndex for Categorical input. - DatetimeIndex for Datetime input. - GenericIndex for all other inputs. """ # This function should probably be moved to Index.__new__ if isinstance(arbitrary, Index): return arbitrary elif isinstance(arbitrary, NumericalColumn): return GenericIndex(arbitrary, name=name) elif isinstance(arbitrary, DatetimeColumn): return DatetimeIndex(arbitrary, name=name) elif isinstance(arbitrary, CategoricalColumn): return CategoricalIndex(arbitrary, name=name) else: name = None if hasattr(arbitrary, 'name'): name = arbitrary.name if len(arbitrary) == 0: return RangeIndex(0, 0, name=name) return as_index(columnops.as_column(arbitrary), name=name) register_distributed_serializer(RangeIndex) register_distributed_serializer(GenericIndex) register_distributed_serializer(DatetimeIndex) register_distributed_serializer(CategoricalIndex)
Whether to use the list of quantiles as index. Returns ------- DataFrame """ if not quant_index: return Series(self._column.quantile(q, interpolation, exact)) else: return Series(self._column.quantile(q, interpolation, exact), index=as_index(np.asarray(q))) register_distributed_serializer(Series) truediv_int_dtype_corrections = { 'int64': 'float64', 'int32': 'float32', 'int': 'float', } class DatetimeProperties(object): def __init__(self, series): self.series = series @property def year(self): return self.get_dt_field('year')
return out def is_contiguous(self): return self.mem.is_c_contiguous() class BufferSentryError(ValueError): pass class _BufferSentry(object): def __init__(self, buf): self._buf = buf def dtype(self, dtype): if self._buf.dtype != dtype: raise BufferSentryError('dtype mismatch') return self def ndim(self, ndim): if self._buf.ndim != ndim: raise BufferSentryError('ndim mismatch') return self def contig(self): if not self._buf.is_c_contiguous(): raise BufferSentryError('non contiguous') register_distributed_serializer(Buffer)
def numeric_column_unaryop(operand, op, out_dtype):
    """Apply the unary operator ``op`` to ``operand``.

    The output column is allocated with
    ``columnops.column_empty_like_same_mask`` using ``out_dtype``, filled by
    ``_gdf.apply_unaryop`` and returned viewed as a ``NumericalColumn``.
    """
    result = columnops.column_empty_like_same_mask(operand, dtype=out_dtype)
    _gdf.apply_unaryop(op, operand, result)
    as_numerical = result.view(NumericalColumn, dtype=out_dtype)
    return as_numerical


def numeric_column_compare(lhs, rhs, op):
    """Run comparison ``op`` on two columns, producing a boolean column."""
    bool_dtype = np.bool_
    return numeric_column_binop(lhs, rhs, op, out_dtype=bool_dtype)


def numeric_normalize_types(*args):
    """Cast every argument to their common dtype.

    The common dtype is the one ``np.result_type`` computes from the
    arguments' dtypes.  Returns a list with one cast result per argument, in
    the order given.
    """
    source_dtypes = [column.dtype for column in args]
    common = np.result_type(*source_dtypes)
    return [column.astype(common) for column in args]


def column_hash_values(column0, *other_columns):
    """Hash the values of one or more columns together.

    Returns a new ``NumericalColumn`` of dtype int32 with the same length as
    ``column0``, filled by ``_gdf.hash_columns``.
    """
    to_hash = [column0]
    to_hash.extend(other_columns)

    device_mem = rmm.device_array(len(column0), dtype=np.int32)
    out_buffer = Buffer(device_mem)
    hashed = NumericalColumn(data=out_buffer, dtype=out_buffer.dtype)

    _gdf.hash_columns(to_hash, hashed)
    return hashed


register_distributed_serializer(NumericalColumn)
Convert from a Pandas MultiIndex Raises ------ TypeError for invalid input type. Examples -------- >>> import cudf >>> import pandas as pd >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']], codes=[[0, 1], [1, ]]) >>> cudf.from_pandas(pmi) MultiIndex( ... ) """ if not isinstance(multiindex, pd.MultiIndex): raise TypeError('not a pandas.MultiIndex') if hasattr(multiindex, 'codes'): mi = cls(levels=multiindex.levels, codes=multiindex.codes, names=multiindex.names) else: mi = cls(levels=multiindex.levels, codes=multiindex.labels, names=multiindex.names) return mi register_distributed_serializer(MultiIndex)
outcols={'out1': np.int32, 'out2': np.int32}, # threads per block tpb=8) print(result) Output: .. code-block:: python key val out1 out2 0 0 0 0 0 1 0 1 0 1 2 1 2 2 3 3 1 3 3 4 4 2 4 8 6 5 2 5 10 7 6 2 6 12 8 """ if not callable(function): raise TypeError("type {!r} is not callable", type(function)) df, segs = self.as_df() kwargs.update({'chunks': segs}) return df.apply_chunks(function, **kwargs) register_distributed_serializer(Groupby)
return joined_index, indexers else: return joined_index def pandas_categorical_as_column(categorical, codes=None): """Creates a CategoricalColumn from a pandas.Categorical If ``codes`` is defined, use it instead of ``categorical.codes`` """ # TODO fix mutability issue in numba to avoid the .copy() codes = (categorical.codes.copy() if codes is None else codes) # TODO pending pandas to be improved # https://github.com/pandas-dev/pandas/issues/14711 # https://github.com/pandas-dev/pandas/pull/16015 valid_codes = codes != -1 buf = Buffer(codes) params = dict(data=buf, categories=categorical.categories, ordered=categorical.ordered) if not np.all(valid_codes): mask = cudautils.compact_mask_bytes(valid_codes) nnz = np.count_nonzero(valid_codes) null_count = codes.size - nnz params.update(dict(mask=Buffer(mask), null_count=null_count)) return CategoricalColumn(**params) register_distributed_serializer(CategoricalColumn)
------- begin, end : 2-tuple of int The starting index and the ending index. The *last* value occurs at ``end - 1`` position. """ col = self._values begin, end = None, None if first is not None: begin = col.find_first_value(first) if last is not None: end = col.find_last_value(last) end += 1 return begin, end register_distributed_serializer(RangeIndex) register_distributed_serializer(GenericIndex) class DatetimeIndex(GenericIndex): # TODO this constructor should take a timezone or something to be # consistent with pandas def __new__(self, values, name=None): # we should be more strict on what we accept here but # we'd have to go and figure out all the semantics around # pandas dtindex creation first which. For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream if isinstance(values, np.ndarray) and values.dtype.kind == 'M': values = DatetimeColumn.from_numpy(values) elif isinstance(values, pd.DatetimeIndex):