def _can_downcast_to_series(self, df, arg):
    """
    This method encapsulates the logic used to determine whether or
    not the result of a loc/iloc operation should be "downcasted"
    from a DataFrame to a Series
    """
    from cudf.core.column import as_column

    if isinstance(df, cudf.Series):
        return False
    nrows, ncols = df.shape
    if nrows == 1:
        if type(arg[0]) is slice:
            if not is_scalar(arg[1]):
                return False
        else:
            # row selection using boolean indexing - never downcasts
            if pd.api.types.is_bool_dtype(as_column(arg[0]).dtype):
                return False
        dtypes = df.dtypes.values.tolist()
        all_numeric = all(
            [pd.api.types.is_numeric_dtype(t) for t in dtypes]
        )
        if all_numeric:
            return True
    if ncols == 1:
        if type(arg[1]) is slice:
            if not is_scalar(arg[0]):
                return False
        if isinstance(arg[1], tuple):
            # Multiindex indexing with a slice
            if any(isinstance(v, slice) for v in arg):
                return False
        return True
    return False
def _downcast_to_series(self, df, arg):
    """
    "Downcast" from a DataFrame to a Series
    based on Pandas indexing rules
    """
    nrows, ncols = df.shape
    # determine the axis along which the Series is taken:
    if nrows == 1 and ncols == 1:
        if is_scalar(arg[0]) and is_scalar(arg[1]):
            return df[df.columns[0]][0]
        elif not is_scalar(arg[0]):
            axis = 1
        else:
            axis = 0
    elif nrows == 1:
        axis = 0
    elif ncols == 1:
        axis = 1
    else:
        raise ValueError("Cannot downcast DataFrame selection to Series")

    # take series along the axis:
    if axis == 1:
        return df[df._data.names[0]]
    else:
        df = _normalize_dtypes(df)
        sr = df.T
        return sr[sr._data.names[0]]
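# Hedged usage sketch (not part of the original source; the helper name is
# hypothetical and a working cudf install is assumed). The two methods above
# implement the pandas rule that a 1-row or 1-column selection collapses to
# a Series, while slice-based selections stay DataFrames.
def _example_downcast_sketch():
    import cudf

    df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})
    assert isinstance(df.iloc[0], cudf.Series)  # single row -> Series
    assert isinstance(df.loc[:, "a"], cudf.Series)  # single column -> Series
    assert isinstance(df.iloc[0:1, :], cudf.DataFrame)  # slice -> DataFrame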
def __setitem__(self, key, value):
    try:
        key = self._loc_to_iloc(key)
    except KeyError as e:
        if (
            is_scalar(key)
            and not isinstance(self._sr.index, cudf.MultiIndex)
            and is_scalar(value)
        ):
            _append_new_row_inplace(self._sr.index._values, key)
            _append_new_row_inplace(self._sr._column, value)
            return
        else:
            raise e

    if isinstance(value, (pd.Series, cudf.Series)):
        value = cudf.Series(value)
        value = value._align_to_index(self._sr.index, how="right")
    self._sr.iloc[key] = value
def _loc_to_iloc(self, arg):
    from cudf.core.index import Index
    from cudf.core.series import Series

    if isinstance(
        arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
    ):
        if len(arg) == 0:
            arg = Series(np.array([], dtype="int32"))
        else:
            arg = Series(arg)
    if isinstance(arg, Series):
        # np.bool is a deprecated alias of the builtin; use bool/np.bool_
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            return indices_from_labels(self._sr, arg)
    elif is_scalar(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return found_index
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop
        )
        return slice(start_index, stop_index, arg.step)
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )
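# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): .loc labels resolve to integer positions before delegating to
# .iloc, and a label slice includes its stop label.
def _example_series_loc():
    import cudf

    sr = cudf.Series([10, 20, 30], index=["a", "b", "c"])
    assert sr.loc["b"] == 20  # label "b" resolves to position 1
    assert len(sr.loc["a":"b"]) == 2  # label slice includes the stop label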
def __setitem__(self, key, value):
    from cudf.core.column import column

    if isinstance(key, tuple):
        key = list(key)

    # coerce value into a scalar or column
    if is_scalar(value):
        value = to_cudf_compatible_scalar(value)
    elif not (
        isinstance(value, (list, dict))
        and isinstance(
            self._sr._column.dtype, (cudf.ListDtype, cudf.StructDtype)
        )
    ):
        value = column.as_column(value)

    if (
        not isinstance(
            self._sr._column.dtype,
            (cudf.Decimal64Dtype, cudf.CategoricalDtype),
        )
        and hasattr(value, "dtype")
        and _is_non_decimal_numeric_dtype(value.dtype)
    ):
        # normalize types if necessary:
        if not is_integer(key):
            to_dtype = np.result_type(value.dtype, self._sr._column.dtype)
            value = value.astype(to_dtype)
            self._sr._column._mimic_inplace(
                self._sr._column.astype(to_dtype), inplace=True
            )

    self._sr._column[key] = value
def fillna(self, fill_value):
    col = self
    if is_scalar(fill_value):
        if isinstance(fill_value, np.timedelta64):
            dtype = determine_out_dtype(self.dtype, fill_value.dtype)
            fill_value = fill_value.astype(dtype)
            col = col.astype(dtype)
        elif not isinstance(fill_value, Scalar):
            fill_value = np.timedelta64(fill_value)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(col, fill_value)
    if isinstance(fill_value, np.timedelta64) and np.isnat(fill_value):
        # If the value we are filling is np.timedelta64("NAT"),
        # we keep the same mask as the current column. However,
        # wherever the column contains "<NA>", the corresponding
        # locations in base_data will contain min(int64) values.
        return column.build_column(
            data=result.base_data,
            dtype=result.dtype,
            mask=self.base_mask,
            size=result.size,
            offset=result.offset,
            children=result.base_children,
        )
    return result
def __setitem__(self, key, value):
    from cudf.core.column import column

    if isinstance(key, tuple):
        key = list(key)

    # coerce value into a scalar or column
    if is_scalar(value):
        value = to_cudf_compatible_scalar(value)
    else:
        value = column.as_column(value)

    if (
        not is_categorical_dtype(self._sr._column.dtype)
        and hasattr(value, "dtype")
        and pd.api.types.is_numeric_dtype(value.dtype)
    ):
        # normalize types if necessary:
        if not pd.api.types.is_integer(key):
            to_dtype = np.result_type(value.dtype, self._sr._column.dtype)
            value = value.astype(to_dtype)
            self._sr._column._mimic_inplace(
                self._sr._column.astype(to_dtype), inplace=True
            )

    self._sr._column[key] = value
def _loc_to_iloc(self, arg):
    from cudf.core.column import column
    from cudf.core.series import Series

    if is_scalar(arg):
        try:
            found_index = self._sr.index._values.find_first_value(
                arg, closest=False
            )
            return found_index
        except (TypeError, KeyError, IndexError, ValueError):
            raise KeyError("label scalar is out of bound")
    elif isinstance(arg, slice):
        return get_label_range_or_mask(
            self._sr.index, arg.start, arg.stop, arg.step
        )
    elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
        if isinstance(arg, pd.MultiIndex):
            arg = cudf.MultiIndex.from_pandas(arg)
        return indices_from_labels(self._sr, arg)
    else:
        arg = Series(column.as_column(arg))
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            indices = indices_from_labels(self._sr, arg)
            if indices.null_count > 0:
                raise KeyError("label scalar is out of bound")
            return indices
def __getitem__(self, index):
    from numbers import Number

    if isinstance(index, slice):
        start, stop, step = index.indices(len(self))
        sln = (stop - start) // step
        sln = max(0, sln)
        start += self._start
        stop += self._start
        if sln == 0:
            return RangeIndex(0, None, self.name)
        elif step == 1:
            return RangeIndex(start, stop, self.name)
        else:
            return index_from_range(start, stop, step)
    elif isinstance(index, Number):
        index = utils.normalize_index(index, len(self))
        index += self._start
        return index
    elif isinstance(index, (list, np.ndarray)):
        index = np.asarray(index)
        index = rmm.to_device(index)
    else:
        if is_scalar(index):
            index = min_signed_type(index)(index)
        index = column.as_column(index)

    return as_index(self._values[index], name=self.name)
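# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): a step-1 slice of a RangeIndex stays a cheap RangeIndex, while
# stepped slices and array keys fall through to a materialized index, and a
# scalar position comes back as a plain integer.
def _example_rangeindex_getitem():
    import cudf

    idx = cudf.RangeIndex(0, 10)
    assert isinstance(idx[2:5], cudf.RangeIndex)  # step-1 slice stays lazy
    assert idx[3] == 3  # scalar position -> plain integer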
def _setitem_tuple_arg(self, key, value):
    if isinstance(self._df.index, cudf.MultiIndex) or isinstance(
        self._df.columns, pd.MultiIndex
    ):
        raise NotImplementedError(
            "Setting values using df.loc[] not supported on "
            "DataFrames with a MultiIndex"
        )

    try:
        columns = self._get_column_selection(key[1])
    except KeyError:
        if not self._df.empty and isinstance(key[0], slice):
            pos_range = get_label_range_or_mask(
                self._df.index, key[0].start, key[0].stop, key[0].step
            )
            idx = self._df.index[pos_range]
        elif self._df.empty and isinstance(key[0], slice):
            idx = None
        else:
            idx = cudf.Index(key[0])
        if is_scalar(value):
            length = len(idx) if idx is not None else 1
            value = as_column(value, length=length)

        new_col = cudf.Series(value, index=idx)
        if not self._df.empty:
            new_col = new_col._align_to_index(self._df.index, how="right")

        if self._df.empty:
            self._df.index = (
                idx if idx is not None else cudf.RangeIndex(len(new_col))
            )
        self._df._data.insert(key[1], new_col)
    else:
        for col in columns:
            self._df[col].loc[key[0]] = value
def _get_column_selection(self, arg):
    cols = self._df.columns
    if isinstance(cols, cudf.MultiIndex):
        return cols._get_column_major(self._df, arg)
    if is_scalar(arg):
        return [cols[arg]]
    else:
        return cols[arg]
def normalize_binop_value(self, other):
    # np.int is a removed alias of the builtin int; accept numpy integer
    # scalars explicitly instead
    if is_scalar(other) and isinstance(other, (int, np.integer, Decimal)):
        return cudf.Scalar(Decimal(other))
    elif isinstance(other, cudf.Scalar) and isinstance(
        other.dtype, cudf.Decimal64Dtype
    ):
        return other
    else:
        raise TypeError(f"cannot normalize {type(other)}")
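# Hedged note (assumption, not from the original source): the point of the
# normalization above is to keep decimal arithmetic exact, so an int operand
# is routed through decimal.Decimal before wrapping, e.g. the int 2 becomes
# cudf.Scalar(Decimal(2)) rather than a float64 scalar. Non-integral,
# non-decimal operands raise TypeError.
def _example_decimal_normalize():
    from decimal import Decimal

    import cudf

    # what an int operand is normalized to before the binary op runs:
    s = cudf.Scalar(Decimal(2))
    print(s)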
def __getitem__(self, arg):
    if isinstance(arg, tuple):
        arg = list(arg)
    data = self._sr._column[arg]

    if is_scalar(data) or data is None:
        return data
    index = self._sr.index.take(arg)
    return self._sr._copy_construct(data=data, index=index)
def _getitem_tuple_arg(self, arg):
    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns_df = self._get_column_selection(arg[1])
    columns_df._index = self._df._index

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg[0], slice):
            df = columns_df[arg[0]]
        else:
            df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df.iloc[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        if isinstance(arg[0], slice):
            df = columns_df._slice(arg[0])
        elif is_scalar(arg[0]):
            index = arg[0]
            if index < 0:
                index += len(columns_df)
            df = columns_df._slice(slice(index, index + 1, 1))
        else:
            arg = (column.as_column(arg[0]), arg[1])
            if pd.api.types.is_bool_dtype(arg[0]):
                df = columns_df._apply_boolean_mask(arg[0])
            else:
                df = columns_df._gather(arg[0])

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)

    if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
        from cudf.core.index import RangeIndex

        slice_len = len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df
def fillna(self, fill_value):
    if is_scalar(fill_value):
        fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(self, fill_value)
    result = column.build_column(result.data, result.dtype, mask=None)
    return result
def fillna(self, fill_value, inplace=False):
    if is_scalar(fill_value):
        fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(self, fill_value)
    result = result.replace(mask=None)
    return self._mimic_inplace(result, inplace)
def fillna(self, fill_value=None, method=None):
    if fill_value is not None:
        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)
        if is_scalar(fill_value):
            if not isinstance(fill_value, cudf.Scalar):
                fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)

    return super().fillna(fill_value, method)
def fillna(self, fill_value):
    if cudf.utils.utils.isnat(fill_value):
        return _fillna_natwise(self)
    if is_scalar(fill_value):
        if not isinstance(fill_value, cudf.Scalar):
            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)
    result = libcudf.replace.replace_nulls(self, fill_value)
    return result
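# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): the datetime/timedelta fillna variants above wrap a scalar fill
# value in cudf.Scalar with the column's dtype, while a NaT fill value takes
# the dedicated _fillna_natwise path.
def _example_datetime_fillna():
    import numpy as np

    import cudf

    sr = cudf.Series(["2001-01-01", None], dtype="datetime64[ns]")
    filled = sr.fillna(np.datetime64("2001-01-02"))  # scalar -> cudf.Scalar
    assert filled.null_count == 0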
def __getitem__(self, arg):
    if is_scalar(arg):
        return self.__getattr__(arg)
    else:
        arg = list(arg)
        by = None
        if self._groupby.level is None:
            by = self._groupby.key_columns
        return self._df[arg].groupby(
            by=by,
            level=self._groupby.level,
            as_index=self._groupby.as_index,
            sort=self._groupby.sort,
            dropna=self._groupby.dropna,
        )
def fillna(self, fill_value):
    if cudf.utils.utils.isnat(fill_value):
        return _fillna_natwise(self)
    col = self
    if is_scalar(fill_value):
        if isinstance(fill_value, np.timedelta64):
            dtype = determine_out_dtype(self.dtype, fill_value.dtype)
            fill_value = fill_value.astype(dtype)
            col = col.astype(dtype)
        if not isinstance(fill_value, cudf.Scalar):
            fill_value = cudf.Scalar(fill_value, dtype=dtype)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)
    result = libcudf.replace.replace_nulls(col, fill_value)
    return result
def key_from_by(self, by):
    """
    Get (key_name, key_column) pair from a single *by* argument
    """
    if is_scalar(by):
        self.df_key_names.append(by)
        key_name = by
        key_column = self.obj[by]
    else:
        if len(by) != len(self.obj):
            raise NotImplementedError(
                "cuDF does not support arbitrary series index lengths "
                "for groupby"
            )
        key_name = by.name
        key_column = by
    return key_name, key_column
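# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): a *by* argument may be a scalar column name or a Series whose
# length matches the grouped object, as the check above enforces.
def _example_groupby_keys():
    import cudf

    df = cudf.DataFrame({"k": [1, 1, 2], "v": [10, 20, 30]})
    df.groupby("k").sum()  # scalar key: looked up as a column name
    df.groupby(cudf.Series([0, 1, 0])).sum()  # Series key of equal length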
def fillna(self, fill_value=None, method=None):
    if fill_value is not None:
        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)
        col = self
        if is_scalar(fill_value):
            if isinstance(fill_value, np.timedelta64):
                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                fill_value = fill_value.astype(dtype)
                col = col.astype(dtype)
            if not isinstance(fill_value, cudf.Scalar):
                fill_value = cudf.Scalar(fill_value, dtype=dtype)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
        return ColumnBase.fillna(col, fill_value)
    else:
        return super().fillna(method=method)
def _get_column_selection(self, arg):
    if is_scalar(arg):
        return [arg]
    elif isinstance(arg, slice):
        start = self._df.columns[0] if arg.start is None else arg.start
        stop = self._df.columns[-1] if arg.stop is None else arg.stop
        cols = []
        within_slice = False
        for c in self._df.columns:
            if c == start:
                within_slice = True
            if within_slice:
                cols.append(c)
            if c == stop:
                break
        return cols
    else:
        return arg
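# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): label-based column slices are inclusive of the stop label, unlike
# positional slices, and open ends default to the first/last column, as the
# loop above implements.
def _example_loc_column_slice():
    import cudf

    df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]})
    assert list(df.loc[:, "a":"b"].columns) == ["a", "b"]  # stop inclusive
    assert list(df.loc[:, :"b"].columns) == ["a", "b"]  # open start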
def _loc_to_iloc(self, arg):
    if is_scalar(arg):
        if not is_numerical_dtype(self._sr.index.dtype):
            # TODO: switch to cudf.utils.dtypes.is_integer(arg)
            if isinstance(
                arg, cudf.Scalar
            ) and pd.api.types.is_integer_dtype(arg.dtype):
                found_index = arg.value
                return found_index
            elif pd.api.types.is_integer(arg):
                found_index = arg
                return found_index
        try:
            found_index = self._sr.index._values.find_first_value(
                arg, closest=False
            )
            return found_index
        except (TypeError, KeyError, IndexError, ValueError):
            raise KeyError("label scalar is out of bound")
    elif isinstance(arg, slice):
        return get_label_range_or_mask(
            self._sr.index, arg.start, arg.stop, arg.step
        )
    elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
        if isinstance(arg, pd.MultiIndex):
            arg = cudf.MultiIndex.from_pandas(arg)
        return indices_from_labels(self._sr, arg)
    else:
        arg = cudf.core.series.Series(cudf.core.column.as_column(arg))
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            indices = indices_from_labels(self._sr, arg)
            if indices.null_count > 0:
                raise KeyError("label scalar is out of bound")
            return indices
def join(
    self, other, how="left", level=None, return_indexers=False, sort=False
):
    """
    Compute join_index and indexers to conform data structures
    to the new index.

    Parameters
    ----------
    other : Index.
    how : {'left', 'right', 'inner', 'outer'}
    return_indexers : bool, default False
    sort : bool, default False
        Sort the join keys lexicographically in the result Index. If False,
        the order of the join keys depends on the join type (how keyword).

    Returns
    -------
    index

    Examples
    --------
    >>> import cudf
    >>> lhs = cudf.DataFrame(
    ...     {"a": [2, 3, 1], "b": [3, 4, 2]}).set_index(['a', 'b']
    ... ).index
    >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index
    >>> lhs.join(rhs, how='inner')
    MultiIndex(levels=[0    1
    1    3
    dtype: int64, 0    2
    1    4
    dtype: int64],
    codes=   a  b
    0  1  1
    1  0  0)
    """
    if isinstance(self, cudf.MultiIndex) and isinstance(
        other, cudf.MultiIndex
    ):
        raise TypeError(
            "Join on level between two MultiIndex objects is ambiguous"
        )

    if level is not None and not is_scalar(level):
        raise ValueError("level should be an int or a label only")

    if isinstance(other, cudf.MultiIndex):
        if how == "left":
            how = "right"
        elif how == "right":
            how = "left"
        rhs = self.copy(deep=False)
        lhs = other.copy(deep=False)
    else:
        lhs = self.copy(deep=False)
        rhs = other.copy(deep=False)

    on = level
    # In case of MultiIndex, it will be None as
    # we don't need to update name
    left_names = lhs.names
    right_names = rhs.names
    # There should be no `None` values in Joined indices,
    # so essentially it would be `left/right` or 'inner'
    # in case of MultiIndex
    if isinstance(lhs, cudf.MultiIndex):
        if level is not None and isinstance(level, int):
            on = lhs._data.get_by_index(level).names[0]
        right_names = (on,) or right_names
        on = right_names[0]
        if how == "outer":
            how = "left"
        elif how == "right":
            how = "inner"
    else:
        # Both are normal indices
        right_names = left_names
        on = right_names[0]

    lhs.names = left_names
    rhs.names = right_names

    output = lhs._merge(rhs, how=how, on=on, sort=sort)

    return output
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn' : prints last exceptions as warnings and
          return the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12
        is parsed as 2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds. See strftime documentation for more
        information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the
        origin (unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys
    can be common abbreviations like ['year', 'month', 'day', 'minute',
    'second', 'ms', 'us', 'ns']) or plurals of the same.

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                      'month': [2, 3],
    ...                      'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
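# Hedged note (assumption, not from the original source; helper name is
# hypothetical): because the whole conversion sits in one try/except, a parse
# failure under errors="coerce" makes the entire call return a scalar
# np.datetime64("nat") rather than elementwise NaT, and errors="ignore"
# hands back the original input unchanged.
def _example_to_datetime_errors():
    import cudf

    out = cudf.to_datetime(["2020-01-01", "not-a-date"], errors="ignore")
    print(out)  # the original list, returned after the swallowed exception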
def _align_by_and_df(obj, by, how="inner"):
    """
    Returns a dataframe and a list that may contain a combination of
    column names and Series, intersected as per their indices.

    Examples
    --------
    Dataframe and Series in the 'by' have different indices:

    >>> import cudf
    >>> import cudf.core.groupby.groupby as grp_by
    >>> gdf = cudf.DataFrame(
    ...     {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3]
    ... )
    >>> gsr = cudf.Series([0.0, 1.0, 2.0], name='a', index=[2, 3, 4])
    >>> updtd_gdf, updtd_by = grp_by._align_by_and_df(gdf, ['x', gsr])
    >>> print(gdf)
         x  y
    1  1.0  1
    2  2.0  2
    3  3.0  1
    >>> print(updtd_gdf)
         x  y
    2  2.0  2
    3  3.0  1
    >>> print(by)
    ['x', 2    0.0
    3    1.0
    4    2.0
    Name: a, dtype: float64]
    >>> print(updtd_by)
    ['x', 2    0.0
    3    1.0
    Name: a, dtype: float64]
    """
    if not isinstance(by, (list, tuple)):
        by = [by]

    series_count = 0
    join_required = False
    series = []
    for by_col in by:
        if not is_scalar(by_col) and not isinstance(by_col, cudf.Index):
            sr = by_col
            if not isinstance(by_col, cudf.Series):
                sr = cudf.Series(by_col)
            if not join_required and not obj.index.equals(sr.index):
                join_required = True
            series.append(sr)

    new_obj = None
    if join_required:
        for sr in series:
            if new_obj is None:
                new_obj = sr.to_frame(series_count)
            else:
                new_obj = new_obj.join(
                    sr.to_frame(series_count), how=how, sort=True
                )
            series_count += 1

    series_count = 0
    new_by = []
    if new_obj is not None:
        new_obj = new_obj.join(obj, how=how, sort=True)
        columns = new_obj.columns
        for by_col in by:
            if not is_scalar(by_col) and not isinstance(by_col, cudf.Index):
                sr, sr.name = (
                    cudf.Series(new_obj[columns[series_count]]),
                    by_col.name,
                )
                new_by.append(sr)
                series_count += 1
            else:
                new_by.append(by_col)
        new_obj = new_obj[columns[series_count::]]
    else:
        new_obj = obj
        new_by = by
    return new_obj, new_by
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype
    """
    from cudf.core import column

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if isinstance(key, slice):
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                )
            raise

    self._mimic_inplace(out, inplace=True)
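# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): slice keys go through the copy_range path above, while boolean
# masks and integer keys are normalized to positions and scattered.
def _example_column_setitem():
    import cudf

    sr = cudf.Series([1, 2, 3, 4])
    sr[1:3] = 0  # contiguous slice -> copy_range path
    sr[sr > 2] = -1  # boolean mask -> positions -> scatter path
    sr[[0, 3]] = 9  # integer keys -> scatter path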
def _getitem_tuple_arg(self, arg):
    from uuid import uuid4

    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.dataframe import DataFrame
    from cudf.core.index import as_index

    # Step 1: Gather columns
    if isinstance(arg, tuple):
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index
    else:
        columns_df = self._df

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg, (MultiIndex, pd.MultiIndex)):
            if isinstance(arg, pd.MultiIndex):
                arg = MultiIndex.from_pandas(arg)

            indices = indices_from_labels(columns_df, arg)
            return columns_df.take(indices)
        else:
            if isinstance(arg, tuple):
                return columns_df.index._get_row_major(columns_df, arg[0])
            else:
                return columns_df.index._get_row_major(columns_df, arg)
    else:
        if isinstance(arg[0], slice):
            out = get_label_range_or_mask(
                columns_df.index, arg[0].start, arg[0].stop, arg[0].step
            )
            if isinstance(out, slice):
                df = columns_df._slice(out)
            else:
                df = columns_df._apply_boolean_mask(out)
        else:
            tmp_arg = arg
            if is_scalar(arg[0]):
                # If a scalar, there is a possibility of having duplicates.
                # Join would get all the duplicates. So, converting it to
                # an array kind.
                tmp_arg = ([tmp_arg[0]], tmp_arg[1])
            if len(tmp_arg[0]) == 0:
                return columns_df._empty_like(keep_index=True)
            tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

            if pd.api.types.is_bool_dtype(tmp_arg[0]):
                df = columns_df._apply_boolean_mask(tmp_arg[0])
            else:
                tmp_col_name = str(uuid4())
                other_df = DataFrame(
                    {tmp_col_name: column.arange(len(tmp_arg[0]))},
                    index=as_index(tmp_arg[0]),
                )
                df = other_df.join(columns_df, how="inner")
                # as join is not assigning any names to index,
                # update it over here
                df.index.name = columns_df.index.name
                df = df.sort_values(tmp_col_name)
                df.drop(columns=[tmp_col_name], inplace=True)
                # There were no indices found
                if len(df) == 0:
                    raise KeyError(arg)

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)
    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
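# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): a scalar row label is wrapped in a list so the inner join above
# surfaces every duplicate occurrence of that label.
def _example_loc_duplicate_labels():
    import cudf

    df = cudf.DataFrame({"v": [1, 2, 3]}, index=["a", "a", "b"])
    assert len(df.loc["a"]) == 2  # both rows carrying the duplicate label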
def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    **kwargs,
):
    """{docstring}"""

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    if na_values is not None and is_scalar(na_values):
        na_values = [na_values]

    if keep_default_na is False:
        # TODO: Remove this error once the following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6680
        raise NotImplementedError(
            "keep_default_na=False is currently not supported, please refer "
            "to: https://github.com/rapidsai/cudf/issues/6680"
        )

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )
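# Hedged usage sketch (hypothetical helper name, assumes a working cudf
# install): a single path or in-memory buffer is accepted, and a scalar
# na_values is normalized to a one-element list before being passed through.
def _example_read_csv_buffer():
    from io import StringIO

    import cudf

    buf = StringIO("a,b\n1,-\n2,3\n")
    df = cudf.read_csv(buf, na_values="-")  # scalar na_values -> ["-"]
    assert df["b"].null_count == 1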