def array_to_series(array):
    """Build a ``Series`` from a pyarrow array, chunked array or column.

    ``pa.ChunkedArray`` and ``pa.Column`` inputs are handled by converting
    each chunk recursively and concatenating the results with
    ``Series._concat``.  For a single array, the buffers returned by
    ``make_device_arrays`` are interpreted according to the arrow type:

    * dictionary arrays become a ``CategoricalColumn`` built from the
      converted indices and dictionary;
    * string arrays are handed to ``nvstrings.from_offsets`` as raw pointers;
    * any other array with a data buffer is sliced to
      ``[offset, offset + len(array))``.

    Parameters
    ----------
    array : pa.Array, pa.ChunkedArray or pa.Column

    Returns
    -------
    Series
        The converted series.  The mask is attached through ``set_mask``
        when the array reports nulls, a mask exists, and the constructed
        series does not already have a null mask.
    """
    # Containers of chunks: convert every chunk, then glue them together.
    if isinstance(array, pa.ChunkedArray):
        return Series._concat([array_to_series(part) for part in array.chunks])
    if isinstance(array, pa.Column):
        return Series._concat(
            [array_to_series(part) for part in array.data.chunks]
        )

    n_rows = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    mask = buffers[0]
    data = buffers[1]
    arrow_type = array.type
    dtype = arrow_to_pandas_dtype(arrow_type)

    if pa.types.is_dictionary(arrow_type):
        from cudf.core.column import CategoricalColumn

        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=codes.data,
            mask=mask,
            null_count=null_count,
            categories=categories,
            ordered=array.type.ordered,
        )
    elif pa.types.is_string(arrow_type):
        import nvstrings

        # For strings, buffer 1 holds the offsets and buffer 2 the characters.
        offsets, chars = buffers[1], buffers[2]
        offsets = offsets[array.offset : array.offset + n_rows + 1]
        if chars is not None:
            chars = chars.device_ctypes_pointer.value
        # NOTE: ``mask`` is deliberately rebound to the raw pointer value;
        # the ``set_mask`` check at the end sees this rebound value.
        if mask is not None:
            mask = mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            chars,
            offsets.device_ctypes_pointer.value,
            n_rows,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        data = data[array.offset : array.offset + len(array)]

    series = Series(data, dtype=dtype)
    needs_mask = (
        null_count > 0 and mask is not None and not series.has_null_mask
    )
    return series.set_mask(mask, null_count) if needs_mask else series
def _tile(A, reps):
    """Repeat ``A`` end to end ``reps`` times and return one Series.

    When ``reps`` is not positive, an empty Series carrying ``A``'s dtype
    is returned instead.
    """
    copies = [A] * reps
    if not reps > 0:
        return Series([], dtype=A.dtype)
    return Series._concat(objs=copies, index=None)
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
        ``None`` entries are dropped before concatenation.
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along.
    ignore_index : bool, default False
        Set True to ignore the index of the *objs* and provide a
        default range index instead.
    sort : None or False
        Not supported; any other value raises ``NotImplementedError``.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    When only one non-None object is passed, that object itself is
    returned (no copy).

    Raises
    ------
    NotImplementedError
        If ``sort`` is requested, or if column-wise concatenation would
        produce duplicate column names.
    ValueError
        If ``objs`` is empty or contains only ``None``, if ``axis`` is not
        recognised, or if row-wise concatenation mixes object types.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    objs = [obj for obj in objs if obj is not None]

    # Return for single object
    if len(objs) == 1:
        return objs[0]

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    typs = {type(o) for o in objs}
    allowed_typs = {Series, DataFrame}

    param_axis = _axis_map.get(axis, None)
    if param_axis is None:
        # Report the value the caller passed; ``param_axis`` is always
        # None on this path and would make the message useless.
        raise ValueError(
            '`axis` must be 0 / "index" or 1 / "columns", got: {0}'.format(
                axis
            )
        )
    axis = param_axis

    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()

        # Promote every Series to a single-column frame.  Unnamed Series
        # are numbered 0, 1, 2, ... in order of appearance.
        sr_name = 0
        for idx, o in enumerate(objs):
            if isinstance(o, Series):
                name = o.name
                if name is None:
                    name = sr_name
                    sr_name += 1
                objs[idx] = o.to_frame(name=name)

        for idx, o in enumerate(objs):
            if idx == 0:
                # The result takes the index of the first object.
                df.index = o.index
            for col in o._data.names:
                if col in df._data:
                    raise NotImplementedError(
                        "A Column with duplicate name found: {0}, cuDF "
                        "doesn't support having multiple columns with "
                        "same names yet.".format(col)
                    )
                df[col] = o._data[col]

        result_columns = objs[0].columns
        for o in objs[1:]:
            result_columns = result_columns.append(o.columns)

        df.columns = result_columns
        return df

    if len(typs) > 1:
        raise ValueError(
            "`concat` expects all objects to be of the same "
            "type. Got mix of %r." % [t.__name__ for t in typs]
        )
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif typ is cudf.MultiIndex:
        return cudf.MultiIndex._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
def _normalize_melt_vars(vars_, frame, label):
    """Return ``vars_`` as a list of column names that exist in ``frame``.

    Strings, bytes and any other non-sequence value are treated as a single
    column name; tuples, lists and NumPy arrays are expanded element-wise.
    ``label`` ('id_vars' or 'value_vars') is only used in the error message.
    Raises ``KeyError`` when a name is not in ``frame.columns``.
    """
    import collections.abc

    # ``str``/``bytes`` are Sequences, but here they name ONE column; a
    # plain Sequence check would split "AB" into ['A', 'B'].
    is_list_like = isinstance(
        vars_, (collections.abc.Sequence, np.ndarray)
    ) and not isinstance(vars_, (str, bytes))
    names = list(vars_) if is_list_like else [vars_]

    missing = set(names) - set(frame.columns)
    if missing:
        raise KeyError(
            "The following '{label}' are not present"
            " in the DataFrame: {missing}"
            "".format(label=label, missing=list(missing))
        )
    return names


def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        A single scalar (including a string) names one column.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        A single scalar (including a string) names one column.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'
    col_level : None
        Must be None (see "Difference from pandas").

    Returns
    -------
    out : DataFrame
        Melted result

    Raises
    ------
    KeyError
        If a name in `id_vars` or `value_vars` is not a column of `frame`,
        or if `id_vars` and `value_vars` overlap.
    NotImplementedError
        If any selected column is categorical.
    ValueError
        If the `value_vars` columns do not all share one dtype.

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B variable value
    0    1    1        C   1.0
    1    1    3        C
    2    5    6        C   4.0
    3    1    1        D   2.0
    4    1    3        D   5.0
    5    5    6        D   6.0
    """
    assert col_level in (None,)

    # Arg cleaning
    import collections

    # id_vars
    if id_vars is not None:
        id_vars = _normalize_melt_vars(id_vars, frame, "id_vars")
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        value_vars = _normalize_melt_vars(value_vars, frame, "value_vars")
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError(
            "Categorical columns are not yet " "supported for function"
        )

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap))
        )

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series([], dtype=A.dtype)

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    # Each value column contributes N rows whose category code is the
    # column's position in ``value_vars``.
    # NOTE(review): codes are stored as int8, which tops out at 127 --
    # confirm behaviour when there are more than 128 value_vars.
    var_cols = [
        Series(cudautils.full(size=N, value=i, dtype=np.int8))
        for i in range(K)
    ]
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        build_categorical_column(
            categories=value_vars,
            codes=as_column(temp._column.base_data, dtype=temp._column.dtype),
            mask=temp._column.base_mask,
            size=temp._column.size,
            offset=temp._column.offset,
            ordered=False,
        )
    )

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars], index=None
    )

    return DataFrame(mdata)
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along.
    ignore_index : bool, default False
        Set True to ignore the index of the *objs* and provide a
        default range index instead.
    sort : None or False
        Not supported; any other value raises ``NotImplementedError``.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    When ``objs`` holds exactly one object, that object itself is
    returned (no copy).

    Raises
    ------
    NotImplementedError
        If ``sort`` is requested.
    ValueError
        If ``objs`` is empty, if ``axis`` is not recognised, or if
        row-wise concatenation mixes object types.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = {type(o) for o in objs}
    allowed_typs = {Series, DataFrame}

    param_axis = _axis_map.get(axis, None)
    if param_axis is None:
        # Report the value the caller passed; ``param_axis`` is always
        # None on this path and would make the message useless.
        raise ValueError(
            '`axis` must be 0 / "index" or 1 / "columns", got: {0}'.format(
                axis
            )
        )
    axis = param_axis

    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        # Unnamed Series are numbered 0, 1, 2, ... in order of appearance
        # (pandas convention), independent of their position in ``objs``.
        unnamed_count = 0
        for o in objs:
            if isinstance(o, Series):
                name = o.name
                if name is None:
                    name = unnamed_count
                    unnamed_count += 1
                df[name] = o
            else:
                for col in o.columns:
                    df[col] = o[col]
        return df

    if len(typs) > 1:
        raise ValueError(
            "`concat` expects all objects to be of the same "
            "type. Got mix of %r." % [t.__name__ for t in typs]
        )
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
def _concat_single_object(obj, ignore_index):
    """Return the result of "concatenating" the single object ``obj``.

    Without ``ignore_index`` this is ``obj.copy()``.  With ``ignore_index``
    a new ``cudf.DataFrame`` is built from a deep copy of ``obj._data`` with
    a fresh ``RangeIndex``.
    """
    if ignore_index:
        # NOTE(review): this always produces a DataFrame, even when ``obj``
        # is a Series or an Index -- confirm that is intended.
        return cudf.DataFrame(
            data=obj._data.copy(deep=True),
            index=cudf.RangeIndex(len(obj)),
        )
    return obj.copy()


def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
        ``None`` entries are dropped before concatenation.
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along.
    ignore_index : bool, default False
        Set True to ignore the index of the *objs* and provide a
        default range index instead.
    sort : bool, default False
        Sort non-concatenation axis if it is not already aligned.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.

    Raises
    ------
    ValueError
        If ``objs`` is empty or contains only ``None``, if an object is
        not a DataFrame, Series or Index, if ``axis`` is not recognised,
        or if the object types cannot be combined.
    NotImplementedError
        If column-wise concatenation would produce duplicate column names.

    Examples
    --------
    Combine two ``Series``.

    >>> import cudf
    >>> s1 = cudf.Series(['a', 'b'])
    >>> s2 = cudf.Series(['c', 'd'])
    >>> s1
    0    a
    1    b
    dtype: object
    >>> s2
    0    c
    1    d
    dtype: object
    >>> cudf.concat([s1, s2])
    0    a
    1    b
    0    c
    1    d
    dtype: object

    Clear the existing index and reset it in the
    result by setting the ``ignore_index`` option to ``True``.

    >>> cudf.concat([s1, s2], ignore_index=True)
    0    a
    1    b
    2    c
    3    d
    dtype: object

    Combine two DataFrame objects with identical columns.

    >>> df1 = cudf.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df1
      letter  number
    0      a       1
    1      b       2
    >>> df2 = cudf.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> df2
      letter  number
    0      c       3
    1      d       4
    >>> cudf.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4

    Combine DataFrame objects with overlapping columns and return
    everything. Columns outside the intersection will
    be filled with ``null`` values.

    >>> df3 = cudf.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
    ...                    columns=['letter', 'number', 'animal'])
    >>> df3
      letter  number animal
    0      c       3    cat
    1      d       4    dog
    >>> cudf.concat([df1, df3], sort=False)
      letter  number animal
    0      a       1   None
    1      b       2   None
    0      c       3    cat
    1      d       4    dog

    Combine ``DataFrame`` objects horizontally along the
    x axis by passing in ``axis=1``.

    >>> df4 = cudf.DataFrame([['bird', 'polly'], ['monkey', 'george']],
    ...                    columns=['animal', 'name'])
    >>> df4
       animal    name
    0    bird   polly
    1  monkey  george
    >>> cudf.concat([df1, df4], axis=1)
      letter  number  animal    name
    0      a       1    bird   polly
    1      b       2  monkey  george
    """
    if not objs:
        raise ValueError("No objects to concatenate")

    objs = [obj for obj in objs if obj is not None]

    # Return for single object
    if len(objs) == 1:
        return _concat_single_object(objs[0], ignore_index)

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    # Retrieve the base types of `objs`. In order to support sub-types
    # and object wrappers, we use `isinstance()` instead of comparing
    # types directly
    typs = set()
    for o in objs:
        if isinstance(o, cudf.MultiIndex):
            typs.add(cudf.MultiIndex)
        # ``elif`` so that an object already classified as MultiIndex is
        # not registered a second time under its concrete type.
        elif issubclass(type(o), Index):
            typs.add(type(o))
        elif isinstance(o, DataFrame):
            typs.add(DataFrame)
        elif isinstance(o, Series):
            typs.add(Series)
        else:
            raise ValueError(f"cannot concatenate object of type {type(o)}")

    allowed_typs = {Series, DataFrame}

    param_axis = _axis_map.get(axis, None)
    if param_axis is None:
        # Report the value the caller passed; ``param_axis`` is always
        # None on this path and would make the message useless.
        raise ValueError(
            '`axis` must be 0 / "index" or 1 / "columns", got: {0}'.format(
                axis
            )
        )
    axis = param_axis

    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        _normalize_series_and_dataframe(objs, axis=axis)

        objs, match_index = _align_objs(objs)

        for idx, o in enumerate(objs):
            if not ignore_index and idx == 0:
                # The result takes the index of the first object.
                df.index = o.index
            for col in o._data.names:
                if col in df._data:
                    raise NotImplementedError(
                        "A Column with duplicate name found: {0}, cuDF "
                        "doesn't support having multiple columns with "
                        "same names yet.".format(col)
                    )
                df[col] = o._data[col]

        result_columns = objs[0].columns
        for o in objs[1:]:
            result_columns = result_columns.append(o.columns)

        df.columns = result_columns.unique()
        if ignore_index:
            df.index = None
            return df
        elif not match_index:
            return df.sort_index()
        else:
            return df

    typ = list(typs)[0]

    if len(typs) > 1:
        if allowed_typs == typs:
            # This block of code will run when `objs` has
            # both Series & DataFrame kind of inputs.
            _normalize_series_and_dataframe(objs, axis=axis)
            typ = DataFrame
        else:
            raise ValueError(
                "`concat` cannot concatenate objects of "
                "types: %r." % sorted([t.__name__ for t in typs])
            )

    if typ is DataFrame:
        objs = [obj for obj in objs if obj.shape != (0, 0)]
        if len(objs) == 0:
            # If objs is empty, that indicates all of
            # objs are empty dataframes.
            return cudf.DataFrame()
        elif len(objs) == 1:
            return _concat_single_object(objs[0], ignore_index)
        else:
            return DataFrame._concat(
                objs, axis=axis, ignore_index=ignore_index, sort=sort
            )
    elif typ is Series:
        return Series._concat(
            objs, axis=axis, index=None if ignore_index else True
        )
    elif typ is cudf.MultiIndex:
        return cudf.MultiIndex._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError(f"cannot concatenate object of type {typ}")