def sum(
    self,
    axis=None,
    skipna=None,
    level=None,
    numeric_only=None,
    min_count=0,
    **kwargs,
):
    axis = self._get_axis_number(axis) if axis is not None else 0
    if numeric_only not in (None, True):
        raise err._unsupported_error("numeric_only", numeric_only)
    if min_count > 0:
        raise err._unsupported_error("min_count", min_count)
    return self._unary_reduction(
        [("sum", numeric_only)],
        axis=axis,
        skipna=skipna,
        level=level,
        min_count=min_count,
        **kwargs,
    )
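# Usage sketch (illustrative, not from the source): Legate Pandas mirrors
# the pandas API, so the supported call shapes read like plain pandas,
# which is used below only to show the semantics sum() implements.
import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [4, 5, 6]})
print(df.sum())              # axis defaults to 0; NaNs are skipped
print(df.sum(skipna=False))  # "a" sums to NaN
# df.sum(min_count=2) would raise here: min_count > 0 is unsupported.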
def var(
    self,
    axis=None,
    skipna=None,
    level=None,
    ddof=1,
    numeric_only=None,
    **kwargs,
):
    axis = self._get_axis_number(axis) if axis is not None else 0
    if numeric_only not in (None, True):
        raise err._unsupported_error("numeric_only", numeric_only)
    if ddof != 1:
        raise err._unsupported_error("ddof", ddof)
    return self._unary_reduction(
        [("var", numeric_only)],
        axis=axis,
        skipna=skipna,
        level=level,
        ddof=ddof,
        **kwargs,
    )
def sum(self, numeric_only=True, min_count=0):
    if numeric_only not in (True, None):
        raise err._unsupported_error("numeric_only", numeric_only)
    if min_count > 0:
        raise err._unsupported_error("min_count", min_count)
    return self._groupby_reduce(ops=[("sum", numeric_only)])
def unary_reduction(df, ops, axis=0, skipna=True, level=None):
    if isinstance(ops, list):
        ops = [_maybe_convert_to_default(desc) for desc in ops]
    else:
        # TODO: We will hit this case once we add agg/aggregate
        assert False

    if axis != 0:
        raise err._unsupported_error("axis", axis)
    if skipna not in (True, None):
        raise err._unsupported_error("skipna", skipna)
    if level is not None:
        raise err._unsupported_error("level", level)

    # Drop columns whose dtypes are incompatible with the requested ops
    columns = df._frame._columns
    indexer = []
    for idx, column in enumerate(columns):
        if incompatible_ops(ops, column.dtype.to_pandas()):
            continue
        indexer.append(idx)
    valid_columns = [columns[idx] for idx in indexer]
    ops = [desc[0] for desc in ops]

    if df._is_series:
        if len(valid_columns) == 0:
            raise TypeError(
                f"Cannot perform reduction '{ops[0]}' "
                f"with {columns[0].dtype} dtype"
            )
        result = valid_columns[0].unary_reduction(ops[0], skipna)
        return result.get_scalar().value
    else:
        frame = df._frame.replace_columns(valid_columns)
        columns = df.columns[indexer]
        new_frame = frame.unary_reduction(
            ops[0],
            columns,
            axis=axis,
            skipna=skipna,
            level=level,
        )
        if len(new_frame._columns) > 1:
            from .dataframe import DataFrame

            return DataFrame(frame=new_frame, columns=df.columns)
        else:
            from .series import Series

            return Series(frame=new_frame)
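# Descriptor sketch (an assumption drawn from the call sites above): each
# op arrives as a (name, numeric_only) pair, incompatible columns (e.g.
# string columns for "var") are filtered out before reducing, and a Series
# input collapses to a plain Python scalar.
#
#   unary_reduction(df, [("sum", True)])   # DataFrame -> Series of sums
#   unary_reduction(sr, [("var", None)])   # Series    -> scalar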
def _copy_if_else(
    self,
    cond,
    other=None,
    inplace=False,
    axis=None,
    level=None,
    errors="raise",
    try_cast=False,
    negate=False,
):
    inplace = validate_bool_kwarg(inplace, "inplace")
    axis = self._get_axis_number(axis, 0)

    if level is not None:
        raise err._unsupported_error("level", level)
    if axis not in (0,):
        raise err._unsupported_error("axis", axis)
    if try_cast not in (False,):
        raise err._unsupported_error("try_cast", try_cast)

    # Checks on cond
    cond = self._ensure_valid_frame(cond)
    if self.ndim < cond.ndim:
        raise ValueError(
            "cannot use the higher dimensional dataframe for 'cond'"
        )
    _, cond = self._align_frame(cond, join="left", broadcast_axis=1)
    if any(not is_bool_dtype(dtype) for dtype in cond._get_dtypes()):
        raise ValueError("'cond' must have only boolean values")

    # Checks on other
    if not is_scalar(other):
        other = self._ensure_valid_frame(other)
        if self.ndim < other.ndim:
            raise ValueError(
                "cannot use the higher dimensional dataframe for 'other'"
            )
        _, other = self._align_frame(other, join="left", broadcast_axis=1)
        for l_dtype, r_dtype in zip(self._get_dtypes(), other._get_dtypes()):
            if l_dtype != r_dtype:
                raise ValueError("'other' must have the same type as self")
        other = other._frame
    else:
        other = util.sanitize_scalar(other)

    frame = self._frame.copy_if_else(cond._frame, other, negate=negate)
    return self._create_or_update_frame(frame, inplace)
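# Semantics sketch: negate=False presumably backs where() and negate=True
# backs mask(); plain pandas (which Legate Pandas mirrors) illustrates the
# two behaviors.
import pandas as pd

s = pd.Series([1, 2, 3, 4])
print(s.where(s > 2, other=0))  # keep values where cond holds, else other
print(s.mask(s > 2, other=0))   # the negated variant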
def __getitem__(self, key):
    (row_loc, row_scalar, out_ndim) = self._validate_locator(key)
    sr = self.sr

    if row_scalar:
        result = sr._frame.read_at(row_loc)
    elif isinstance(row_loc, slice):
        if row_loc == slice(None):
            result = sr._frame
        else:
            result = sr._frame.slice_rows_by_slice(row_loc, False)
    else:
        row_loc = sr._ensure_valid_frame(row_loc)
        if not row_loc._is_series:
            raise ValueError("indexer must be 1-dimensional")
        if not is_bool_dtype(row_loc.dtype):
            raise err._unsupported_error(
                "only boolean indexers are supported now"
            )
        # This may raise an exception if the indexer size doesn't match
        # with the index of the LHS.
        row_loc = row_loc._frame.update_legate_index(sr._raw_index)
        result = sr._frame.select(row_loc)

    try:
        return super().construct_result(result, out_ndim, row_scalar)
    except _NotFoundError:
        raise KeyError(row_loc)
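# Read-path sketch with plain pandas: a scalar label, a slice, and a
# boolean Series map onto the three branches above.
import pandas as pd

s = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(s.loc["a"])      # read_at path -> scalar
print(s.loc["a":"b"])  # slice path
print(s.loc[s > 1])    # boolean-mask path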
def mean(self, numeric_only=True):
    if numeric_only not in (True, None):
        raise err._unsupported_error("numeric_only", numeric_only)
    return self._groupby_reduce(ops=[("mean", numeric_only)])
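# Usage sketch for the groupby reductions above, written in plain pandas
# (which Legate Pandas mirrors); only numeric_only in (True, None) is
# accepted by either method.
import pandas as pd

df = pd.DataFrame({"k": ["x", "x", "y"], "v": [1, 2, 3]})
print(df.groupby("k").sum())
print(df.groupby("k").mean())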
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit=None,
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    # Anything that is not a Legate frame is delegated to vanilla Pandas
    if not isinstance(arg, Frame):
        result = pandas.to_datetime(
            arg,
            errors=errors,
            dayfirst=dayfirst,
            yearfirst=yearfirst,
            utc=utc,
            format=format,
            exact=exact,
            unit=unit,
            infer_datetime_format=infer_datetime_format,
            origin=origin,
            cache=cache,
        )
        return util.sanitize_scalar(result)

    if not (arg._is_series and is_string_dtype(arg.dtype)):
        raise err._unsupported_error("to_datetime handles only string columns")

    return arg.str.to_datetime(format)
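# Usage sketch: non-Legate inputs fall through to pandas.to_datetime, while
# Legate frames must be string Series; plain pandas shows both call shapes.
import pandas as pd

print(pd.to_datetime("2001-01-01"))                 # scalar path
print(pd.to_datetime(pd.Series(["2001-01-01", "2002-02-02"]),
                     format="%Y-%m-%d"))            # string-column path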
def _uncompress_files(paths, compressions):
    import tempfile

    new_paths = []
    to_remove = []
    for path, compression in zip(paths, compressions):
        if compression == CompressionType.UNCOMPRESSED:
            new_paths.append(path)
            continue

        out = os.path.join(
            tempfile.gettempdir(),
            f"_lg_uncompressed_{os.path.basename(path).replace('.gz', '')}",
        )
        new_paths.append(out)
        to_remove.append(out)

        if compression == CompressionType.GZIP:
            import gzip as decompress
        elif compression == CompressionType.BZ2:
            import bz2 as decompress
        else:
            from legate.pandas.common import errors as err

            raise err._unsupported_error(
                f"unsupported compression method '{compression.name.lower()}'"
            )

        with open(out, "wb") as f_out:
            with decompress.open(path, "rb") as f_in:
                shutil.copyfileobj(f_in, f_out)

    return new_paths, [CompressionType.UNCOMPRESSED] * len(paths), to_remove
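# Self-contained round trip of the decompress-to-tempfile pattern above
# (file names are hypothetical):
import gzip
import os
import shutil
import tempfile

src = os.path.join(tempfile.gettempdir(), "demo.csv.gz")
with gzip.open(src, "wt") as f:
    f.write("a,b\n1,2\n")

out = os.path.join(tempfile.gettempdir(), "demo.csv")
with gzip.open(src, "rb") as f_in, open(out, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

assert open(out).read() == "a,b\n1,2\n"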
def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
    axis = self._get_axis_number(axis, 0)
    inplace = validate_bool_kwarg(inplace, "inplace")

    if axis not in (0,):
        raise err._unsupported_error("axis", axis)

    if how is None and thresh is None:
        raise TypeError("must specify how or thresh")
    if how is not None and how not in ("any", "all"):
        raise ValueError("invalid how option: %s" % how)

    if subset is not None:
        idxr = self.columns.get_indexer_for(subset)
        mask = idxr == -1
        if mask.any():
            raise KeyError(list(np.compress(mask, subset)))
    else:
        idxr = list(range(len(self.columns)))

    # With how="any" a row must be valid in every inspected column;
    # with how="all" a single valid value is enough to keep the row.
    if thresh is None:
        thresh = len(idxr) if how == "any" else 1

    new_frame = self._frame.dropna(axis, idxr, thresh)
    return self._create_or_update_frame(new_frame, inplace)
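# Threshold sketch with plain pandas: how="any" keeps a row only if every
# inspected column is valid (thresh == len(idxr)), while how="all" keeps
# any row with at least one valid value (thresh == 1).
import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": [None, None]})
print(df.dropna(how="any"))     # drops both rows
print(df.dropna(how="all"))     # keeps row 0
print(df.dropna(subset=["a"]))  # only "a" is inspected; keeps row 0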
def sort_values(
    self,
    axis=0,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    ignore_index: bool = False,
):
    axis = self._get_axis_number(axis)
    if axis not in (0,):
        raise err._unsupported_error("axis", axis)
    if na_position not in ("first", "last"):
        raise err._invalid_value_error("na_position", na_position)

    ascending = self._get_ascending(ascending, 1)
    new_frame = self._frame.sort_values(
        [0],
        axis,
        ascending,
        kind,
        na_position,
        ignore_index,
    )
    return self._create_or_update_frame(new_frame, inplace)
def squeeze(self, axis=None):
    axis = self._get_axis_number(axis, None)
    if axis not in (1, None):
        raise err._unsupported_error("axis", axis)

    result = self
    if axis in (1, None) and len(result.columns) == 1:
        result = Series(frame=result._frame, name=result.columns[0])
    if axis in (0, None) and len(result) == 1:
        if result._is_series:
            result = result.to_pandas().squeeze()
        else:
            # TODO: We want to handle this case once we support series
            #       of mixed type values (which would be either expressed
            #       by its transpose or backed by a Pandas series).
            warnings.warn(
                "Squeezing a dataframe on both axes is currently "
                "unsupported unless the size is 1. Squeeze for axis=0 "
                "will be ignored."
            )
    return result
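# Semantics sketch with plain pandas: a single-column frame squeezes to a
# Series, and a 1x1 frame squeezes all the way down to a scalar.
import pandas as pd

print(type(pd.DataFrame({"a": [1, 2]}).squeeze(axis=1)))  # Series
print(pd.DataFrame({"a": [1]}).squeeze())                 # 1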
def __getattr__(self, key):
    try:
        return object.__getattribute__(self, key)
    except AttributeError as e:
        if hasattr(pandas.Series, key):
            raise err._unsupported_error(
                f"Series.{key} is not yet implemented in Legate Pandas."
            )
        raise e
def convert_agg_func(agg_func):
    if isinstance(agg_func, str):
        if agg_func not in _SUPPORTED_AGGS:
            raise err._unsupported_error(
                f"Unsupported aggregation method: {agg_func}"
            )
        return (agg_func, _NUMERIC_ONLY[agg_func])
    elif is_dict_like(agg_func):
        converted = {}
        for col, func in agg_func.items():
            funcs = util.to_list_if_scalar(convert_agg_func(func))
            converted[col] = funcs
        return converted
    elif is_list_like(agg_func):
        return [convert_agg_func(func) for func in agg_func]
    else:
        raise err._unsupported_error(
            f"Unsupported aggregation descriptor: {agg_func}"
        )
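# Shape sketch for the descriptors produced above (the numeric_only values
# shown are illustrative, since they come from the _NUMERIC_ONLY table):
#
#   convert_agg_func("sum")             # ("sum", True)
#   convert_agg_func(["sum", "count"])  # [("sum", True), ("count", False)]
#   convert_agg_func({"a": "mean"})     # {"a": [("mean", True)]}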
def sort_index(
    self,
    axis=0,
    level=None,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    sort_remaining=True,
    ignore_index: bool = False,
):
    axis = self._get_axis_number(axis)
    if axis not in (0,):
        raise err._unsupported_error("axis", axis)

    nlevels = self._raw_index.nlevels
    if nlevels == 1:
        # Pandas ignores level and sort_remaining for single-level indices,
        levels = [0] if level is None else util.to_list_if_scalar(level)
        # and it casts ascending to a boolean value...
        ascending = [bool(ascending)]
    else:
        if level is None:
            levels = list(range(nlevels))
            # When level is None, Pandas crops the ascending list
            # to match its length to the number of levels...
            ascending = self._get_ascending(ascending, nlevels)[:nlevels]
        else:
            levels = util.to_list_if_scalar(level)
            levels = [
                self._raw_index._get_level_number(lvl) for lvl in levels
            ]
            default_asc = bool(ascending)
            ascending = self._get_ascending(ascending, len(levels))
            if len(ascending) != len(levels):
                raise ValueError("level must have same length as ascending")

            # XXX: Pandas ignores sort_remaining for multi-level indices
            #      (GH #24247), and always sorts the levels monotonically
            #      before the actual sorting...
            #      Here we do the right thing and hopefully Pandas fixes
            #      its bug in the future.
            if sort_remaining:
                already_added = set(levels)
                for lvl in range(nlevels):
                    if lvl not in already_added:
                        levels.append(lvl)
                        ascending.append(default_asc)

    new_frame = self._frame.sort_index(
        axis=axis,
        levels=levels,
        ascending=ascending,
        kind=kind,
        na_position=na_position,
        ignore_index=ignore_index,
    )
    return self._create_or_update_frame(new_frame, inplace)
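# Standalone re-derivation of the sort_remaining expansion above: with a
# three-level index, level=1, and ascending=False, the untouched levels are
# appended after the requested one, each with the default ascending flag.
nlevels = 3
levels, ascending = [1], [False]
default_asc = False
already_added = set(levels)
for lvl in range(nlevels):
    if lvl not in already_added:
        levels.append(lvl)
        ascending.append(default_asc)
print(levels, ascending)  # [1, 0, 2] [False, False, False]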
def binary_op(self, op, other):
    reverse = False
    if op in _REVERSED_OPS:
        # Strip the leading "r" (e.g., "radd" -> "add") and remember
        # that the operands must be swapped.
        op = op[1:]
        reverse = True

    # Perform binary operation
    rhs1 = self._columns
    if is_scalar(other):
        other = self._runtime.create_scalar(other, ty.infer_dtype(other))
        rhs2 = [other] * len(rhs1)
    else:
        rhs2 = other._columns

    results = []
    for rh1, rh2 in zip(rhs1, rhs2):
        # If the right operand is integer, we convert it to the left
        # operand's dtype
        if isinstance(rh2, Scalar):
            if ty.is_integer_dtype(rh2.dtype):
                rh2 = rh2.astype(rh1.dtype)
            elif ty.is_categorical_dtype(rh1.dtype):
                rh2 = rh1.dtype.encode(rh2, unwrap=False, can_fail=True)
            else:
                common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                rh1 = rh1.astype(common_dtype)
                rh2 = rh2.astype(common_dtype)
        elif not (
            ty.is_categorical_dtype(rh1.dtype)
            or ty.is_categorical_dtype(rh2.dtype)
        ):
            common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
            rh1 = rh1.astype(common_dtype)
            rh2 = rh2.astype(common_dtype)

        lh_dtype = ty.get_binop_result_type(op, rh1.dtype, rh2.dtype)

        if ty.is_string_dtype(rh1.dtype) and op in ("add", "mul"):
            raise err._unsupported_error(
                f"unsupported operand type(s) for {op}: "
                f"'{rh1.dtype}' and '{rh2.dtype}'"
            )

        if reverse:
            rh1, rh2 = rh2, rh1

        # The column kernel expects the column operand first; if a scalar
        # ended up on the left, swap and record that for the kernel.
        swapped = False
        if isinstance(rh1, Scalar):
            rh1, rh2 = rh2, rh1
            swapped = True

        results.append(rh1.binary_op(op, rh2, lh_dtype, swapped=swapped))

    return Table(self._runtime, self._index, results)
def count(self, axis=0, level=None, numeric_only=False):
    axis = self._get_axis_number(axis) if axis is not None else 0
    if numeric_only not in (None, False):
        raise err._unsupported_error("numeric_only", numeric_only)
    return self._unary_reduction(
        [("count", numeric_only)], axis=axis, level=level
    )
def from_stores(type, stores, children=None):
    from .bitmask import Bitmask
    from .runtime import _runtime as rt

    if children is not None:
        raise err._unsupported_error("Only flat stores are accepted for now")

    # Stores come as a (null-mask, data) pair; the mask may be None.
    slices = [Column._import_store(rt, store) for store in stores]
    if len(stores) > 2:
        raise err._unsupported_error(f"Unsupported Legate Array type: {type}")

    dtype = ty.to_legate_dtype(type)
    assert dtype == slices[1].dtype
    bitmask = None if slices[0] is None else Bitmask(rt, slices[0])
    return Column(rt, slices[1], bitmask)
def _import_store(rt, store):
    if store is None:
        return None

    kind = store.kind
    if kind not in ((Region, FieldID), (Region, int)):
        raise err._unsupported_error(f"Unsupported Legate Store kind: {kind}")

    (region, fid) = store.storage
    if region.index_space.get_dim() != 1:
        raise err._unsupported_error("All Legate Arrays must be 1-D")

    dtype = ty.to_legate_dtype(store.type)
    if kind[1] is FieldID:
        fid = fid.fid
    storage = rt._create_external_storage(region)
    return storage.import_field(region, fid, dtype)
def from_pandas(cls, runtime, dtype):
    if dtype.categories.dtype != object:
        raise err._unsupported_error("Categories must be strings for now")

    categories_storage = runtime.create_storage(len(dtype.categories))
    categories_column = runtime._create_string_column_from_pandas(
        categories_storage,
        dtype.categories,
        num_pieces=1,
    ).as_replicated_column()
    return cls(categories_column, dtype.ordered)
def to_csv(
    self,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator=None,
    chunksize=None,
    partition=False,
):
    if not isinstance(path_or_buf, str):
        raise err._unsupported_error("path must be a string for now")
    if len(sep) != 1:
        raise err._unsupported_error("separator must be a character")

    line_terminator = (
        os.linesep if line_terminator is None else line_terminator
    )
    # The default chunk size is 8
    chunksize = 8 if chunksize is None else chunksize

    new_self = self
    if columns is not None:
        new_self = self[util.to_list_if_scalar(columns)]

    new_self._frame.to_csv(
        path=path_or_buf,
        sep=sep,
        na_rep=na_rep,
        header=header,
        index=index,
        line_terminator=line_terminator,
        chunksize=chunksize,
        partition=partition,
        column_names=new_self.columns.to_list(),
    )
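# Usage sketch (the path is hypothetical): only string paths are accepted,
# sep must be a single character, and rows are written in chunks of
# `chunksize` (default 8); partition's exact behavior is not shown here.
#
#   df.to_csv("/tmp/out.csv", sep=",", chunksize=1024)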
def find_common_dtype(dtype1, dtype2):
    if is_categorical_dtype(dtype1) and is_categorical_dtype(dtype2):
        from legate.pandas.common import errors as err

        raise err._unsupported_error("categorical dtypes are not supported yet")

    if dtype1 == dtype2:
        return dtype1
    else:
        return to_legate_dtype(
            np.find_common_type([dtype1.to_pandas(), dtype2.to_pandas()], [])
        )
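# Quick check of the promotion rule above with plain NumPy. Note that
# np.find_common_type is deprecated as of NumPy 1.25, so the call above
# assumes an older NumPy; np.promote_types gives the same answer here.
import numpy as np

print(np.promote_types("int64", "float32"))  # float64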
def __setitem__(self, key, item):
    (row_loc, row_scalar, _) = self._validate_locator(key)
    sr = self.sr
    self._validate_lhs(sr)

    if row_scalar:
        row_loc = sr._raw_index == row_loc
        index = sr._frame.slice_index_by_boolean_mask(row_loc)
        item = self._align_rhs(sr, index, item)
        result = sr._frame.scatter_by_boolean_mask(row_loc, index, item)
    elif isinstance(row_loc, slice):
        if row_loc == slice(None):
            index = sr._frame._index
            item = self._align_rhs(sr, index, item)
            result = item
        else:
            (index, bounds) = sr._frame.slice_index_by_slice(row_loc, True)
            item = self._align_rhs(sr, index, item)
            result = sr._frame.scatter_by_slice(index, bounds, item)
    else:
        row_loc = sr._ensure_valid_frame(row_loc)
        _, row_loc = sr._align_frame(row_loc, join="left", axis=0)
        if not row_loc._is_series:
            raise ValueError("indexer must be 1-dimensional")
        if not is_bool_dtype(row_loc.dtype):
            raise err._unsupported_error(
                "only boolean indexers are supported now"
            )
        row_loc = row_loc._frame
        index = sr._frame.slice_index_by_boolean_mask(row_loc)
        item = self._align_rhs(sr, index, item)
        result = sr._frame.scatter_by_boolean_mask(row_loc, index, item)

    self.update_column(result)
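# Write-path sketch with plain pandas: scalar-label, slice, and boolean
# mask assignments map onto the three branches above.
import pandas as pd

s = pd.Series([1, 2, 3], index=["a", "b", "c"])
s.loc["a"] = 10      # scalar-label path (mask where index == label)
s.loc["a":"b"] = 0   # slice path
s.loc[s == 0] = -1   # boolean-mask path
print(s)             # a=-1, b=-1, c=3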