def replace(self, to_replace, value):
    """
    Replace values given in *to_replace* with *value*.

    Parameters
    ----------
    to_replace : numeric, str or list-like
        Value(s) to replace.

        * numeric or str:
            - values equal to *to_replace* will be replaced
              with *value*
        * list of numeric or str:
            - If *value* is also list-like, *to_replace* and
              *value* must be of same length.
    value : numeric, str, list-like, or dict
        Value(s) to replace `to_replace` with.

    See also
    --------
    Series.fillna

    Returns
    -------
    result : Series
        Series after replacement. The mask and index are preserved.
    """
    if not is_scalar(to_replace):
        if is_scalar(value):
            value = utils.scalar_broadcast_to(
                value, (len(to_replace),), np.dtype(type(value))
            )
    else:
        if not is_scalar(value):
            raise TypeError(
                "Incompatible types '{}' and '{}' "
                "for *to_replace* and *value*.".format(
                    type(to_replace).__name__, type(value).__name__
                )
            )
        to_replace = [to_replace]
        value = [value]

    if len(to_replace) != len(value):
        raise ValueError(
            "Replacement lists must be of same length. "
            "Expected {}, got {}.".format(len(to_replace), len(value))
        )

    if is_dict_like(to_replace) or is_dict_like(value):
        raise TypeError("Dict-like args not supported in Series.replace()")

    result = self._column.find_and_replace(to_replace, value)

    return self._copy_construct(data=result)
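
# A minimal usage sketch of the replace() semantics documented above. Plain
# pandas is used as a stand-in, since the behavior shown is the shared
# pandas-compatible contract (the helper name below is illustrative):
def _example_replace():
    import pandas as pd

    s = pd.Series([0, 1, 2, 1])
    # scalar -> scalar
    assert s.replace(1, 5).tolist() == [0, 5, 2, 5]
    # list of values -> single scalar (the scalar is broadcast)
    assert s.replace([0, 2], -1).tolist() == [-1, 1, -1, 1]
    # list -> list: both must have the same length, mapped element-wise
    assert s.replace([0, 2], [10, 20]).tolist() == [10, 1, 20, 1]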
def __call__(self, arg):
    if is_scalar(arg):
        ret = pd.to_datetime(
            arg, errors=self._errors, dayfirst=self._dayfirst,
            yearfirst=self._yearfirst, utc=self._utc,
            format=self._format, exact=self._exact, unit=self._unit,
            infer_datetime_format=self._infer_datetime_format,
            origin=self._origin, cache=self._cache)
        return astensor(ret)

    dtype = np.datetime64(1, 'ns').dtype
    if isinstance(arg, (pd.Series, SERIES_TYPE)):
        arg = asseries(arg)
        return self.new_series([arg], shape=arg.shape, dtype=dtype,
                               index_value=arg.index_value,
                               name=arg.name)
    if is_dict_like(arg) or isinstance(arg, DATAFRAME_TYPE):
        arg = asdataframe(arg)
        columns = arg.columns_value.to_pandas().tolist()
        if sorted(columns) != sorted(['year', 'month', 'day']):
            missing = ','.join(c for c in ['day', 'month', 'year']
                               if c not in columns)
            raise ValueError(
                'to assemble mappings requires at least '
                f'that [year, month, day] be specified: [{missing}] is missing'
            )
        return self.new_series([arg], shape=(arg.shape[0],), dtype=dtype,
                               index_value=arg.index_value)
    elif isinstance(arg, (pd.Index, INDEX_TYPE)):
        arg = asindex(arg)
        return self.new_index([arg], shape=arg.shape, dtype=dtype,
                              index_value=parse_index(
                                  pd.Index([], dtype=dtype),
                                  self._params, arg),
                              name=arg.name)
    else:
        arg = astensor(arg)
        if arg.ndim != 1:
            raise TypeError('arg must be a string, datetime, '
                            'list, tuple, 1-d tensor, or Series')
        return self.new_index([arg], shape=arg.shape, dtype=dtype,
                              index_value=parse_index(
                                  pd.Index([], dtype=dtype),
                                  self._params, arg))
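
# A sketch of the assembly path above: pandas.to_datetime builds timestamps
# from 'year'/'month'/'day' columns and raises when one is missing, which is
# the check the dict/DataFrame branch mirrors (plain pandas shown; the
# function name is illustrative):
def _example_to_datetime_assembly():
    import pandas as pd

    df = pd.DataFrame({'year': [2020, 2021], 'month': [1, 6], 'day': [31, 15]})
    out = pd.to_datetime(df)
    assert out[0] == pd.Timestamp('2020-01-31')

    try:
        # 'day' is missing, so assembly fails
        pd.to_datetime(pd.DataFrame({'year': [2020], 'month': [1]}))
    except ValueError:
        pass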
def _write_one_pair(key, value):
    if is_scalar(value):
        if type(value).__module__ == 'numpy':
            value = value.item()
        scalar_dict[key] = value
    elif isinstance(value, np.ndarray):
        self.write_array(sub_group, key, value)
    elif isinstance(value, pd.DataFrame):
        self.write_dataframe(sub_group, key, value)
    elif is_dict_like(value):
        self.write_mapping(sub_group, key, value)
    elif issparse(value):
        assert isinstance(value, csr_matrix)
        self.write_csr(sub_group, key, value)
    else:
        # assume value is either a list or a tuple, converting it to np.ndarray
        self.write_array(
            sub_group, key,
            value.astype(str) if is_categorical_dtype(value) else np.array(value))
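
# A small sketch of the scalar branch above: numpy scalars are unboxed with
# .item() before being stored in scalar_dict, so plain Python values end up
# in the output (function name is illustrative):
def _example_numpy_scalar_unboxing():
    import numpy as np

    v = np.int64(3)
    assert type(v).__module__ == 'numpy'
    assert isinstance(v.item(), int)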
def convert_agg_func(agg_func):
    if isinstance(agg_func, str):
        if agg_func not in _SUPPORTED_AGGS:
            raise err._unsupported_error(
                f"Unsupported aggregation method: {agg_func}")
        return (agg_func, _NUMERIC_ONLY[agg_func])
    elif is_dict_like(agg_func):
        converted = {}
        for col, func in agg_func.items():
            funcs = util.to_list_if_scalar(convert_agg_func(func))
            converted[col] = funcs
        return converted
    elif is_list_like(agg_func):
        return [convert_agg_func(func) for func in agg_func]
    else:
        raise err._unsupported_error(
            f"Unsupported aggregation descriptor: {agg_func}")
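
# A standalone sketch of the normalization convert_agg_func performs. The
# real _SUPPORTED_AGGS/_NUMERIC_ONLY tables are module state not shown here;
# _DEMO_NUMERIC_ONLY below is a hypothetical stand-in:
_DEMO_NUMERIC_ONLY = {"sum": True, "max": True, "count": False}

def _demo_convert(agg_func):
    if isinstance(agg_func, str):
        # a single name becomes a (name, numeric_only) pair
        return (agg_func, _DEMO_NUMERIC_ONLY[agg_func])
    if isinstance(agg_func, dict):
        # per-column descriptors become lists of pairs
        return {col: [_demo_convert(f)] if isinstance(f, str)
                else [_demo_convert(g) for g in f]
                for col, f in agg_func.items()}
    return [_demo_convert(f) for f in agg_func]

assert _demo_convert("sum") == ("sum", True)
assert _demo_convert(["sum", "max"]) == [("sum", True), ("max", True)]
assert _demo_convert({"x": "count", "y": ["sum", "max"]}) == \
    {"x": [("count", False)], "y": [("sum", True), ("max", True)]}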
def format_index(self, formatter):
    """
    Format the text display value of index.

    .. versionadded:: 0.18.0

    Parameters
    ----------
    formatter : str, callable, or dict

    Returns
    -------
    self : Styler

    Notes
    -----
    ``formatter`` is either an ``a`` or a dict ``{index name: a}``
    where ``a`` is one of

    - str: this will be wrapped in: ``a.format(x)``
    - callable: called with the value of an individual cell

    The default display value for index is ``str(index)``.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'a': range(3), 'b': range(3)}, index=['c', 'd', 'e'])
    >>> styler = df.style.format_index({'d': lambda x: f'Index {x}'})
    >>> styler.render()
    """
    if is_dict_like(formatter):
        for index, index_formatter in formatter.items():
            index_formatter = _maybe_wrap_formatter(index_formatter)
            index_num = self.data.index.get_loc(index)
            self._display_index_funcs[index_num] = index_formatter
    else:
        for index_num in range(len(self.data)):
            index_formatter = _maybe_wrap_formatter(formatter)
            self._display_index_funcs[index_num] = index_formatter
    return self
def __init__(
    self,
    data=None,
    index=None,
    columns=None,
    dtype=None,
    copy=False,
    frame=None,
):
    # TODO: We would want to hide the frame argument from the users,
    #       as it is intended only for internal uses
    if frame is not None:
        assert index is None
        assert dtype is None
        assert columns is not None
        assert len(columns) == len(frame._columns)
        self._frame = frame
        self._set_columns(columns)
    elif isinstance(data, type(self)):
        self._construct_from_dataframe(data, index, columns, dtype, copy)
    elif isinstance(data, Frame):
        self._construct_from_series(data, index, columns, dtype, copy)
    elif (not _is_pandas_container(data)
          and is_dict_like(data) and len(data) > 0):
        if all(isinstance(val, Frame) for val in data.values()):
            self._construct_from_frames(data, index, columns, dtype, copy)
        elif all(hasattr(val, "__legate_data_interface__")
                 for val in data.values()):
            self._construct_from_legate_containers(
                data, index, columns, dtype, copy)
        else:
            self._construct_fallback(data, index, columns, dtype, copy)
    else:
        self._construct_fallback(data, index, columns, dtype, copy)

    assert self._columns is not None
def format(self, formatter, subset=None):
    """
    Format the text display value of cells.

    .. versionadded:: 0.18.0

    Parameters
    ----------
    formatter : str, callable, or dict
    subset : IndexSlice
        An argument to ``DataFrame.loc`` that restricts which elements
        ``formatter`` is applied to.

    Returns
    -------
    self : Styler

    Notes
    -----
    ``formatter`` is either an ``a`` or a dict ``{column name: a}``
    where ``a`` is one of

    - str: this will be wrapped in: ``a.format(x)``
    - callable: called with the value of an individual cell

    The default display value for numeric values is the "general" (``g``)
    format with ``pd.options.display.precision`` precision.

    Examples
    --------
    >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
    >>> df.style.format("{:.2%}")
    >>> df['c'] = ['a', 'b', 'c', 'd']
    >>> df.style.format({'c': str.upper})
    """
    if subset is None:
        row_locs = range(len(self.data))
        col_locs = range(len(self.data.columns))
    else:
        subset = _non_reducing_slice(subset)
        if len(subset) == 1:
            subset = subset, self.data.columns

        sub_df = self.data.loc[subset]
        row_locs = self.data.index.get_indexer_for(sub_df.index)
        col_locs = self.data.columns.get_indexer_for(sub_df.columns)

    if is_dict_like(formatter):
        for col, col_formatter in formatter.items():
            # formatter must be callable, so '{}' are converted to lambdas
            col_formatter = _maybe_wrap_formatter(col_formatter)
            col_num = self.data.columns.get_indexer_for([col])[0]

            for row_num in row_locs:
                self._display_funcs[(row_num, col_num)] = col_formatter
    else:
        # single scalar to format all cells with
        locs = product(*(row_locs, col_locs))
        for i, j in locs:
            formatter = _maybe_wrap_formatter(formatter)
            self._display_funcs[(i, j)] = formatter

    return self
def rename_categories(
    self, new_categories: Union[list, dict, Callable], inplace: bool = False
) -> Optional["ps.Series"]:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable
        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from old categories to new.
          Categories not contained in the mapping are passed through and
          extra categories in the mapping are ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy
        of this categorical with renamed categories.

        .. deprecated:: 3.2.0

    Returns
    -------
    cat : Series or None
        Categorical with renamed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> s = ps.Series(["a", "a", "b"], dtype="category")
    >>> s.cat.rename_categories([0, 1])  # doctest: +SKIP
    0    0
    1    0
    2    1
    dtype: category
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> s.cat.rename_categories({'a': 'A', 'c': 'C'})  # doctest: +SKIP
    0    A
    1    A
    2    b
    dtype: category
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> s.cat.rename_categories(lambda x: x.upper())  # doctest: +SKIP
    0    A
    1    A
    2    B
    dtype: category
    Categories (2, object): ['A', 'B']
    """
    from pyspark.pandas.frame import DataFrame

    if inplace:
        warnings.warn(
            "The `inplace` parameter in rename_categories is deprecated "
            "and will be removed in a future version.",
            FutureWarning,
        )

    if is_dict_like(new_categories):
        categories = [cast(dict, new_categories).get(item, item) for item in self.categories]
    elif callable(new_categories):
        categories = [new_categories(item) for item in self.categories]
    elif is_list_like(new_categories):
        if len(self.categories) != len(new_categories):
            raise ValueError(
                "new categories need to have the same number of items as the old categories!"
            )
        categories = cast(list, new_categories)
    else:
        raise TypeError("new_categories must be list-like, dict-like or callable.")

    internal = self._data._psdf._internal.with_new_spark_column(
        self._data._column_label,
        self._data.spark.column,
        field=self._data._internal.data_fields[0].copy(
            dtype=CategoricalDtype(categories=categories, ordered=self.ordered)
        ),
    )

    if inplace:
        self._data._psdf._update_internal_frame(internal)
        return None
    else:
        return DataFrame(internal)._psser_for(self._data._column_label).copy()
def _groupby_reduce(self, ops=None):
    columns = self._df._get_columns()
    dtypes = self._df._frame.dtypes

    valid_ops = {}
    valid_columns = []

    size_reduction = len(ops) == 1 and ops[0][0] == "size"

    # If the ops are given as a list, apply them across all the columns
    # with compatible data types
    if isinstance(ops, list):
        key_indices = set(self._keys)
        for col_idx, col in enumerate(columns):
            if col_idx in key_indices:
                continue
            if reduction.incompatible_ops(ops, dtypes[col_idx]):
                continue
            valid_ops[col_idx] = [desc[0] for desc in ops]
            valid_columns.append(col)

            # Special case with the size reduction, which produces a single
            # output regardless of the number of input columns
            if size_reduction:
                break

    # If the ops are passed in a dictionary, it also specifies the input
    # columns on which the aggregations are performed
    else:
        assert is_dict_like(ops)
        for col, descs in ops.items():
            col_idx = columns.get_indexer_for([col])
            if len(col_idx) > 1:
                raise KeyError(f"ambiguous column name {col}")
            if col_idx[0] == -1:
                raise KeyError(col)
            if reduction.incompatible_ops(descs, dtypes[col_idx[0]]):
                continue
            valid_ops[col_idx[0]] = [desc[0] for desc in descs]
            valid_columns.append(col)

    frame = self._df._frame.groupby_reduce(
        self._keys, valid_ops, self._method, self._sort)

    # If more than one aggregation is requested for a column,
    # the output column names should use MultiIndex
    multi_aggs = any(len(set(ops)) > 1 for ops in valid_ops.values())

    def _generate_columns(columns, all_ops):
        if multi_aggs:
            from pandas import MultiIndex

            pairs = []
            for idx, ops in all_ops.items():
                pairs.extend([(columns[idx], op) for op in ops])
            index = MultiIndex.from_tuples(pairs)
            if self._is_series_groupby:
                index = index.droplevel(0)
            return index
        else:
            from pandas import Index

            return Index([columns[idx] for idx in all_ops.keys()])

    from .dataframe import DataFrame

    if self._as_index:
        # Groupby keys are rearranged to come first in the frame,
        # no matter where they were in the input frame, so the
        # indexer should be picking the first N keys in the frame,
        # where N is the number of keys
        indexer = list(range(len(self._keys)))
        index_columns = frame.select_columns(indexer)

        # However, the index names are derived from the input
        # dataframe, which is not rearranged, so we use the original
        # indexer to select the names
        index_names = columns[self._keys]

        value_names = _generate_columns(columns, valid_ops)

        # Once we find the index columns, we drop them from the frame
        frame = frame.drop_columns(indexer)
        frame = frame.set_index(index_columns, index_names)

        if size_reduction or (self._is_series_groupby and not multi_aggs):
            # Size reduction always produces a series
            from .series import Series

            return Series(frame=frame, name=value_names[0])
        else:
            return DataFrame(frame=frame, columns=value_names)
    else:
        # Index levels don't survive in the output when as_index is False
        levels = set(self._levels)
        keys = [key for key in self._keys if key not in levels]
        key_names = columns[keys]
        value_names = _generate_columns(columns, valid_ops)

        # If the column names are stored in a MultiIndex,
        # we should extend the key names to match the shape
        if multi_aggs:
            from pandas import MultiIndex

            key_names = MultiIndex.from_arrays(
                [key_names, [""] * len(key_names)])
        value_names = key_names.append(value_names)

        frame = frame.drop_columns(self._levels)
        return DataFrame(frame=frame, columns=value_names)
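
# A sketch of the column-naming rule implemented above, shown with plain
# pandas: more than one aggregation per column produces MultiIndex columns,
# while a single aggregation keeps flat names (function name is illustrative):
def _example_groupby_column_names():
    import pandas as pd

    df = pd.DataFrame({"k": [0, 0, 1], "x": [1.0, 2.0, 3.0]})

    flat = df.groupby("k").agg({"x": "sum"})
    assert list(flat.columns) == ["x"]

    multi = df.groupby("k").agg({"x": ["sum", "max"]})
    assert list(multi.columns) == [("x", "sum"), ("x", "max")]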
def fillna(
    self,
    value=None,
    method=None,
    axis=None,
    inplace=False,
    limit=None,
    downcast=None,
):
    axis = self._get_axis_number(axis, 0)
    inplace = validate_bool_kwarg(inplace, "inplace")

    if axis not in (0,):
        raise err._unsupported_error("axis", axis)

    if value is None and method is None:
        raise ValueError("must specify a fill method or value")
    if value is not None and method is not None:
        raise ValueError("cannot specify both a fill method and value")

    # Checks on method
    if method is not None:
        raise err._unsupported_error("method", method)
    if method is not None and method not in [
        "backfill",
        "bfill",
        "pad",
        "ffill",
    ]:
        expecting = "pad (ffill) or backfill (bfill)"
        msg = "Invalid fill method. Expecting {expecting}. Got {method}"
        msg = msg.format(expecting=expecting, method=method)
        raise ValueError(msg)

    # Checks on limit
    if limit is not None:
        raise err._unsupported_error("limit", limit)
    if limit is not None:
        if not isinstance(limit, int):
            raise ValueError("Limit must be an integer")
        elif limit <= 0:
            raise ValueError("Limit must be greater than 0")

    # Checks on value
    if isinstance(value, (list, tuple)):
        raise TypeError(
            "'value' parameter must be a scalar or dict, but "
            f"you passed a {type(value).__name__}")

    if is_scalar(value):
        values = {}
        for idx in range(len(self._get_columns())):
            values[idx] = util.sanitize_scalar(value)
    elif is_dict_like(value):
        if self._is_series:
            raise err._unsupported_error(
                "'value' cannot be a dict for series")
        values = {}
        for col, val in value.items():
            if not is_scalar(val):
                raise err._unsupported_error(
                    "'value' must be a dict of scalars for now")
            idxr = self.columns.get_indexer_for([col])
            if idxr[0] != -1:
                values[idxr[0]] = util.sanitize_scalar(val)

    new_frame = self._frame.fillna(values)
    return self._create_or_update_frame(new_frame, inplace)
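
# A sketch of the accepted 'value' forms above, using plain pandas for the
# shared semantics: scalars broadcast to every column, dicts fill per column
# (labels missing from the frame are ignored), and lists are rejected
# (function name is illustrative):
def _example_fillna_values():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})

    assert df.fillna(0).isna().sum().sum() == 0          # scalar broadcast
    assert df.fillna({"a": -1})["b"].isna().sum() == 1   # only 'a' is filled

    try:
        df.fillna([0, 1])  # lists are not a valid 'value'
    except TypeError:
        pass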
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    parse_dates=False,
    compression="infer",
    quotechar='"',
    quoting=0,
    doublequote=True,
    verify_header=False,
    **kwargs
    # TODO: Put back these options once we figure out how to support them
    #       with the Arrow CSV reader.
    # skipinitialspace=False,  # GPU only
    # keep_default_na=True,  # GPU only
    # na_filter=True,  # GPU only
    # dayfirst=False,  # GPU only
    # thousands=None,  # GPU only
    # decimal=".",  # GPU only
    # lineterminator=None,  # GPU only
    # comment=None,  # GPU only
    # delim_whitespace=False,  # GPU only
):
    # Checks on filepath_or_buffer
    paths = util.to_list_if_scalar(filepath_or_buffer)
    if any(not isinstance(path, str) for path in paths):
        raise err._unsupported_error(
            "'filepath_or_buffer' must be a string or a list of strings")
    if len(paths) == 0:
        raise ValueError("'filepath_or_buffer' must be a non-empty list")
    for path in paths:
        if not os.path.exists(path):
            raise ValueError(f"{path} does not exist")

    if not isinstance(compression, str):
        raise err._unsupported_error("compression", compression)
    compressions = [
        _parse_compression(infer_compression(path, compression))
        for path in paths
    ]

    # Checks on sep and delimiter
    if sep is None and delimiter is None:
        raise ValueError("at least one of 'sep' or 'delimiter' must be given")
    sep = delimiter if delimiter is not None else sep
    if len(sep) > 1:
        raise ValueError("'sep' must be a 1-character string")

    # Checks on header
    if header == "infer":
        header = 0 if names is None else None
    if header not in (
        0,
        None,
    ):
        raise err._unsupported_error("header", header)

    # Checks on skiprows, skipfooter, and nrows
    skiprows = 0 if skiprows is None else skiprows
    if not is_integer(skiprows):
        raise ValueError("'skiprows' must be an integer")
    if not is_integer(skipfooter):
        raise ValueError("'skipfooter' must be an integer")
    if not (nrows is None or is_integer(nrows)):
        raise ValueError("'nrows' must be None or an integer")

    # If either column names or dtype is missing, infer them by parsing
    # the first few lines using Pandas
    # FIXME: We should use cuDF for this
    if names is None or dtype is None:
        engine = "python" if skipfooter > 0 else "c"
        column_names, dtypes = _extract_header_using_pandas(
            paths[0],
            sep,
            header,
            names,
            dtype,
            true_values,
            false_values,
            skiprows,
            na_values,
            skip_blank_lines,
            parse_dates,
            compression,
            quotechar,
            quoting,
            doublequote,
            engine,
            peek_rows=3,
        )
        if verify_header:
            for path in paths[1:]:
                result = _extract_header_using_pandas(
                    path,
                    sep,
                    header,
                    names,
                    dtype,
                    true_values,
                    false_values,
                    skiprows,
                    na_values,
                    skip_blank_lines,
                    parse_dates,
                    compression,
                    quotechar,
                    quoting,
                    doublequote,
                    engine,
                    peek_rows=3,
                )
                if not column_names.equals(result[0]):
                    raise ValueError(
                        f"{paths[0]} and {path} have different headers")
    else:
        column_names = pandas.Index(names)
        if is_dict_like(dtype):
            dtypes = []
            for name in names:
                if name not in dtype:
                    raise ValueError(f"'dtype' has no entry for '{name}'")
                dtypes.append(_ensure_dtype(dtype[name]))
        elif is_list_like(dtype):
            raise err._unsupported_error(
                "'dtype' must be a string, a dtype, or a dictionary")
        else:
            dtype = _ensure_dtype(dtype)
            dtypes = [dtype] * len(names)

    if column_names.has_duplicates:
        raise ValueError("Header must not have any duplicates")

    # Checks on unsupported options
    if prefix is not None:
        raise err._unsupported_error("prefix", prefix)
    if mangle_dupe_cols not in (True,):
        raise err._unsupported_error("mangle_dupe_cols", mangle_dupe_cols)

    # If there was a header in the file, we should skip that line as well
    if header == 0:
        skiprows += 1

    # Checks on parse_dates
    _ERR_MSG_PARSE_DATES = (
        "'parse_dates' must be a list of integers or strings for now")

    if is_dict_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    parse_dates = parse_dates if parse_dates is not False else []
    if not is_list_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)
    date_cols = _get_indexer(column_names, parse_dates, "parse_dates")

    # Override dtypes for the datetime columns
    for idx in date_cols:
        dtypes[idx] = ty.ts_ns

    # If a column is given a datetime dtype but is not listed in
    # parse_dates, we should record it as a datetime column as well
    for idx, dtype in enumerate(dtypes):
        if dtype == ty.ts_ns and idx not in date_cols:
            date_cols.append(idx)

    # Checks on quoting
    if quoting != 0:
        raise err._unsupported_error("quoting", quoting)
    if len(quotechar) > 1:
        raise ValueError("'quotechar' must be a 1-character string")

    # Checks on index_col
    index_col = None if index_col is False else index_col
    if index_col is not None:
        if is_integer(index_col) or isinstance(index_col, str):
            index_col = [index_col]
        if not is_list_like(index_col):
            raise err._unsupported_error("index_col", index_col)
        index_col = _get_indexer(column_names, index_col, "index_col")

    # Checks on true_values, false_values, and na_values
    _check_string_list(true_values, "true_values")
    _check_string_list(false_values, "false_values")
    _check_string_list(na_values, "na_values")

    # Checks on nrows
    if skipfooter != 0 and nrows is not None:
        raise ValueError("'skipfooter' not supported with 'nrows'")

    df = DataFrame(
        frame=io.read_csv(
            paths,
            sep=sep,
            usecols=usecols,
            dtypes=dtypes,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            skipfooter=skipfooter,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=skip_blank_lines,
            date_cols=date_cols,
            compressions=compressions,
            quotechar=quotechar,
            quoting=quoting,
            doublequote=doublequote,
        ),
        columns=column_names,
    )

    if index_col is not None:
        df = df.set_index(column_names[index_col])
        # Make sure we reset the names for unnamed indices
        names = df._raw_index.names
        names = [
            None if name.startswith("Unnamed") else name for name in names
        ]
        df._raw_index.names = names

    return df