def coo_to_sparse_series(A, dense_index: bool = False):
    """
    Convert a scipy.sparse.coo_matrix into a Series with sparse values.

    Parameters
    ----------
    A : scipy.sparse.coo.coo_matrix
    dense_index : bool, default False
        If True, reindex onto the full cartesian product of row/column
        positions so every cell of the matrix is represented.

    Returns
    -------
    Series

    Raises
    ------
    TypeError if A is not a coo_matrix
    """
    from pandas import SparseDtype

    try:
        ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    except AttributeError as err:
        # Anything without .data/.row/.col is not a COO matrix.
        raise TypeError(
            f"Expected coo_matrix. Got {type(A).__name__} instead.") from err

    ser = ser.sort_index()
    ser = ser.astype(SparseDtype(ser.dtype))

    if dense_index:
        # is there a better constructor method to use here?
        full_index = MultiIndex.from_product(
            [range(A.shape[0]), range(A.shape[1])]
        )
        ser = ser.reindex(full_index)
    return ser
def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) levs = [ [lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) ] # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) unique_tuples = (key for key, _ in itertools.groupby(tuples)) new_levs = zip(*unique_tuples) # The dtype of each level must be explicitly set to avoid inferring the wrong type. # See GH-36991. return MultiIndex.from_arrays( [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev for new_lev, lev in zip(new_levs, columns.levels) ], names=columns.names[:-1], )
def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame":
    """
    Return reshaped DataFrame organized by the given index/column values.

    Parameters
    ----------
    data : DataFrame
    index : column label, optional
        Key to use as the result's index.  If None, the existing index is
        used; each level of a MultiIndex is preserved separately (GH 23955).
    columns : column label
        Key whose values become the result's columns.  Required.
    values : column label or list of labels, optional
        Column(s) used to populate the result.  If None the remaining
        columns are used and the result gets hierarchical columns.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``columns`` is None.
    """
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    if values is None:
        cols = [columns] if index is None else [index, columns]
        append = index is None
        indexed = data.set_index(cols, append=append)
    else:
        if index is None:
            if isinstance(data.index, MultiIndex):
                # GH 23955: keep each level of an existing MultiIndex separate
                # instead of collapsing it into a single level of tuples.
                index_arrays = [
                    data.index.get_level_values(i)
                    for i in range(data.index.nlevels)
                ]
            else:
                index_arrays = [data.index]
        else:
            index_arrays = [data[index]]
        mi = MultiIndex.from_arrays(index_arrays + [data[columns]])
        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            indexed = data._constructor(data[values].values, index=mi, columns=values)
        else:
            indexed = data._constructor_sliced(data[values].values, index=mi)
    return indexed.unstack(columns)
def pivot(
    data: DataFrame,
    index: IndexLabel | None = None,
    columns: IndexLabel | None = None,
    values: IndexLabel | None = None,
) -> DataFrame:
    """
    Return reshaped DataFrame organized by the given index/column values.

    ``columns`` is required; ``index`` defaults to the existing index
    (preserving every level of a MultiIndex, GH 23955) and ``values``
    defaults to all remaining columns.
    """
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    columns_listlike = com.convert_to_list_like(columns)

    indexed: DataFrame | Series
    if values is None:
        append = index is None
        cols = [] if index is None else com.convert_to_list_like(index)
        # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
        # error: Unsupported left operand type for + ("ExtensionArray")
        indexed = data.set_index(
            cols + columns_listlike, append=append  # type: ignore[operator]
        )
    else:
        if index is not None:
            index_list = [data[idx] for idx in com.convert_to_list_like(index)]
        elif isinstance(data.index, MultiIndex):
            # GH 23955
            index_list = [
                data.index.get_level_values(i) for i in range(data.index.nlevels)
            ]
        else:
            index_list = [Series(data.index, name=data.index.name)]

        index_list.extend(data[col] for col in columns_listlike)
        multiindex = MultiIndex.from_arrays(index_list)

        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            values = cast(Sequence[Hashable], values)
            indexed = data._constructor(
                data[values]._values, index=multiindex, columns=values
            )
        else:
            indexed = data._constructor_sliced(data[values]._values, index=multiindex)
    # error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union
    # [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected
    # "Hashable"
    return indexed.unstack(columns_listlike)  # type: ignore[arg-type]
def ensure_key_mapped_multiindex(index, key: Callable, level=None):
    """
    Apply ``key`` to the requested levels of a MultiIndex, returning a new
    MultiIndex.  Used for key sorting of MultiIndex objects.

    Parameters
    ----------
    index : MultiIndex
        Index whose levels the key function is applied to.
    key : Callable
        Function taking an Index and returning an Index of the same shape;
        called once per selected level (the level name can be used inside
        the key to tell levels apart).
    level : list-like, int or str, default None
        Level(s) to transform.  None means every level; all other levels
        pass through unchanged.

    Returns
    -------
    labels : MultiIndex
        Resulting MultiIndex with modified levels.
    """
    from pandas.core.indexes.api import MultiIndex

    if level is None:
        sort_levels = list(range(index.nlevels))  # satisfies mypy
    else:
        levels_arg = [level] if isinstance(level, (str, int)) else level
        # Normalize names/positions to integer level numbers.
        sort_levels = [index._get_level_number(lev) for lev in levels_arg]

    mapped = []
    for lvl in range(index.nlevels):
        level_values = index._get_level_values(lvl)
        if lvl in sort_levels:
            level_values = ensure_key_mapped(level_values, key)
        mapped.append(level_values)

    return MultiIndex.from_arrays(mapped)
def pivot(
    data: DataFrame,
    index: IndexLabel | None = None,
    columns: IndexLabel | None = None,
    values: IndexLabel | None = None,
) -> DataFrame:
    """
    Return reshaped DataFrame organized by the given index/column values.

    Parameters
    ----------
    data : DataFrame
    index : column label or list of labels, optional
        Keys to use as the result's index.  If None the existing index is
        used; each level of a MultiIndex is preserved separately (GH 23955).
    columns : column label or list of labels
        Keys whose values become the result's columns.  Required.
    values : column label or list of labels, optional
        Column(s) used to populate the result.  If None the remaining
        columns are used and the result gets hierarchical columns.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``columns`` is None.
    """
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    columns = com.convert_to_list_like(columns)

    if values is None:
        if index is not None:
            cols = com.convert_to_list_like(index)
        else:
            cols = []

        append = index is None
        # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
        # error: Unsupported left operand type for + ("ExtensionArray")
        indexed = data.set_index(
            cols + columns, append=append  # type: ignore[operator]
        )
    else:
        if index is None:
            if isinstance(data.index, MultiIndex):
                # GH 23955: preserve each level of an existing MultiIndex
                # instead of collapsing it into one level of tuples.
                index = [
                    data.index.get_level_values(i)
                    for i in range(data.index.nlevels)
                ]
            else:
                index = [Series(data.index, name=data.index.name)]
        else:
            index = com.convert_to_list_like(index)
            index = [data[idx] for idx in index]

        data_columns = [data[col] for col in columns]
        index.extend(data_columns)
        index = MultiIndex.from_arrays(index)

        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            values = cast(Sequence[Hashable], values)
            indexed = data._constructor(
                data[values]._values, index=index, columns=values
            )
        else:
            indexed = data._constructor_sliced(data[values]._values, index=index)
    return indexed.unstack(columns)
def pivot(
    data: "DataFrame",
    index: Optional[Union[Label, Sequence[Label]]] = None,
    columns: Optional[Union[Label, Sequence[Label]]] = None,
    values: Optional[Union[Label, Sequence[Label]]] = None,
) -> "DataFrame":
    """
    Return reshaped DataFrame organized by the given index/column values.

    Parameters
    ----------
    data : DataFrame
    index : label or sequence of labels, optional
        Keys to use as the result's index.  If None the existing index is
        used; each level of a MultiIndex is preserved separately (GH 23955).
    columns : label or sequence of labels
        Keys whose values become the result's columns.  Required.
    values : label or sequence of labels, optional
        Column(s) used to populate the result.  If None the remaining
        columns are used and the result gets hierarchical columns.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``columns`` is None.
    """
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    columns = com.convert_to_list_like(columns)

    if values is None:
        if index is not None:
            cols = com.convert_to_list_like(index)
        else:
            cols = []
        cols.extend(columns)

        append = index is None
        indexed = data.set_index(cols, append=append)
    else:
        if index is None:
            if isinstance(data.index, MultiIndex):
                # GH 23955: preserve each level of an existing MultiIndex
                # instead of collapsing it into one level of tuples.
                index = [
                    data.index.get_level_values(i)
                    for i in range(data.index.nlevels)
                ]
            else:
                index = [Series(data.index, name=data.index.name)]
        else:
            index = com.convert_to_list_like(index)
            index = [data[idx] for idx in index]

        data_columns = [data[col] for col in columns]
        index.extend(data_columns)
        index = MultiIndex.from_arrays(index)

        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            values = cast(Sequence[Label], values)
            indexed = data._constructor(
                data[values]._values, index=index, columns=values
            )
        else:
            indexed = data._constructor_sliced(data[values]._values, index=index)
    return indexed.unstack(columns)
def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame":
    """
    Return reshaped DataFrame organized by the given index/column values.

    Parameters
    ----------
    data : DataFrame
    index : label or list of labels, optional
        Keys to use as the result's index.  If None the existing index is
        used; each level of a MultiIndex is preserved separately (GH 23955).
    columns : label or list of labels
        Keys whose values become the result's columns.  Required.
    values : label or list of labels, optional
        Column(s) used to populate the result.  If None the remaining
        columns are used and the result gets hierarchical columns.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``columns`` is None.
    """
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    columns = columns if is_list_like(columns) else [columns]

    if values is None:
        cols: List[str] = []
        if index is None:
            pass
        elif is_list_like(index):
            cols = list(index)
        else:
            cols = [index]
        cols.extend(columns)

        append = index is None
        indexed = data.set_index(cols, append=append)
    else:
        if index is None:
            if isinstance(data.index, MultiIndex):
                # GH 23955: preserve each level of an existing MultiIndex
                # instead of collapsing it into one level of tuples.
                index = [
                    data.index.get_level_values(i)
                    for i in range(data.index.nlevels)
                ]
            else:
                index = [Series(data.index, name=data.index.name)]
        elif is_list_like(index):
            index = [data[idx] for idx in index]
        else:
            index = [data[index]]

        data_columns = [data[col] for col in columns]
        index.extend(data_columns)
        index = MultiIndex.from_arrays(index)

        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            indexed = data._constructor(data[values].values, index=index, columns=values)
        else:
            indexed = data._constructor_sliced(data[values].values, index=index)
    return indexed.unstack(columns)
def __internal_pivot_table(
    data: DataFrame,
    values,
    index,
    columns,
    aggfunc: AggFuncTypeBase | AggFuncTypeDict,
    fill_value,
    margins: bool,
    dropna: bool,
    margins_name: str,
    observed: bool,
    sort: bool,
) -> DataFrame:
    """
    Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.

    ``index`` and ``columns`` arrive here as lists of grouping keys (they are
    concatenated below), ``values`` is an optional label or list of labels,
    and the remaining arguments are forwarded from ``pivot_table``.
    Returns the aggregated, unstacked table as a DataFrame.
    """
    # Row keys come first so the column keys can be unstacked further down.
    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if is_list_like(values):
            values_multi = True
            values = list(values)
        else:
            values_multi = False
            values = [values]

        # GH14938 Make sure value labels are in data
        for i in values:
            if i not in data:
                raise KeyError(i)

        # Restrict ``data`` to the columns actually used (keys + values);
        # Grouper objects contribute their ``.key`` label, unhashable
        # entries are skipped.
        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    else:
        # No explicit values: aggregate every column that is not a key.
        values = data.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError, KeyError):
                pass
        values = list(values)

    grouped = data.groupby(keys, observed=observed, sort=sort)
    agged = grouped.agg(aggfunc)

    if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
        agged = agged.dropna(how="all")

        # gh-21133
        # we want to down cast if
        # the original values are ints
        # as we grouped with a NaN value
        # and then dropped, coercing to floats
        for v in values:
            if (
                v in data
                and is_integer_dtype(data[v])
                and v in agged
                and not is_integer_dtype(agged[v])
            ):
                if not isinstance(agged[v], ABCDataFrame):
                    # exclude DataFrame case bc maybe_downcast_to_dtype expects
                    # ArrayLike
                    # e.g. test_pivot_table_multiindex_columns_doctest_case
                    # agged.columns is a MultiIndex and 'v' is indexing only
                    # on its first level.
                    agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)

    table = agged

    # GH17038, this check should only happen if index is defined (not None)
    if table.index.nlevels > 1 and index:
        # Related GH #17123
        # If index_names are integers, determine whether the integers refer
        # to the level position or name.
        index_names = agged.index.names[: len(index)]
        to_unstack = []
        for i in range(len(index), len(keys)):
            name = agged.index.names[i]
            # Unstack by position when the name is ambiguous (None or a
            # duplicate of an index-level name), otherwise by name.
            if name is None or name in index_names:
                to_unstack.append(i)
            else:
                to_unstack.append(name)
        table = agged.unstack(to_unstack)

    if not dropna:
        # Reindex onto the full cartesian product of level values so that
        # empty group combinations still appear in the result.
        if isinstance(table.index, MultiIndex):
            m = MultiIndex.from_arrays(
                cartesian_product(table.index.levels), names=table.index.names
            )
            table = table.reindex(m, axis=0)

        if isinstance(table.columns, MultiIndex):
            m = MultiIndex.from_arrays(
                cartesian_product(table.columns.levels), names=table.columns.names
            )
            table = table.reindex(m, axis=1)

    if isinstance(table, ABCDataFrame):
        table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table.fillna(fill_value, downcast="infer")

    if margins:
        if dropna:
            # Margins are computed over fully-observed rows only.
            data = data[data.notna().all(axis=1)]
        table = _add_margins(
            table,
            data,
            values,
            rows=index,
            cols=columns,
            aggfunc=aggfunc,
            observed=dropna,
            margins_name=margins_name,
            fill_value=fill_value,
        )

    # discard the top level
    if values_passed and not values_multi and table.columns.nlevels > 1:
        table = table.droplevel(0, axis=1)
    if len(index) == 0 and len(columns) > 0:
        table = table.T

    # GH 15193 Make sure empty columns are removed if dropna=True
    if isinstance(table, ABCDataFrame) and dropna:
        table = table.dropna(how="all", axis=1)

    return table
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc="mean",
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name="All",
    observed=False,
) -> "DataFrame":
    """
    Create a spreadsheet-style pivot table as a DataFrame.

    ``index``/``columns`` are normalized to lists of grouping keys via
    ``_convert_by`` (defined elsewhere in this module).  A list ``aggfunc``
    is handled by calling this function once per aggregation and
    concatenating the results with one column level per function name.
    """
    index = _convert_by(index)
    columns = _convert_by(columns)

    if isinstance(aggfunc, list):
        # One sub-table per aggregation function, concatenated side by side
        # keyed by the function name.
        pieces: List[DataFrame] = []
        keys = []
        for func in aggfunc:
            table = pivot_table(
                data,
                values=values,
                index=index,
                columns=columns,
                fill_value=fill_value,
                aggfunc=func,
                margins=margins,
                dropna=dropna,
                margins_name=margins_name,
                observed=observed,
            )
            pieces.append(table)
            keys.append(getattr(func, "__name__", func))

        return concat(pieces, keys=keys, axis=1)

    # Row keys come first so the column keys can be unstacked further down.
    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if is_list_like(values):
            values_multi = True
            values = list(values)
        else:
            values_multi = False
            values = [values]

        # GH14938 Make sure value labels are in data
        for i in values:
            if i not in data:
                raise KeyError(i)

        # Restrict ``data`` to the columns actually used; Grouper objects
        # contribute their ``.key`` label, unhashable entries are skipped.
        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    else:
        # No explicit values: aggregate every column that is not a key.
        values = data.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError, KeyError):
                pass
        values = list(values)

    grouped = data.groupby(keys, observed=observed)
    agged = grouped.agg(aggfunc)

    if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
        agged = agged.dropna(how="all")

        # gh-21133
        # we want to down cast if
        # the original values are ints
        # as we grouped with a NaN value
        # and then dropped, coercing to floats
        for v in values:
            if (v in data
                    and is_integer_dtype(data[v])
                    and v in agged
                    and not is_integer_dtype(agged[v])):
                agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)

    table = agged

    # GH17038, this check should only happen if index is defined (not None)
    if table.index.nlevels > 1 and index:
        # Related GH #17123
        # If index_names are integers, determine whether the integers refer
        # to the level position or name.
        index_names = agged.index.names[:len(index)]
        to_unstack = []
        for i in range(len(index), len(keys)):
            name = agged.index.names[i]
            # Unstack by position when the name is ambiguous (None or a
            # duplicate of an index-level name), otherwise by name.
            if name is None or name in index_names:
                to_unstack.append(i)
            else:
                to_unstack.append(name)
        table = agged.unstack(to_unstack)

    if not dropna:
        # Reindex onto the full cartesian product of level values so that
        # empty group combinations still appear in the result.
        if table.index.nlevels > 1:
            m = MultiIndex.from_arrays(
                cartesian_product(table.index.levels), names=table.index.names
            )
            table = table.reindex(m, axis=0)

        if table.columns.nlevels > 1:
            m = MultiIndex.from_arrays(
                cartesian_product(table.columns.levels), names=table.columns.names
            )
            table = table.reindex(m, axis=1)

    if isinstance(table, ABCDataFrame):
        table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table._ensure_type(table.fillna(fill_value, downcast="infer"))

    if margins:
        if dropna:
            # Margins are computed over fully-observed rows only.
            data = data[data.notna().all(axis=1)]
        table = _add_margins(
            table,
            data,
            values,
            rows=index,
            cols=columns,
            aggfunc=aggfunc,
            observed=dropna,
            margins_name=margins_name,
            fill_value=fill_value,
        )

    # discard the top level
    if (values_passed and not values_multi and not table.empty and
            (table.columns.nlevels > 1)):
        table = table[values[0]]

    if len(index) == 0 and len(columns) > 0:
        table = table.T

    # GH 15193 Make sure empty columns are removed if dropna=True
    if isinstance(table, ABCDataFrame) and dropna:
        table = table.dropna(how="all", axis=1)

    return table
def flex_binary_moment(arg1, arg2, f, pairwise=False):
    """
    Apply the binary moment function ``f`` to Series/DataFrame combinations.

    Dispatches on the types of ``arg1``/``arg2``:
    Series+Series applies ``f`` directly after alignment via ``prep_binary``
    (defined elsewhere in this module); DataFrame+DataFrame applies ``f``
    column-by-column (or to all column pairs when ``pairwise=True``);
    DataFrame+Series broadcasts the Series against every column.  A
    Series+DataFrame call is handled by swapping the arguments.
    """
    if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries):
        X, Y = prep_binary(arg1, arg2)
        return f(X, Y)

    elif isinstance(arg1, ABCDataFrame):
        from pandas import DataFrame

        def dataframe_from_int_dict(data, frame_template):
            # ``data`` keys are integer column positions into the template.
            result = DataFrame(data, index=frame_template.index)
            if len(result.columns) > 0:
                result.columns = frame_template.columns[result.columns]
            return result

        results = {}
        if isinstance(arg2, ABCDataFrame):
            if pairwise is False:
                if arg1 is arg2:
                    # special case in order to handle duplicate column names
                    for i in range(len(arg1.columns)):
                        results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
                    return dataframe_from_int_dict(results, arg1)
                else:
                    if not arg1.columns.is_unique:
                        raise ValueError("'arg1' columns are not unique")
                    if not arg2.columns.is_unique:
                        raise ValueError("'arg2' columns are not unique")
                    # Matching columns only; non-shared columns stay NaN.
                    X, Y = arg1.align(arg2, join="outer")
                    X, Y = prep_binary(X, Y)
                    res_columns = arg1.columns.union(arg2.columns)
                    for col in res_columns:
                        if col in X and col in Y:
                            results[col] = f(X[col], Y[col])
                    return DataFrame(results, index=X.index, columns=res_columns)
            elif pairwise is True:
                results = defaultdict(dict)
                for i in range(len(arg1.columns)):
                    for j in range(len(arg2.columns)):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[i][j] = results[j][i]
                        else:
                            results[i][j] = f(
                                *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])
                            )

                from pandas import concat

                result_index = arg1.index.union(arg2.index)
                if len(result_index):

                    # construct result frame
                    result = concat(
                        [
                            concat(
                                [results[i][j] for j in range(len(arg2.columns))],
                                ignore_index=True,
                            )
                            for i in range(len(arg1.columns))
                        ],
                        ignore_index=True,
                        axis=1,
                    )
                    result.columns = arg1.columns

                    # set the index and reorder
                    if arg2.columns.nlevels > 1:
                        # mypy needs to know columns is a MultiIndex, Index doesn't
                        # have levels attribute
                        arg2.columns = cast(MultiIndex, arg2.columns)
                        # GH 21157: Equivalent to MultiIndex.from_product(
                        #     [result_index], <unique combinations of
                        #      arg2.columns.levels>,
                        # )
                        # A normal MultiIndex.from_product will produce too many
                        # combinations.
                        result_level = np.tile(
                            result_index, len(result) // len(result_index)
                        )
                        arg2_levels = (
                            np.repeat(
                                arg2.columns.get_level_values(i),
                                len(result) // len(arg2.columns),
                            )
                            for i in range(arg2.columns.nlevels)
                        )
                        result_names = list(arg2.columns.names) + [result_index.name]
                        result.index = MultiIndex.from_arrays(
                            [*arg2_levels, result_level], names=result_names
                        )
                        # GH 34440
                        num_levels = len(result.index.levels)
                        new_order = [num_levels - 1] + list(range(num_levels - 1))
                        result = result.reorder_levels(new_order).sort_index()
                    else:
                        result.index = MultiIndex.from_product(
                            [range(len(arg2.columns)), range(len(result_index))]
                        )
                        result = result.swaplevel(1, 0).sort_index()
                        result.index = MultiIndex.from_product(
                            [result_index] + [arg2.columns]
                        )
                else:

                    # empty result
                    result = DataFrame(
                        index=MultiIndex(
                            levels=[arg1.index, arg2.columns], codes=[[], []]
                        ),
                        columns=arg2.columns,
                        dtype="float64",
                    )

                # reset our index names to arg1 names
                # reset our column names to arg2 names
                # careful not to mutate the original names
                result.columns = result.columns.set_names(arg1.columns.names)
                result.index = result.index.set_names(
                    result_index.names + arg2.columns.names
                )

                return result
        else:
            results = {
                i: f(*prep_binary(arg1.iloc[:, i], arg2))
                for i in range(len(arg1.columns))
            }
            return dataframe_from_int_dict(results, arg1)

    else:
        # NOTE(review): ``pairwise`` is not forwarded on this swap — confirm
        # this mirrors upstream intent for Series/DataFrame argument order.
        return flex_binary_moment(arg2, arg1, f)