def index_with_missing(request): """ Fixture for indices with missing values """ if request.param in ["int", "uint", "range", "empty", "repeats"]: pytest.xfail("missing values not supported") # GH 35538. Use deep copy to avoid illusive bug on np-dev # Azure pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) vals = ind.values if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() vals[0] = (None,) + vals[0][1:] vals[-1] = (None,) + vals[-1][1:] return MultiIndex.from_tuples(vals) else: vals[0] = None vals[-1] = None return type(ind)(vals)
def get_new_columns(self): if self.value_columns is None: if self.lift == 0: return self.removed_level._shallow_copy(name=self.removed_name) lev = self.removed_level.insert(0, item=self.removed_level._na_value) return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): new_levels = self.value_columns.levels + ( self.removed_level_full, ) new_names = self.value_columns.names + (self.removed_name, ) new_codes = [ lab.take(propagator) for lab in self.value_columns.codes ] else: new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] new_codes = [propagator] # The two indices differ only if the unstacked level had unused items: if len(self.removed_level_full) != len(self.removed_level): # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: repeater = np.arange(stride) - self.lift # The entire level is then just a repetition of the single chunk: new_codes.append(np.tile(repeater, width)) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def __init__(self, index: MultiIndex, level=-1, constructor=None): if constructor is None: constructor = DataFrame self.constructor = constructor self.index = index.remove_unused_levels() self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.codes[self.level] else 0 # Note: the "pop" below alters these in-place. self.new_index_levels = list(self.index.levels) self.new_index_names = list(self.index.names) self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] # Bug fix GH 20601 # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. # We want to check and raise an error before this happens num_rows = np.max( [index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. num_cells = num_rows * num_columns # GH 26314: Previous ValueError raised was too restrictive for many users. if num_cells > np.iinfo(np.int32).max: warnings.warn( f"The following operation may generate {num_cells} cells " f"in the resulting pandas object.", PerformanceWarning, ) self._make_selectors()
def pivot( data: "DataFrame", index: Optional[Union[Label, Sequence[Label]]] = None, columns: Optional[Union[Label, Sequence[Label]]] = None, values: Optional[Union[Label, Sequence[Label]]] = None, ) -> "DataFrame": if columns is None: raise TypeError("pivot() missing 1 required argument: 'columns'") columns = com.convert_to_list_like(columns) if values is None: if index is not None: cols = com.convert_to_list_like(index) else: cols = [] cols.extend(columns) append = index is None indexed = data.set_index(cols, append=append) else: if index is None: index = [Series(data.index, name=data.index.name)] else: index = com.convert_to_list_like(index) index = [data[idx] for idx in index] data_columns = [data[col] for col in columns] index.extend(data_columns) index = MultiIndex.from_arrays(index) if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name values = cast(Sequence[Label], values) indexed = data._constructor(data[values]._values, index=index, columns=values) else: indexed = data._constructor_sliced(data[values]._values, index=index) return indexed.unstack(columns)
def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": if columns is None: raise TypeError("pivot() missing 1 required argument: 'columns'") columns = columns if is_list_like(columns) else [columns] if values is None: cols: List[str] = [] if index is None: pass elif is_list_like(index): cols = list(index) else: cols = [index] cols.extend(columns) append = index is None indexed = data.set_index(cols, append=append) else: if index is None: index = [Series(data.index, name=data.index.name)] elif is_list_like(index): index = [data[idx] for idx in index] else: index = [data[index]] data_columns = [data[col] for col in columns] index.extend(data_columns) index = MultiIndex.from_arrays(index) if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name indexed = data._constructor(data[values].values, index=index, columns=values) else: indexed = data._constructor_sliced(data[values].values, index=index) return indexed.unstack(columns)
def pivot(data: "DataFrame", index=None, columns=None, values=None): if values is None: cols = [columns] if index is None else [index, columns] append = index is None indexed = data.set_index(cols, append=append) else: if index is None: index = data.index else: index = data[index] index = MultiIndex.from_arrays([index, data[columns]]) if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name indexed = data._constructor(data[values].values, index=index, columns=values) else: indexed = data._constructor_sliced(data[values].values, index=index) return indexed.unstack(columns)
def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) levs = [[lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])] # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) unique_tuples = (key for key, _ in itertools.groupby(tuples)) new_levs = zip(*unique_tuples) # The dtype of each level must be explicitly set to avoid inferring the wrong type. # See GH-36991. return MultiIndex.from_arrays( [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev for new_lev, lev in zip(new_levs, columns.levels) ], names=columns.names[:-1], )
def _create_multiindex(): """ MultiIndex used to test the general functionality of this object """ # See Also: tests.multi.conftest.idx major_axis = Index(["foo", "bar", "baz", "qux"]) minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ["first", "second"] mi = MultiIndex( levels=[major_axis, minor_axis], codes=[major_codes, minor_codes], names=index_names, verify_integrity=False, ) return mi class TestSubDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs)
def __init__(self, index: MultiIndex, level=-1, constructor=None): if constructor is None: constructor = DataFrame self.constructor = constructor self.index = index.remove_unused_levels() self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.codes[self.level] else 0 # Note: the "pop" below alters these in-place. self.new_index_levels = list(self.index.levels) self.new_index_names = list(self.index.names) self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] # Bug fix GH 20601 # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. # We want to check and raise an error before this happens num_rows = np.max( [index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) if num_rows > 0 and num_columns > 0 and num_cells <= 0: raise ValueError( "Unstacked DataFrame is too big, causing int32 overflow") self._make_selectors()
def get_new_columns(self, value_columns: Index | None): if value_columns is None: if self.lift == 0: return self.removed_level._rename(name=self.removed_name) lev = self.removed_level.insert(0, item=self.removed_level._na_value) return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift width = len(value_columns) propagator = np.repeat(np.arange(width), stride) new_levels: FrozenList | list[Index] if isinstance(value_columns, MultiIndex): new_levels = value_columns.levels + (self.removed_level_full, ) new_names = value_columns.names + (self.removed_name, ) new_codes = [lab.take(propagator) for lab in value_columns.codes] else: new_levels = [ value_columns, self.removed_level_full, ] new_names = [value_columns.name, self.removed_name] new_codes = [propagator] repeater = self._repeater # The entire level is then just a repetition of the single chunk: new_codes.append(np.tile(repeater, width)) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def index_with_missing(request): """ Fixture for indices with missing values. Integer-dtype and empty cases are excluded because they cannot hold missing values. MultiIndex is excluded because isna() is not defined for MultiIndex. """ # GH 35538. Use deep copy to avoid illusive bug on np-dev # Azure pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) vals = ind.values if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() vals[0] = (None, ) + vals[0][1:] vals[-1] = (None, ) + vals[-1][1:] return MultiIndex.from_tuples(vals) else: vals[0] = None vals[-1] = None return type(ind)(vals)
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel. If `level_num` matches a column name return the name from position `level_num`, otherwise return `level_num`. """ if level_num in columns.names: return columns.names[level_num] return level_num this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns._is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) new_columns = _stack_multi_column_index(this.columns) # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] for key in new_columns: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( frame.dtypes.iloc[0] ): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.items()] ) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: old_codes, old_levels = factorize_from_iterable(this.index) new_levels = [old_levels] new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how="all") return result
def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index Returns ------- stacked : Series """ def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = factorize_from_iterable(index) return categories, codes N, K = frame.shape # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False, ) if not frame.empty and frame._is_homogeneous_type: # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes._values) dtype = dtypes[0] if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type( [col._values for _, col in frame.items()] ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame._values.ravel() else: # non-homogeneous new_values = frame._values.ravel() if dropna: mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] return frame._constructor_sliced(new_values, index=new_index)
def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index # GH 19966 Make sure if MultiIndexed index has tuple name, they will be # recognised as a whole if clocs in index.names: clocs = [clocs] clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = tuple(len(x) for x in clevels) group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if not rlocs: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: dummy_index = MultiIndex( levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ["__placeholder__"], verify_integrity=False, ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val, fill_value=fill_value) clocs = [v if v < val else v - 1 for v in clocs] return result # GH#42579 deep=False to avoid consolidating dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns assert isinstance(unstcols, MultiIndex) # for mypy new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: dummy_index = MultiIndex( levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ["__placeholder__"], verify_integrity=False, ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [v if i > v else v - 1 for v in clocs] return result dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
indices_dict = { "unicode": tm.makeUnicodeIndex(100), "string": tm.makeStringIndex(100), "datetime": tm.makeDateIndex(100), "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), "period": tm.makePeriodIndex(100), "timedelta": tm.makeTimedeltaIndex(100), "int": tm.makeIntIndex(100), "uint": tm.makeUIntIndex(100), "range": tm.makeRangeIndex(100), "float": tm.makeFloatIndex(100), "bool": tm.makeBoolIndex(10), "categorical": tm.makeCategoricalIndex(100), "interval": tm.makeIntervalIndex(100), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), } @pytest.fixture(params=indices_dict.keys()) def index(request): """ Fixture for many "simple" kinds of indices. These indices are unlikely to cover corner cases, e.g. - no names - no NaTs/NaNs - no values near implementation bounds
def __internal_pivot_table( data: DataFrame, values, index, columns, aggfunc: AggFuncTypeBase | AggFuncTypeDict, fill_value, margins: bool, dropna: bool, margins_name: str, observed: bool, sort: bool, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. """ keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if ( v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v]) ): if not isinstance(agged[v], ABCDataFrame): # exclude DataFrame case bc maybe_downcast_to_dtype expects # ArrayLike # e.g. test_pivot_table_multiindex_columns_doctest_case # agged.columns is a MultiIndex and 'v' is indexing only # on its first level. agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged # GH17038, this check should only happen if index is defined (not None) if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[: len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if isinstance(table.index, MultiIndex): m = MultiIndex.from_arrays( cartesian_product(table.index.levels), names=table.index.names ) table = table.reindex(m, axis=0) if isinstance(table.columns, MultiIndex): m = MultiIndex.from_arrays( cartesian_product(table.columns.levels), names=table.columns.names ) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(fill_value, downcast="infer") if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if values_passed and not values_multi and table.columns.nlevels > 1: table = table.droplevel(0, axis=1) if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
def _flex_binary_moment(arg1, arg2, f, pairwise=False): if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))): raise TypeError( "arguments to moment function must be of type np.ndarray/Series/DataFrame" ) if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries)): X, Y = prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result results = {} if isinstance(arg2, ABCDataFrame): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names for i, col in enumerate(arg1.columns): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: if not arg1.columns.is_unique: raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) X, Y = arg1.align(arg2, join="outer") X = X + 0 * Y Y = Y + 0 * X with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): for j, k2 in enumerate(arg2.columns): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] else: results[i][j] = f( *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) from pandas import concat result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame result = concat( [ concat( [ results[i][j] for j, c in enumerate(arg2.columns) ], ignore_index=True, ) for i, c in enumerate(arg1.columns) ], ignore_index=True, axis=1, ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( arg2.columns.levels + [result_index]) # GH 34440 num_levels = len(result.index.levels) new_order = [num_levels - 1] + list( range(num_levels - 1)) result = result.reorder_levels(new_order).sort_index() else: result.index = MultiIndex.from_product([ range(len(arg2.columns)), range(len(result_index)) ]) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product([result_index] + [arg2.columns]) else: # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], codes=[[], []]), columns=arg2.columns, dtype="float64", ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names(result_index.names + arg2.columns.names) return result else: raise ValueError("'pairwise' is not True/False") else: results = { i: f(*prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) else: return _flex_binary_moment(arg2, arg1, f)
import pandas.util.testing as tm @pytest.fixture(params=[tm.makeUnicodeIndex(100), tm.makeStringIndex(100), tm.makeDateIndex(100), tm.makePeriodIndex(100), tm.makeTimedeltaIndex(100), tm.makeIntIndex(100), tm.makeUIntIndex(100), tm.makeRangeIndex(100), tm.makeFloatIndex(100), Index([True, False]), tm.makeCategoricalIndex(100), Index([]), MultiIndex.from_tuples(lzip( ['foo', 'bar', 'baz'], [1, 2, 3])), Index([0, 0, 1, 1, 2, 2])], ids=lambda x: type(x).__name__) def indices(request): return request.param @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): # zero-dim integer array behaves like an integer return request.param zeros = [box([0] * 5, dtype=dtype) for box in [pd.Index, np.array] for dtype in [np.int64, np.uint64, np.float64]]
import pandas.util.testing as tm @pytest.fixture(params=[tm.makeUnicodeIndex(100), tm.makeStringIndex(100), tm.makeDateIndex(100), tm.makePeriodIndex(100), tm.makeTimedeltaIndex(100), tm.makeIntIndex(100), tm.makeUIntIndex(100), tm.makeRangeIndex(100), tm.makeFloatIndex(100), Index([True, False]), tm.makeCategoricalIndex(100), Index([]), MultiIndex.from_tuples(zip( ['foo', 'bar', 'baz'], [1, 2, 3])), Index([0, 0, 1, 1, 2, 2])], ids=lambda x: type(x).__name__) def indices(request): return request.param @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): # zero-dim integer array behaves like an integer return request.param zeros = [box([0] * 5, dtype=dtype) for box in [pd.Index, np.array] for dtype in [np.int64, np.uint64, np.float64]]
def flex_binary_moment(arg1, arg2, f, pairwise=False): if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries): X, Y = prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result results = {} if isinstance(arg2, ABCDataFrame): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names for i in range(len(arg1.columns)): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: if not arg1.columns.is_unique: raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") X, Y = arg1.align(arg2, join="outer") X, Y = prep_binary(X, Y) res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i in range(len(arg1.columns)): for j in range(len(arg2.columns)): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] else: results[i][j] = f( *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) from pandas import concat result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame result = concat( [ concat( [ results[i][j] for j in range(len(arg2.columns)) ], ignore_index=True, ) for i in range(len(arg1.columns)) ], ignore_index=True, axis=1, ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: # mypy needs to know columns is a MultiIndex, Index doesn't # have levels attribute arg2.columns = cast(MultiIndex, arg2.columns) # GH 21157: Equivalent to MultiIndex.from_product( # [result_index], <unique combinations of arg2.columns.levels>, # ) # A normal MultiIndex.from_product will produce too many # combinations. result_level = np.tile( result_index, len(result) // len(result_index)) arg2_levels = (np.repeat( arg2.columns.get_level_values(i), len(result) // len(arg2.columns), ) for i in range(arg2.columns.nlevels)) result_names = list( arg2.columns.names) + [result_index.name] result.index = MultiIndex.from_arrays( [*arg2_levels, result_level], names=result_names) # GH 34440 num_levels = len(result.index.levels) new_order = [num_levels - 1] + list( range(num_levels - 1)) result = result.reorder_levels(new_order).sort_index() else: result.index = MultiIndex.from_product([ range(len(arg2.columns)), range(len(result_index)) ]) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product([result_index] + [arg2.columns]) else: # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], codes=[[], []]), columns=arg2.columns, dtype="float64", ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names(result_index.names + arg2.columns.names) return result else: results = { i: f(*prep_binary(arg1.iloc[:, i], arg2)) for i in range(len(arg1.columns)) } return dataframe_from_int_dict(results, arg1) else: return flex_binary_moment(arg2, arg1, f)
def pivot_table( data, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=False, ) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, ) pieces.append(table) keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if (v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged # GH17038, this check should only happen if index is defined (not None) if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table._ensure_type(table.fillna(fill_value, downcast="infer")) if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if _is_potential_multi_index(columns): columns = MultiIndex.from_tuples(columns, names=col_names) return columns
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) if levels is None: _, levels = factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [ensure_index(keys)] else: levels = [ensure_index(x) for x in levels] if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): # Find matching codes, include matching nan values as equal. mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") i = np.nonzero(mask)[0][0] to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: codes, categories = factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( "Cannot concat indices that do not have the same number of levels" ) # also copies names = list(names) + list(get_unanimous_names(*indexes)) return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct codes new_codes = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError( f"Values not found in passed level: {hlevel[mask]!s}") new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)