def merge_index(index: pd.MultiIndex) -> pd.MultiIndex:
    r"""Merge overlapping segments into maximal disjoint segments.

    Assumes that index is sorted by 'start' level.

    """
    if index.empty:
        return index

    starts = index.get_level_values('start')
    ends = index.get_level_values('end')

    merged_starts = []
    merged_ends = []
    cur_start, cur_end = starts[0], ends[0]
    for seg_start, seg_end in zip(starts[1:], ends[1:]):
        if seg_start > cur_end:
            # Disjoint from the running segment: flush it and start anew.
            merged_starts.append(cur_start)
            merged_ends.append(cur_end)
            cur_start, cur_end = seg_start, seg_end
        else:
            # Overlapping (or touching): extend the running segment if needed.
            cur_end = max(cur_end, seg_end)
    merged_starts.append(cur_start)
    merged_ends.append(cur_end)
    return utils.signal_index(merged_starts, merged_ends)
def test_unicode_repr_issues(self): levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] index = MultiIndex(levels=levels, codes=codes) repr(index.levels) repr(index.get_level_values(1))
def test_reset_index(self): df = tm.makeDataFrame()[:5] ser = df.stack() ser.index.names = ["hash", "category"] ser.name = "value" df = ser.reset_index() assert "value" in df df = ser.reset_index(name="value2") assert "value2" in df # check inplace s = ser.reset_index(drop=True) s2 = ser s2.reset_index(drop=True, inplace=True) tm.assert_series_equal(s, s2) # level index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 rs = s.reset_index(level=[0, 2], drop=True) tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) assert isinstance(rs, Series)
def test_reset_index(self): df = tm.makeDataFrame()[:5] ser = df.stack() ser.index.names = ['hash', 'category'] ser.name = 'value' df = ser.reset_index() assert 'value' in df df = ser.reset_index(name='value2') assert 'value2' in df # check inplace s = ser.reset_index(drop=True) s2 = ser s2.reset_index(drop=True, inplace=True) tm.assert_series_equal(s, s2) # level index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 rs = s.reset_index(level=[0, 2], drop=True) tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) assert isinstance(rs, Series)
def remove_constant_levels(index: pd.MultiIndex) -> pd.MultiIndex:
    """Drop every non-whitelisted level that holds a single unique value.

    Levels listed in WHITELISTED_LEVELS are always kept.
    """
    result = index.copy()
    # Iterate over the original level names; `result` shrinks as we drop.
    for level in index.names:
        if level in WHITELISTED_LEVELS:
            continue
        # len(unique()) rather than nunique(): NaN counts as a value here.
        if len(result.get_level_values(level).unique()) == 1:
            result = result.droplevel(level=level)
    return result
def indexFillNAs(indexdata: pd.MultiIndex, replacementValues: dict):
    """Replace NAs in index levels with per-level configurable values.

    :param indexdata: index to process
    :param replacementValues: dict mapping level name -> replacement value
    :return: new MultiIndex with NAs filled in the configured levels
    """
    filled_levels = []
    for name in indexdata.names:
        values = indexdata.get_level_values(name)
        if name in replacementValues:
            values = values.fillna(replacementValues[name])
        filled_levels.append(values)
    return pd.MultiIndex.from_arrays(filled_levels, names=indexdata.names)
def test_get_level_values_box_datetime64(self): dates = date_range("1/1/2000", periods=4) levels = [dates, [0, 1]] codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] index = MultiIndex(levels=levels, codes=codes) assert isinstance(index.get_level_values(0)[0], Timestamp)
def test_get_level_values_box(self): from pandas import MultiIndex dates = date_range('1/1/2000', periods=4) levels = [dates, [0, 1]] labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] index = MultiIndex(levels=levels, labels=labels) self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp))
def test_get_level_values_box(self): from pandas import MultiIndex dates = date_range('1/1/2000', periods=4) levels = [dates, [0, 1]] labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] index = MultiIndex(levels=levels, labels=labels) assert isinstance(index.get_level_values(0)[0], Timestamp)
def test_get_level_values(idx): result = idx.get_level_values(0) expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") tm.assert_index_equal(result, expected) assert result.name == "first" result = idx.get_level_values("first") expected = idx.get_level_values(0) tm.assert_index_equal(result, expected) # GH 10460 index = MultiIndex( levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], ) exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) tm.assert_index_equal(index.get_level_values(0), exp) exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) tm.assert_index_equal(index.get_level_values(1), exp)
def test_get_level_values(idx): result = idx.get_level_values(0) expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], name='first') tm.assert_index_equal(result, expected) assert result.name == 'first' result = idx.get_level_values('first') expected = idx.get_level_values(0) tm.assert_index_equal(result, expected) # GH 10460 index = MultiIndex( levels=[CategoricalIndex(['A', 'B']), CategoricalIndex([1, 2, 3])], labels=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])]) exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B']) tm.assert_index_equal(index.get_level_values(0), exp) exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) tm.assert_index_equal(index.get_level_values(1), exp)
def product_combine_frames(data: List[pd.DataFrame], index: pd.MultiIndex,
                           cols: pd.MultiIndex) -> pd.DataFrame:
    """Iterate through the dataframes, filling data into the combined
    dataframe with duplicate indexes being resolved using a 'last one wins'
    logic.

    :param data: one frame per level of ``cols``; frame ``i``'s columns are
        selected via ``cols.get_level_values(i)``.
    :param index: row index of the combined frame.
    :param cols: column MultiIndex of the combined frame.
    :returns: combined DataFrame.
    """
    df = pd.DataFrame([], index=index, columns=cols)
    for level, frame in enumerate(data):
        # FIX (maintainability): the original rebound the parameter ``data``
        # inside this loop, shadowing the sequence being iterated; use a
        # distinct local name instead.
        selected = frame.loc[:, cols.get_level_values(level)]
        selected.columns = cols
        # Later frames overwrite earlier ones on shared index labels.
        df.loc[selected.index, :] = selected
    return df
def test_get_level_values(idx): result = idx.get_level_values(0) expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], name='first') tm.assert_index_equal(result, expected) assert result.name == 'first' result = idx.get_level_values('first') expected = idx.get_level_values(0) tm.assert_index_equal(result, expected) # GH 10460 index = MultiIndex( levels=[CategoricalIndex(['A', 'B']), CategoricalIndex([1, 2, 3])], codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])]) exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B']) tm.assert_index_equal(index.get_level_values(0), exp) exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) tm.assert_index_equal(index.get_level_values(1), exp)
def invert_index(
    index: pd.MultiIndex,
    dur: pd.Timedelta,
) -> pd.MultiIndex:
    r"""Invert index: return the gaps between segments over [0, dur].

    Assumes that index is sorted by 'start' level.

    """
    if index.empty:
        # No segments at all: the whole duration is one gap.
        return utils.signal_index(0, dur)

    starts = index.get_level_values('start')
    ends = index.get_level_values('end')

    # Interior gaps run from each segment's end to the next segment's start.
    gap_starts = ends[:-1]
    gap_ends = starts[1:]

    # Leading gap before the first segment, if any.
    zero = pd.to_timedelta(0)
    if starts[0] != zero:
        gap_starts = gap_starts.insert(0, zero)
        gap_ends = gap_ends.insert(0, starts[0])

    # Trailing gap after the last segment, if any.
    if ends[-1] != dur:
        gap_starts = gap_starts.insert(len(gap_starts), ends[-1])
        gap_ends = gap_ends.insert(len(gap_ends), dur)

    return utils.signal_index(gap_starts, gap_ends)
def test_nan_stays_float():
    # GH 7031: an all-NaN level must stay float NaN through join/arithmetic
    idx0 = MultiIndex(
        levels=[["A", "B"], []], codes=[[1, 0], [-1, -1]], names=[0, 1]
    )
    idx1 = MultiIndex(levels=[["C"], ["D"]], codes=[[0], [0]], names=[0, 1])

    joined = idx0.join(idx1, how="outer")
    assert pd.isna(idx0.get_level_values(1)).all()
    # the following failed in 0.14.1
    assert pd.isna(joined.get_level_values(1)[:-1]).all()

    df0 = pd.DataFrame([[1, 2]], index=idx0)
    df1 = pd.DataFrame([[3, 4]], index=idx1)
    dfm = df0 - df1
    assert pd.isna(df0.index.get_level_values(1)).all()
    # the following failed in 0.14.1
    assert pd.isna(dfm.index.get_level_values(1)[:-1]).all()
def coerce_dtype(self, obj: pd.MultiIndex) -> pd.MultiIndex:
    """Coerce type of a pd.Series by type specified in dtype.

    :param obj: multi-index to coerce.
    :returns: ``MultiIndex`` with coerced data type
    """
    error_handler = SchemaErrorHandler(lazy=True)

    # Map level position -> (possibly coerced) level values.
    coerced_levels = {}
    for schema_pos, index_schema in enumerate(self.indexes):
        if all(name is None for name in self.names):
            # Unnamed schema: match levels positionally.
            matching_levels = [schema_pos]
        else:
            # Named schema: a schema entry applies to every level sharing
            # its name.
            matching_levels = [
                pos
                for pos, name in enumerate(obj.names)
                if name == index_schema.name
            ]
        for pos in matching_levels:
            level_values = obj.get_level_values(pos)
            if index_schema.coerce or self._coerce:
                try:
                    level_values = index_schema.coerce_dtype(level_values)
                except errors.SchemaError as err:
                    error_handler.collect_error("dtype_coercion_error", err)
            coerced_levels[pos] = level_values

    if error_handler.collected_errors:
        raise errors.SchemaErrors(error_handler.collected_errors, obj)

    multiindex_cls = pd.MultiIndex
    # NOTE: this is a hack to support koalas
    if type(obj).__module__.startswith("databricks.koalas"):
        # pylint: disable=import-outside-toplevel
        import databricks.koalas as ks

        multiindex_cls = ks.MultiIndex
    # Rebuild the index with levels in their original positional order.
    return multiindex_cls.from_arrays(
        [coerced_levels[pos].to_numpy() for pos in sorted(coerced_levels)],
        names=obj.names,
    )
def coerce_dtype(self, multi_index: pd.MultiIndex) -> pd.MultiIndex:
    """Coerce type of a pd.Series by type specified in pandas_dtype.

    :param multi_index: multi-index to coerce.
    :returns: ``MultiIndex`` with coerced data type
    """
    if multi_index.nlevels != len(self.indexes):
        # One schema entry is required per index level.
        raise errors.SchemaError(
            "multi_index does not have equal number of levels as "
            "MultiIndex schema %d != %d." % (
                multi_index.nlevels, len(self.indexes)))

    coerced_levels = []
    for level_pos, index_schema in enumerate(self.indexes):
        values = multi_index.get_level_values(level_pos)
        if index_schema.coerce or self.coerce:
            values = index_schema.coerce_dtype(values)
        coerced_levels.append(values)
    return pd.MultiIndex.from_arrays(coerced_levels,
                                     names=multi_index.names)
def coerce_dtype(self, obj: pd.MultiIndex) -> pd.MultiIndex:
    """Coerce type of a pd.Series by type specified in pandas_dtype.

    :param obj: multi-index to coerce.
    :returns: ``MultiIndex`` with coerced data type
    """
    error_handler = SchemaErrorHandler(lazy=True)

    # Map level position -> (possibly coerced) level values.
    coerced = {}
    for schema_pos, index_schema in enumerate(self.indexes):
        if all(n is None for n in self.names):
            # Unnamed schema: match levels positionally.
            targets = [schema_pos]
        else:
            # Named schema: apply to every level sharing the entry's name.
            targets = [
                lvl for lvl, name in enumerate(obj.names)
                if name == index_schema.name
            ]
        for lvl in targets:
            arr = obj.get_level_values(lvl)
            if index_schema.coerce or self._coerce:
                try:
                    arr = index_schema.coerce_dtype(arr)
                except errors.SchemaError as err:
                    error_handler.collect_error(
                        "dtype_coercion_error", err
                    )
            coerced[lvl] = arr

    if error_handler.collected_errors:
        raise errors.SchemaErrors(error_handler.collected_errors, obj)

    # Rebuild the index with levels in their original positional order.
    return pd.MultiIndex.from_arrays(
        [coerced[lvl] for lvl in sorted(coerced)],
        names=obj.names,
    )
def merge_tuples(
    sq: Tuple[Union[str, slice], Union[str, slice], Union[str, slice],
              Union[str, slice]],
    res: pd.MultiIndex,
) -> Tuple[str, str, str, str]:
    """Replace tuple values where the index is an empty slice.

    Behaviour change in pandas 1.4: previous versions returned the full
    index, while post-1.4 pandas returns only the missing levels, so the
    query tuple must be completed from ``res``.

    :param sq: query tuple
    :param res: index part whose levels fill the missing positions
    :return: Full lookup value
    """
    merged = list(sq)
    for level_name in res.names:
        position = INDEX_COLS.index(level_name)
        # take the first value of that level as the concrete lookup value
        merged[position] = res.get_level_values(level_name)[0]
    return tuple(merged)