def test_iloc_getitem_multiindex2(self):
    """Positional ``iloc`` access on a frame with MultiIndex rows and columns.

    The test is skipped unconditionally; the statements after the skip
    record what it is meant to verify once it is re-enabled.
    """
    # TODO(wesm): fix this
    pytest.skip('this test was being suppressed, '
                'needs to be fixed')

    values = np.random.randn(3, 3)
    frame = DataFrame(values,
                      columns=[[2, 2, 4], [6, 8, 10]],
                      index=[[4, 4, 8], [8, 10, 12]])

    # a single row is returned as a Series labelled by the columns
    row = frame.iloc[2]
    expected_row = Series(values[2], index=frame.columns)
    tm.assert_series_equal(row, expected_row)

    # a single column is returned as a Series labelled by the index
    column = frame.iloc[:, 2]
    expected_column = Series(values[:, 2], index=frame.index)
    tm.assert_series_equal(column, expected_column)

    # a scalar lookup matches the underlying ndarray
    cell = frame.iloc[2, 2]
    expected_cell = frame.values[2, 2]
    assert cell == expected_cell

    # for multiple items
    # GH 5528
    first_two = frame.iloc[[0, 1]]
    expected_first_two = frame.xs(4, drop_level=False)
    tm.assert_frame_equal(first_two, expected_first_two)

    pairs = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']])
    frame = DataFrame(np.random.randn(4, 4),
                      index=MultiIndex.from_tuples(pairs))
    last_two = frame.iloc[[2, 3]]
    expected_last_two = frame.xs('b', drop_level=False)
    tm.assert_frame_equal(last_two, expected_last_two)
class XS(object):
    """Benchmark ``DataFrame.xs`` along each axis of a square random frame."""

    # Benchmark parametrisation (presumably read by the asv harness):
    # the timed method runs once per axis value.
    params = [0, 1]
    param_names = ['axis']

    def setup(self, axis):
        """Create the ``N x N`` random frame used by the timed method."""
        self.N = 10**4
        self.df = DataFrame(np.random.randn(self.N, self.N))

    def time_frame_xs(self, axis):
        """Take the cross-section at the middle label of ``axis``."""
        # Floor division keeps the key an integer so it matches the integer
        # labels of the default index; ``/`` yields a float on Python 3.
        self.df.xs(self.N // 2, axis=axis)
def test_xs_partial(self, multiindex_dataframe_random_data, multiindex_year_month_day_dataframe_random_data): frame = multiindex_dataframe_random_data ymd = multiindex_year_month_day_dataframe_random_data result = frame.xs('foo') result2 = frame.loc['foo'] expected = frame.T['foo'].T tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result, result2) result = ymd.xs((2000, 4)) expected = ymd.loc[2000, 4] tm.assert_frame_equal(result, expected) # ex from #1796 index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], codes=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]]) df = DataFrame(np.random.randn(8, 4), index=index, columns=list('abcd')) result = df.xs(['foo', 'one']) expected = df.loc['foo', 'one'] tm.assert_frame_equal(result, expected)
def test_xs(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data xs = frame.xs(('bar', 'two')) xs2 = frame.loc[('bar', 'two')] tm.assert_series_equal(xs, xs2) tm.assert_almost_equal(xs.values, frame.values[4]) # GH 6574 # missing values in returned index should be preserrved acc = [ ('a', 'abcde', 1), ('b', 'bbcde', 2), ('y', 'yzcde', 25), ('z', 'xbcde', 24), ('z', None, 26), ('z', 'zbcde', 25), ('z', 'ybcde', 26), ] df = DataFrame(acc, columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) result = df.xs('z', level='a1') tm.assert_frame_equal(result, expected)
def test_xs_level(self, multiindex_dataframe_random_data):
    """``xs`` on an inner level drops that level and returns a copy."""
    frame = multiindex_dataframe_random_data

    # select by level name
    on_second = frame.index.get_level_values(1) == 'two'
    expected = frame[on_second]
    expected.index = expected.index.droplevel(1)
    result = frame.xs('two', level='second')
    tm.assert_frame_equal(result, expected)

    # select by level position
    index = MultiIndex.from_tuples([('x', 'y', 'z'),
                                    ('a', 'b', 'c'),
                                    ('p', 'q', 'r')])
    df = DataFrame(np.random.randn(3, 5), index=index)
    expected = df[1:2]
    expected.index = expected.index.droplevel(2)
    result = df.xs('c', level=2)
    tm.assert_frame_equal(result, expected)

    # this is a copy in 0.14
    result = frame.xs('two', level='second')

    # setting this will give a SettingWithCopyError
    # as we are trying to write a view
    with pytest.raises(com.SettingWithCopyError):
        result[:] = 10
def test_xs_multiindex(self): # GH2903 columns = MultiIndex.from_tuples( [('a', 'foo'), ('a', 'bar'), ('b', 'hello'), ('b', 'world')], names=['lvl0', 'lvl1']) df = DataFrame(np.random.randn(4, 4), columns=columns) df.sort_index(axis=1, inplace=True) result = df.xs('a', level='lvl0', axis=1) expected = df.iloc[:, 0:2].loc[:, 'a'] tm.assert_frame_equal(result, expected) result = df.xs('foo', level='lvl1', axis=1) expected = df.iloc[:, 1:2].copy() expected.columns = expected.columns.droplevel('lvl1') tm.assert_frame_equal(result, expected)
def test_from_frame_level1_unsorted(self):
    """A panel built from a frame with an unsorted second level slices correctly."""
    midx = MultiIndex.from_tuples([
        ('MSFT', 3),
        ('MSFT', 2),
        ('AAPL', 2),
        ('AAPL', 1),
        ('MSFT', 1),
    ])
    df = DataFrame(np.random.rand(5, 4), index=midx)
    panel = df.to_panel()

    # the minor cross-section must equal the sorted level-1 selection
    from_panel = panel.minor_xs(2)
    from_frame = df.xs(2, level=1).sort_index()
    assert_frame_equal(from_panel, from_frame)
def test_xs_level_multiple(self):
    """``xs`` with several levels at once, plus an integer key on a named level."""
    # Whitespace-separated table: the second header line names the four
    # index levels, the first names the data columns.
    text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

    df = read_csv(StringIO(text), sep=r'\s+', engine='python')

    result = df.xs(('a', 4), level=['one', 'four'])
    expected = df.xs('a').xs(4, level='four')
    tm.assert_frame_equal(result, expected)

    # this is a copy in 0.14
    result = df.xs(('a', 4), level=['one', 'four'])

    # setting this will give a SettingWithCopyError
    # as we are trying to write a view
    with pytest.raises(com.SettingWithCopyError):
        result[:] = 10

    # GH2107
    # Built with from_product and the builtin range so the test does not
    # depend on the lrange / cart_product compatibility helpers.
    dates = list(range(20111201, 20111205))
    ids = list('abcde')
    idx = MultiIndex.from_product([dates, ids], names=['date', 'secid'])
    df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z'])

    result = df.xs(20111201, level='date')
    expected = df.loc[20111201, :]
    tm.assert_frame_equal(result, expected)
def test_iloc_getitem_multiple_items():
    """``iloc`` with a list of positions matches ``xs`` with the level kept."""
    # GH 5528
    outer = ['a', 'a', 'b', 'b']
    inner = ['x', 'y', 'x', 'y']
    index = MultiIndex.from_tuples(zip(*[outer, inner]))
    df = DataFrame(np.random.randn(4, 4), index=index)

    # rows 2 and 3 are exactly the rows under outer label 'b'
    expected = df.xs('b', drop_level=False)
    result = df.iloc[[2, 3]]
    tm.assert_frame_equal(result, expected)
def test_level_with_tuples(self):
    """Index levels whose labels are themselves tuples can be used as keys.

    The same checks run for 3-tuples and 2-tuples as level labels; only
    the 3-tuple case also checks that an over-long key raises ``KeyError``.
    """
    cases = [
        ([('foo', 'bar', 0), ('foo', 'baz', 0), ('foo', 'qux', 0)], True),
        ([('foo', 'bar'), ('foo', 'baz'), ('foo', 'qux')], False),
    ]

    for tuple_labels, check_missing in cases:
        key = tuple_labels[0]
        index = MultiIndex(levels=[tuple_labels, [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        # Series: the first two rows belong to the first tuple label
        by_getitem = series[key]
        by_ix = series.ix[key]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(by_getitem, expected)
        assert_series_equal(by_ix, expected)

        if check_missing:
            self.assertRaises(KeyError, series.__getitem__, (key, 2))

        # DataFrame: same selection through .ix and .xs
        by_ix = frame.ix[key]
        by_xs = frame.xs(key)
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(by_ix, expected)
        assert_frame_equal(by_xs, expected)
def test_xs_level_eq_2():
    """``xs`` on the innermost level (position 2) drops only that level."""
    values = np.random.randn(3, 5)
    level_values = [['a', 'p', 'x'], ['b', 'q', 'y'], ['c', 'r', 'z']]
    # every level uses the same codes, so the rows are x/y/z, a/b/c, p/q/r
    row_codes = [2, 0, 1]
    index = MultiIndex(levels=level_values,
                       codes=[row_codes, row_codes, row_codes])
    df = DataFrame(values, index=index)

    # 'c' only occurs in the second row; the two outer labels remain
    expected = DataFrame(values[1:2], index=[['a'], ['b']])
    result = df.xs('c', level=2)
    tm.assert_frame_equal(result, expected)
def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): # see gh-2903 arr = np.random.randn(4, 4) index = MultiIndex(levels=[['a', 'b'], ['bar', 'foo', 'hello', 'world']], codes=[[0, 0, 1, 1], [0, 1, 2, 3]], names=['lvl0', 'lvl1']) df = DataFrame(arr, columns=index) result = df.xs(key, level=level, axis=1) expected = DataFrame(exp_arr(arr), columns=exp_index) tm.assert_frame_equal(result, expected)
def test_nonunique_assignment_1750(self): df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD")) df = df.set_index(["A", "B"]) ix = MultiIndex.from_tuples([(1, 1)]) df.loc[ix, "C"] = "_" assert (df.xs((1, 1))["C"] == "_").all()
def test_nonunique_assignment_1750(self): df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD")) df = df.set_index(['A', 'B']) ix = MultiIndex.from_tuples([(1, 1)]) df.loc[ix, "C"] = '_' assert (df.xs((1, 1))['C'] == '_').all()
def test_xs_integer_key():
    """``xs`` with an integer key on a named level matches ``.loc``."""
    # see gh-2107
    index = MultiIndex.from_product(
        [range(20111201, 20111205), list("abcde")],
        names=["date", "secid"],
    )
    values = np.random.randn(len(index), 3)
    df = DataFrame(values, index, ["X", "Y", "Z"])

    first_date = 20111201
    expected = df.loc[first_date, :]
    result = df.xs(first_date, level="date")
    tm.assert_frame_equal(result, expected)
def test_xs_level_eq_2():
    """Selecting on level position 2 keeps the two outer levels."""
    data = np.random.randn(3, 5)
    codes = [2, 0, 1]
    index = MultiIndex(
        levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]],
        codes=[codes, codes, codes],
    )
    df = DataFrame(data, index=index)

    result = df.xs("c", level=2)

    # "c" labels only the second row, whose outer labels are "a" and "b"
    expected = DataFrame(data[1:2], index=[["a"], ["b"]])
    tm.assert_frame_equal(result, expected)
def entropy(data: pd.DataFrame, keys: Sequence[str], kind: str='kl') -> pd.DataFrame:
    """
    Compute the divergence of two probability distributions.

    Parameters
    ----------
    data : Probability distributions. ``data.xs(key)`` must return a frame
        whose columns are compared one by one.
    keys : Two keys indicating the distributions to be compared (as a tuple
        or a list of two strings), or a list of such pairs.
    kind : Type of metric to use: 'kl', 'kls' (symmetrised 'kl'),
        'shannon' / 'js' (average of the 'kl' values against the midpoint
        distribution) or 'hellinger'.

    Returns
    -------
    kld : Dataframe with one row per key pair (labelled ``'<k1>-<k2>'``)
        and one column per column of the selected data.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported metrics.

    """
    # Check input
    valid_kinds = ['kl', 'kls', 'shannon', 'js', 'hellinger']
    if kind not in valid_kinds:
        e = ('\'{0}\' is not a valid entropy kind! '
             'Valid types: {1}').format(kind, valid_kinds)
        raise ValueError(e)

    # A list is a list of pairs, unless it is a plain list of string keys,
    # which is a single pair (unpacking it element-wise would split the
    # strings into characters).
    single_pair = (not isinstance(keys, list)
                   or (bool(keys) and all(isinstance(k, str) for k in keys)))
    keylist = [keys] if single_pair else keys

    kls = []
    for k1, k2 in keylist:
        data_a = data.xs(k1)
        data_b = data.xs(k2)
        label = '{0}-{1}'.format(k1, k2)

        kl = []
        for col in data_a.columns:
            p = data_a[col]
            q = data_b[col]
            if kind == 'kls':
                S = scipy.stats.entropy(p, q) + scipy.stats.entropy(q, p)
            elif kind == 'kl':
                S = scipy.stats.entropy(p, q)
            elif kind in ['shannon', 'js']:
                M = 0.5 * (p + q)
                S = 0.5 * (scipy.stats.entropy(p, M)
                           + scipy.stats.entropy(q, M))
            else:  # 'hellinger', the only kind left after validation
                S = (1 / np.sqrt(2)
                     * np.sqrt(((np.sqrt(p) - np.sqrt(q)) ** 2).sum()))

            kl.append(pd.Series({label: S}, name=col))

        kls.append(pd.concat(kl, axis=1))

    return pd.concat(kls)
def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 df["D"] = "bar" df["E"] = 3.0 xs = df.xs(0) exp = Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) tm.assert_series_equal(xs, exp) # no columns but Index(dtype=object) df = DataFrame(index=["a", "b", "c"]) result = df.xs("a") expected = Series([], name="a", index=Index([]), dtype=np.float64) tm.assert_series_equal(result, expected)
def subset(data: DataFrame, key='Close', axis=1, level='Stock Info', reset_index=False):
    """Return the cross-section of ``data`` for ``key`` on the given level.

    Parameters
    ----------
    data : frame to select from.
    key : label to select on ``level``.
    axis : axis the level lives on (columns by default).
    level : name or position of the level holding ``key``.
    reset_index : when true, return the selection with its index reset.
    """
    selection = data.xs(key=key, level=level, axis=axis)
    return selection.reset_index() if reset_index else selection
def test_xs_integer_key():
    """An integer key passed to ``xs`` selects the same rows as ``.loc``."""
    # see gh-2107
    dates = range(20111201, 20111205)
    ids = 'abcde'
    # every (date, id) combination, one index row each
    combos = list(product(dates, ids))
    index = MultiIndex.from_tuples(combos, names=['date', 'secid'])
    df = DataFrame(np.random.randn(len(index), 3), index, ['X', 'Y', 'Z'])

    expected = df.loc[20111201, :]
    result = df.xs(20111201, level='date')
    tm.assert_frame_equal(result, expected)
def exec_on_each_tl_column(df: pd.DataFrame, *args, **kwargs):
    """Apply ``func`` to each top-level column group of ``df`` and re-join.

    ``func`` and ``level`` come from the enclosing scope. Frames without
    MultiIndex columns, or whose groups do not share the same sub-columns,
    are passed to ``func`` unchanged.
    """
    has_multi_columns = df.ndim > 1 and isinstance(df.columns, pd.MultiIndex)
    if not has_multi_columns:
        return func(df, *args, **kwargs)

    # check if the shape of the 2nd level is identical, else treat as if
    # not multi index
    if not same_columns_after_level(df, level):
        _log.warning(f"columns in further levels do not follow the same structure! Treat as normal Index")
        return func(df, *args, **kwargs)

    groups = []
    for group in unique_level_columns(df, level):
        group_result = func(df.xs(group, axis=1, level=level), *args, **kwargs)
        # re-attach the group label that the cross-section dropped
        groups.append(
            group_result.to_frame().add_multi_index(group, inplace=True, level=level)
        )

    return pd.concat(groups, axis=1)
def test_xs_named_levels_axis_eq_1(self, key, level, exp_arr, exp_index): # see GH#2903 arr = np.random.randn(4, 4) index = MultiIndex( levels=[["a", "b"], ["bar", "foo", "hello", "world"]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]], names=["lvl0", "lvl1"], ) df = DataFrame(arr, columns=index) result = df.xs(key, level=level, axis=1) expected = DataFrame(exp_arr(arr), columns=exp_index) tm.assert_frame_equal(result, expected)
def generateReport(data: pd.DataFrame):
    """
    Extract the ``signal`` columns and email the ten most recent rows.

    :param data: frame whose columns carry an ``Attributes`` level
        containing ``signal`` (described by the original author as a
        mapping of asset_id to its signal data frame)
    :return: the extracted signal frame
    """
    signals = data.xs('signal', axis=1, level='Attributes')

    # email the following detail
    reporter = EmailReporter(['*****@*****.**'])
    graphs = getGraphs(signals.columns)
    # last ten rows, newest first
    latest = signals.tail(10).sort_index(ascending=False)
    reporter.send(latest, attachments=graphs)

    return signals
def test_xs_multiindex_droplevel_false(self): # GH#19056 mi = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")], names=["level1", "level2"]) df = DataFrame([[1, 2, 3]], columns=mi) result = df.xs("a", axis=1, drop_level=False) expected = DataFrame( [[1, 2]], columns=MultiIndex.from_tuples([("a", "x"), ("a", "y")], names=["level1", "level2"]), ) tm.assert_frame_equal(result, expected)
def test_xs_level(self): result = self.frame.xs("two", level="second") expected = self.frame[self.frame.index.get_level_values(1) == "two"] expected.index = expected.index.droplevel(1) assert_frame_equal(result, expected) index = MultiIndex.from_tuples([("x", "y", "z"), ("a", "b", "c"), ("p", "q", "r")]) df = DataFrame(np.random.randn(3, 5), index=index) result = df.xs("c", level=2) expected = df[1:2] expected.index = expected.index.droplevel(2) assert_frame_equal(result, expected)
def test_xs_integer_key():
    """``xs`` with an integer key on a named level matches ``.loc``."""
    # see gh-2107
    # The builtin range replaces the lrange compatibility helper, which
    # only existed to return a list on Python 2.
    dates = list(range(20111201, 20111205))
    ids = 'abcde'
    index = MultiIndex.from_tuples(
        list(product(dates, ids)),
        names=['date', 'secid'])
    df = DataFrame(
        np.random.randn(len(index), 3), index, ['X', 'Y', 'Z'])

    result = df.xs(20111201, level='date')
    expected = df.loc[20111201, :]
    tm.assert_frame_equal(result, expected)
def create_general_csvs(dataframe: pandas.DataFrame, outdir: Path) -> None:
    """
    Produce CSV's of the summarised general intercomparison results.
    The CSV's are used for direct reads into the LaTeX document.

    One file is written per (product group, table) combination, named
    ``<group>_<table>.csv`` inside ``outdir``.
    """
    measurements = dataframe.index.get_level_values(0).unique()

    # Every "nbart" name also contains "nbar", so the two reflectance
    # groups are separated by the longer name.
    reflectance = [name for name in measurements if "nbar" in name]
    col_groups = {
        "nbar": [name for name in reflectance if "nbart" not in name],
        "nbart": [name for name in reflectance if "nbart" in name],
        "oa": [name for name in measurements if "oa_" in name],
    }

    def _rows_for(statistic):
        # rows of one statistic, across all measurements
        return dataframe.xs((slice(None), statistic))

    min_max_df = pandas.concat(
        [
            _rows_for("min_residual")["Minimum"],
            _rows_for("max_residual")["Maximum"],
            _rows_for("mean_residual")["Mean"],
        ],
        axis=1,
    )
    percent_df = _rows_for("percent_different").drop(columns=["Minimum", "Mean"])

    dataframes = {"stats_residual": min_max_df, "percent_different": percent_df}

    for key, table in dataframes.items():
        for product_group, members in col_groups.items():
            out_fname = outdir.joinpath(f"{product_group}_{key}.csv")

            subset = table.loc[members]
            subset = subset.reset_index()
            subset = subset.rename(columns={"measurement": "Measurement"})

            _LOG.info("writing CSV", out_fname=str(out_fname))
            subset.to_csv(out_fname, index=False)
def test_partial_slicing_with_multiindex(self):
    """String-based datetime lookups on a MultiIndex with a datetime level."""
    # GH 4758
    # partial string indexing with a multi-index buggy
    stamps = date_range("2013-06-19 09:30:00", periods=4, freq="5T")
    df = DataFrame(
        {
            "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"],
            "TICKER": ["ABC", "MNP", "XYZ", "XYZ"],
            "val": [1, 2, 3, 4],
        },
        index=stamps,
    )
    df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True)

    # two of three levels given: the remaining level labels the result
    result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")]
    expected = DataFrame(
        [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"]
    )
    tm.assert_frame_equal(result, expected)

    # full key: a string timestamp behaves like the Timestamp itself
    timestamp_key = (Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC")
    expected = df_multi.loc[timestamp_key]
    result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")]
    tm.assert_series_equal(result, expected)

    # this is an IndexingError as we don't do partial string selection on
    # multi-levels.
    msg = "Too many indexers"
    with pytest.raises(IndexingError, match=msg):
        df_multi.loc[("2013-06-19", "ACCT1", "ABC")]

    # GH 4294
    # partial slice on a series mi
    wide = DataFrame(
        np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000)
    )
    s = wide.stack()

    s2 = s[:-1].copy()
    expected = s2["2000-1-4"]
    result = s2[Timestamp("2000-1-4")]
    tm.assert_series_equal(result, expected)

    result = s[Timestamp("2000-1-4")]
    expected = s["2000-1-4"]
    tm.assert_series_equal(result, expected)

    df2 = DataFrame(s)
    expected = df2.xs("2000-1-4")
    result = df2.loc[Timestamp("2000-1-4")]
    tm.assert_frame_equal(result, expected)
def pandas_mean_std_to_str(
    df: pandas.DataFrame,
    *,
    mean_col: str = "mean",
    std_col: str = "std",
    precision: int = 3,
    level=1,
    axis=1,
) -> pandas.DataFrame:
    r"""
    Join mean and std values into ``"<mean> \pm <std>"`` strings
    (the LaTeX plus-minus command).

    :param df: frame holding the mean and std values under one index level
    :param mean_col: label of the mean values on ``level``
    :param std_col: label of the std values on ``level``
    :param precision: number of decimals both values are rounded to
    :param level: level on which ``mean_col`` / ``std_col`` are looked up
    :param axis: axis that carries ``level``
    :return: frame of strings, one per mean/std pair
    """

    def _as_text(label: str) -> pandas.DataFrame:
        # cross-section for one statistic, rounded and rendered as text
        return df.xs(label, axis=axis, level=level).round(precision).astype(str)

    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    return _as_text(mean_col) + r" \pm " + _as_text(std_col)
def calc_entropy(data: pd.DataFrame,
                 keys: Sequence[str],
                 kind: str = 'kl') -> pd.DataFrame:
    '''
    Compute the divergence of two probability distributions.

    Parameters
    ----------
    data : Probability distributions. ``data.xs(key)`` must return a frame
        whose columns are compared one by one.
    keys : Two keys indicating the distributions to be compared (as a tuple
        or a list of two strings), or a list of such pairs.
    kind : Type of metric to use: 'kl', 'kls' (symmetrised 'kl'),
        'shannon' / 'js' (average of the 'kl' values against the midpoint
        distribution) or 'hellinger'.

    Returns
    -------
    kld : Dataframe with one row per key pair (labelled ``'<k1>-<k2>'``)
        and one column per column of the selected data.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported metrics.

    '''
    # Check input
    valid_kinds = ['kl', 'kls', 'shannon', 'js', 'hellinger']
    if kind not in valid_kinds:
        e = ('\'{0}\' is not a valid entropy kind! '
             'Valid types: {1}').format(kind, valid_kinds)
        raise ValueError(e)

    # A list is a list of pairs, unless it is a plain list of string keys,
    # which is a single pair (unpacking it element-wise would split the
    # strings into characters).
    single_pair = (not isinstance(keys, list)
                   or (bool(keys) and all(isinstance(k, str) for k in keys)))
    keylist = [keys] if single_pair else keys

    kls = []
    for k1, k2 in keylist:
        data_a = data.xs(k1)
        data_b = data.xs(k2)
        label = '{0}-{1}'.format(k1, k2)

        kl = []
        for col in data_a.columns:
            p = data_a[col]
            q = data_b[col]
            if kind == 'kls':
                S = entropy(p, q) + entropy(q, p)
            elif kind == 'kl':
                S = entropy(p, q)
            elif kind in ('shannon', 'js'):
                # 'js' is accepted by the validation above, so it needs a
                # branch here; it shares the 'shannon' computation.
                M = 0.5 * (p + q)
                S = 0.5 * (entropy(p, M) + entropy(q, M))
            else:  # 'hellinger', the only kind left after validation
                S = (1 / np.sqrt(2) * np.sqrt(
                    ((np.sqrt(p) - np.sqrt(q))**2).sum()))

            kl.append(pd.Series({label: S}, name=col))

        kls.append(pd.concat(kl, axis=1))

    return pd.concat(kls)
def _collect_data_by_round(
        data_frame: pd.DataFrame) -> List[RoundModelMetrics]:
    """Split ``data_frame`` into one entry per distinct round number.

    Each entry maps the name of the round-number index level to the round
    value, and ``"model_metrics"`` to the cross-section for that round.
    """
    round_level_name = data_frame.index.names[ROUND_NUMBER_LVL]
    round_numbers = data_frame.index.get_level_values(
        ROUND_NUMBER_LVL).drop_duplicates()

    collected = []
    for round_number_idx in round_numbers:
        entry = {
            round_level_name: round_number_idx,
            "model_metrics": data_frame.xs(round_number_idx,
                                           level=ROUND_NUMBER_LVL),
        }
        collected.append(cast(RoundModelMetrics, entry))
    return collected
def test_xs_droplevel_false_view(self, using_array_manager):
    """``xs(..., drop_level=False)`` on a plain column index and data sharing."""
    # GH#37832
    columns = Index(["a", "b", "c"])

    df = DataFrame([[1, 2, 3]], columns=columns)
    result = df.xs("a", axis=1, drop_level=False)
    # check that result still views the same data as df
    assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)

    # modifying original df also modifies result when having a single block
    df.iloc[0, 0] = 2
    # TODO(ArrayManager) iloc does not update the array inplace using
    # "split" path
    visible_value = 1 if using_array_manager else 2
    expected = DataFrame({"a": [visible_value]})
    tm.assert_frame_equal(result, expected)

    # with mixed dataframe, modifying the parent doesn't modify result
    # TODO the "split" path behaves differently here as with single block
    df = DataFrame([[1, 2.5, "a"]], columns=columns)
    result = df.xs("a", axis=1, drop_level=False)
    df.iloc[0, 0] = 2
    expected = DataFrame({"a": [1]})
    tm.assert_frame_equal(result, expected)
def _create_YState(Y: NDArray[(Any, Any), Float], phenotype_df: pd.DataFrame,
                   offset_df: pd.DataFrame, Y_mask: NDArray[(Any, Any), Float], dt,
                   contigs: Optional[List[str]]) -> Union[YState, Dict[str, YState]]:
    """Build a single ``YState``, or one per contig for LOCO-type offsets.

    When the validated offset type is ``LOCO_OFFSET``, the result is a dict
    keyed by contig, each built from the cross-section of ``offset_df`` at
    index level 1. ``contigs`` defaults to the contigs found in
    ``offset_df``. Any other offset type yields one ``YState`` built from
    the full ``offset_df``.
    """
    offset_type = gwas_fx._validate_offset(phenotype_df, offset_df)

    if offset_type == gwas_fx._OffsetType.LOCO_OFFSET:
        if contigs is None:
            contigs = _get_contigs_from_loco_df(offset_df)

        per_contig = {}
        for contig in contigs:
            contig_offsets = offset_df.xs(contig, level=1)
            per_contig[contig] = _create_one_YState(
                Y, phenotype_df, contig_offsets, Y_mask, dt)
        return per_contig

    return _create_one_YState(Y, phenotype_df, offset_df, Y_mask, dt)
def test_xs_level(self): result = self.frame.xs('two', level='second') expected = self.frame[self.frame.index.get_level_values(1) == 'two'] expected.index = expected.index.droplevel(1) assert_frame_equal(result, expected) index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ('p', 'q', 'r')]) df = DataFrame(np.random.randn(3, 5), index=index) result = df.xs('c', level=2) expected = df[1:2] expected.index = expected.index.droplevel(2) assert_frame_equal(result, expected)
def __split_multi_output_df(
    multi_output_df: pd.DataFrame,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
    """Split a multi-output frame into single-output frames.

    A frame with single-level columns is returned unchanged. Otherwise one
    frame is returned per entry of the first column level, with that level
    dropped.
    """
    columns = multi_output_df.columns
    if columns.nlevels == 1:
        return multi_output_df

    output_names = cast(pd.MultiIndex, columns).levels[0]
    single_outputs = []
    for output_name in output_names:
        single_outputs.append(
            multi_output_df.xs(key=output_name, axis=1, level=0, drop_level=True)
        )
    return single_outputs
def forecast_DK(df: pandas.DataFrame):
    """
    Applies testcount interpolation/extrapolation.

    Currently this assumes the OWID data, which only has an "all" region.
    In the future, this should be replaced with more fine grained data loading!

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a "region" level containing "all"; a
        "predicted_new_tests" column is added to it in place.

    Returns
    -------
    df : pandas.DataFrame
        The "all" region, re-indexed by ("region", "date"), with
        "predicted_new_tests" set to 0 at 2020-01-01 and linearly
        interpolated.
    results
        Second return value of
        ``preprocessing.predict_testcounts_all_regions``.
    """
    # forecast with existing data
    df['predicted_new_tests'], results = preprocessing.predict_testcounts_all_regions(df, 'DK')

    # interpolate the initial testing ramp-up to account for missing data
    # Work on an explicit copy: the assignments below must not write into
    # (or warn about writing into) a selection of the caller's frame.
    df_region = df.xs('all').copy()
    # NOTE(review): if '2020-01-01' is not already in the index, this
    # appends the row at the end rather than at the start - confirm the
    # input always covers that date.
    df_region.loc['2020-01-01', 'predicted_new_tests'] = 0
    df_region['predicted_new_tests'] = df_region['predicted_new_tests'].interpolate('linear')
    df_region['region'] = 'all'

    df = df_region.reset_index().set_index(['region', 'date'])
    return df, results
def same_columns_after_level(df: pd.DataFrame, level=0):
    """Tell whether every group on ``level`` has the same remaining columns.

    Returns ``None`` for objects with fewer than two dimensions, ``False``
    as soon as one group's columns differ from the previous group's, and
    ``True`` otherwise.
    """
    if df.ndim < 2:
        return None

    # the column layout does not depend on the data, so one row is enough
    head = df[:1].copy()

    reference = None
    for top_level_column in unique_level_columns(head, level):
        current = head.xs(top_level_column, axis=1, level=level).columns.to_list()
        if reference is not None and reference != current:
            return False
        reference = current

    return True
def test_xs_doc_example(self): # TODO: more descriptive name # based on example in advanced.rst arrays = [ ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] tuples = list(zip(*arrays)) index = MultiIndex.from_tuples(tuples, names=["first", "second"]) df = DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index) result = df.xs(("one", "bar"), level=("second", "first"), axis=1) expected = df.iloc[:, [0]] tm.assert_frame_equal(result, expected)
def sample_seq(seq: pd.DataFrame, n_samples=10, samp_len=pd.Timedelta(seconds=10), starts=None, reset_time=True):
    """Cut fixed-length windows out of a time-indexed frame.

    Parameters
    ----------
    seq : frame indexed by time, or by a MultiIndex with a level named 't'.
    n_samples : number of random windows drawn when ``starts`` is None.
    samp_len : length of each window.
    starts : window start offsets; drawn uniformly at random when None.
    reset_time : when true, shift each window so it starts at zero.

    Returns
    -------
    list of frames, one per start offset.
    """
    if starts is None:
        latest_start = float(seq.index.max()-samp_len)
        starts = [pd.Timedelta(seconds=t)
                  for t in random.uniform(low=0, high=latest_start, size=n_samples)]

    # isinstance (rather than an exact type comparison) is the idiomatic
    # check; the result is needed twice, so compute it once.
    has_multi_index = isinstance(seq.index, pd.MultiIndex)

    if has_multi_index:
        idx = pd.IndexSlice
        samples = [seq.xs(idx[start:start+samp_len], level='t', drop_level=False)
                   for start in starts]
    else:
        samples = [seq.loc[start:start+samp_len] for start in starts]
    # Some samples will be empty/incomplete due to lapses in measurements
    # TODO: Address multiple devices in real-pd smartwatch measurements

    if not reset_time:
        return samples

    if has_multi_index:
        # NOTE(review): the values come from level position 1 while the
        # target level is addressed by name 't' - confirm 't' is level 1.
        return [samp.set_index(samp.index.set_levels(samp.index.levels[1] - start, level='t'))
                for samp, start in zip(samples, starts)]
    return [samp.set_index(samp.index - start) for samp, start in zip(samples, starts)]
def forecast_FR(df: pandas.DataFrame) -> Tuple[pandas.DataFrame, dict]:
    """
    Applies test count interpolation/extrapolation to French data.

    Parameters
    ----------
    df : pandas.DataFrame
        Data as returned by data loader function.

    Returns
    -------
    df : pandas.DataFrame
        Input dataframe with a new column "predicted_new_tests" and an index expanded back to
        01/01/2020 (filled with zeros until 13/05/2020) to account for the absence of tests in this
        period.
    results : dict
        The fbprophet results by region
    """
    # forecast with existing data
    df["predicted_new_tests"], results = preprocessing.predict_testcounts_all_regions(
        df, "FR")

    # interpolate the initial testing ramp-up to account for missing data
    df_list = []
    for region in df.index.get_level_values(level="region").unique():
        df_region = df.xs(region).copy()

        # rows from 2020-01-01 up to the day before the region's first date
        first_date = df_region.index.get_level_values(level="date")[0]
        df_complement = pandas.DataFrame(
            index=pandas.date_range(
                start="2020-01-01",
                end=first_date - pandas.DateOffset(1, "D"),
                freq="D",
            ),
            columns=df_region.columns,
        )
        df_complement["predicted_new_tests"] = 0

        # pandas.concat replaces DataFrame.append, which was removed in
        # pandas 2.0; the row order (complement first) is unchanged.
        df_region = pandas.concat([df_complement, df_region])
        df_region.index.name = "date"
        df_region["predicted_new_tests"] = df_region["predicted_new_tests"].interpolate(
            "linear")
        df_region["region"] = region
        df_list.append(df_region.reset_index().set_index(["region", "date"]))

    return pandas.concat(df_list), results
def __allocate_photons_to_bins(self, idx: np.ndarray, photons: pd.DataFrame):
    """
    Build one histogram of ``time_rel_pulse`` values per entry of ``idx``.

    :param idx: keys looked up on the ``bins_y`` level of ``photons``
    :param photons: frame with a ``bins_y`` index level and a
        ``time_rel_pulse`` column
    :return: object array shaped like ``idx``; each element is the uint8
        histogram for the matching key
    """
    histograms = np.zeros_like(idx, dtype=object)
    for position, bin_key in enumerate(idx):
        selected = photons.xs(key=bin_key, level="bins_y", drop_level=False)
        pulse_times = selected.loc[:, "time_rel_pulse"].values
        # one bin per unit step between 0 and self.bins_bet_pulses
        edges = np.arange(0, self.bins_bet_pulses + 1, dtype=np.uint8)
        counts = numba_histogram(pulse_times, bins=edges)
        histograms[position] = counts.astype("uint8", copy=False)
    return histograms
def calculate_weights(data: pd.DataFrame):
    """Calculates weights based on the PCA framework.

    Returns a dict mapping each sub-index to its normalised variable
    weights (summing to one).
    """
    sub_idxs = data.columns.get_level_values('subindex').unique()

    # Setting number of PCs found in analysis
    PC_n = pd.Series([3, 2, 3, 2, 3, 2], index=sub_idxs)

    # Sub-indices restricted to a fixed variable list; any other
    # sub-index keeps all of its variables.
    selected_variables = {
        'culture': ['cult_exp', 'whc'],
        'comercial': ['gci', 'ofi', 'patents'],
        'global_reach': ['emb', 'gdelt', 'migrants'],
        'education': [
            'educ_expend', 'pisa_maths', 'pisa_reading', 'pisa_science',
            'publications'
        ],
    }

    final_w = {}
    # Looping over sub-indices
    for idx in sub_idxs:
        # Selecting data
        idx_data = data.xs(idx, axis=1, level='subindex')
        if idx in selected_variables:
            idx_data = idx_data.reindex(selected_variables[idx],
                                        axis=1,
                                        level='variable')

        # Stacking data and dropping rows with any missing value
        all_nonna = idx_data.stack('country').dropna(how='any')

        # Running PCA
        vr, w = pca_analysis(all_nonna, PC_n[idx])

        # Getting weights
        weights = pd.DataFrame(w, columns=all_nonna.columns)
        # Weights below 0.1 are zeroed
        weights[weights < 0.10] = 0

        # Combining the components, each scaled by its vr entry
        sm = np.zeros(weights.shape[1])
        for i in range(len(vr)):
            sm = sm + vr[i] * weights.loc[i]

        final_w[idx] = sm / sm.sum()

    return final_w
def calculate_sub(data: pd.DataFrame, weights: dict):
    """Calculates sub-indices given data and weights.

    Parameters
    ----------
    data : frame whose columns carry the levels 'subindex', 'variable'
        and 'country'.
    weights : mapping from sub-index to a Series of weights indexed by
        variable.

    Returns
    -------
    Frame with (subindex, country) columns holding the weighted sums, with
    all-missing rows and columns removed.
    """
    sub_idxs = data.columns.get_level_values('subindex').unique()
    final_idxs = {}

    for idx in sub_idxs:
        # Selecting data
        int_data = data.xs(idx, axis=1, level='subindex')
        w_idx = weights[idx]
        idx_data = int_data.reindex(w_idx.index, axis=1, level='variable')

        # Multiplying by weight
        prod_data = idx_data.multiply(w_idx, level='variable')

        # TODO: think if this is the way we want to treat missing values
        # for individual variables
        # Missing values become inf so the whole sum turns inf, which is
        # then mapped back to NaN. The country grouping runs on the
        # transpose because DataFrame.groupby(axis=1) is deprecated.
        series = (prod_data.fillna(np.inf)
                  .T.groupby(level='country').sum().T
                  .replace(np.inf, np.nan))
        final_idxs[idx] = series

    df = pd.concat(final_idxs, axis=1, names=['subindex'])
    final_df = df.dropna(how='all').dropna(axis=1, how='all')

    return final_df
def test_xs_missing_values_in_index():
    """Missing labels in the remaining level survive an ``xs`` selection."""
    # see gh-6574
    # missing values in returned index should be preserved
    records = [
        ('a', 'abcde', 1),
        ('b', 'bbcde', 2),
        ('y', 'yzcde', 25),
        ('z', 'xbcde', 24),
        ('z', None, 26),
        ('z', 'zbcde', 25),
        ('z', 'ybcde', 26),
    ]
    df = DataFrame(records, columns=['a1', 'a2', 'cnt'])
    df = df.set_index(['a1', 'a2'])

    # the four 'z' rows, including the one whose second label is missing
    remaining = Index(['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')
    expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=remaining)

    result = df.xs('z', level='a1')
    tm.assert_frame_equal(result, expected)
def test_partial_slicing_with_multiindex_series(self): # GH 4294 # partial slice on a series mi ser = DataFrame(np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000)).stack() s2 = ser[:-1].copy() expected = s2["2000-1-4"] result = s2[Timestamp("2000-1-4")] tm.assert_series_equal(result, expected) result = ser[Timestamp("2000-1-4")] expected = ser["2000-1-4"] tm.assert_series_equal(result, expected) df2 = DataFrame(ser) expected = df2.xs("2000-1-4") result = df2.loc[Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected)
def test_xs_IndexSlice_argument_not_implemented(self, klass): # GH#35301 index = MultiIndex( levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], ) obj = DataFrame(np.random.randn(6, 4), index=index) if klass is Series: obj = obj[0] expected = obj.iloc[-2:].droplevel(0) result = obj.xs(IndexSlice[("foo", "qux", 0), :]) tm.assert_equal(result, expected) result = obj.loc[IndexSlice[("foo", "qux", 0), :]] tm.assert_equal(result, expected)
def test_getitem_int(self, multiindex_dataframe_random_data):
    """Integer labels with ``.loc`` versus integer positions with ``.iloc``."""
    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
    frame = DataFrame(np.random.randn(6, 2), index=index)

    # label 1 on the outer level covers the last three rows
    expected = frame[-3:]
    expected.index = expected.index.droplevel(0)
    result = frame.loc[1]
    tm.assert_frame_equal(result, expected)

    # raises exception
    with pytest.raises(KeyError):
        frame.loc[3]

    # however this will work
    frame = multiindex_dataframe_random_data
    third_label = frame.index[2]
    result = frame.iloc[2]
    expected = frame.xs(third_label)
    tm.assert_series_equal(result, expected)
class TestMultiLevel(unittest.TestCase):
    """Tests for Series/DataFrame objects indexed by a MultiIndex.

    The methods cover construction, label-based getitem/setitem, ``xs``,
    level sorting, per-level counting and aggregation, stack/unstack,
    groupby by level, level swapping/reordering and mixed-depth column
    access.  The code uses Python 2 modules (``cPickle``, ``StringIO``) and
    the ``.ix`` / ``labels=`` / ``sortlevel`` pandas API throughout.
    """

    def setUp(self):
        # self.frame: 10x3 random frame with a two-level index named
        # ('first', 'second') and columns named 'exp'.
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        # self.single_level: a MultiIndex that has only one level.
        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        # one missing value so the skipna/count tests have something to skip
        s[3] = np.NaN
        self.series = s

        # self.ymd: the time frame from tm.makeTimeDataFrame (with tm.N set
        # to 100) grouped by year, month and day of its index and summed.
        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        # Appending the two halves of the frame (and of column 'A') must
        # give back the original.
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_dataframe_constructor(self):
        # A list of arrays/lists passed as index or columns becomes a
        # MultiIndex on that axis only.
        multi = DataFrame(np.random.randn(4, 4),
                          index=[np.array(['a', 'a', 'b', 'b']),
                                 np.array(['x', 'y', 'x', 'y'])])
        self.assert_(isinstance(multi.index, MultiIndex))
        self.assert_(not isinstance(multi.columns, MultiIndex))

        multi = DataFrame(np.random.randn(4, 4),
                          columns=[['a', 'a', 'b', 'b'],
                                   ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.columns, MultiIndex))

    def test_series_constructor(self):
        # Same as above for Series, with arrays, lists, and range data.
        multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']),
                                  np.array(['x', 'y', 'x', 'y'])])
        self.assert_(isinstance(multi.index, MultiIndex))

        multi = Series(1., index=[['a', 'a', 'b', 'b'],
                                  ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.index, MultiIndex))

        multi = Series(range(4), index=[['a', 'a', 'b', 'b'],
                                        ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.index, MultiIndex))

    def test_reindex_level(self):
        # Reindexing per-month sums onto the full index at level 1 must
        # equal a groupby-transform that broadcasts the sums.
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)

        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        # A binary op called with level='month' must match the same op
        # applied to the explicitly broadcast per-month sums.
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(level='month').transform(np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        # Frames with a MultiIndex on either axis survive a pickle roundtrip.
        import cPickle
        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        # Selecting by a list of tuples equals selecting the same rows by
        # position.
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        # The index object passed in must be reused as-is (identity check),
        # for both reindex and .ix, on rows and on columns.
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        # sort_index must keep the level names.
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        # Smoke test: repr and to_string must not raise.
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_repr_name_coincide(self):
        # Level names equal to values in the index ('a', 'b') must not
        # disturb the rendered rows.
        index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
                                       names=['a', 'b', 'c'])

        df = DataFrame({'value': [0, 1]}, index=index)

        lines = repr(df).split('\n')
        self.assert_(lines[2].startswith('a 0 foo'))

    def test_getitem_simple(self):
        # A full tuple key selects one column; unknown keys raise KeyError.
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        # Partial and full tuple lookups on the year/month/day series.
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        # positions 42:65 are taken here to be the (2000, 3) rows
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        s = self.ymd['A']

        # don't segfault, GH #495
        # out of bounds access
        self.assertRaises(IndexError, s.__getitem__, len(self.ymd))

        # generator
        result = s[(x > 0 for x in s)]
        expected = s[s > 0]
        assert_series_equal(result, expected)

    def test_series_setitem(self):
        # Assigning through a partial key must touch exactly the rows
        # 42:65 and leave the rest non-null.
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s.values[42:65]).all())
        self.assert_(notnull(s.values[:42]).all())
        self.assert_(notnull(s.values[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        # placeholder: no assertions yet
        pass

    def test_frame_getitem_setitem_slice(self):
        # getitem
        result = self.frame.ix[:4]
        expected = self.frame[:4]
        assert_frame_equal(result, expected)

        # setitem
        cp = self.frame.copy()
        cp.ix[:4] = 0

        self.assert_((cp.values[:4] == 0).all())
        self.assert_((cp.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        # Row-slice + column-label access through .ix on a MultiIndex frame.
        levels = [['t1', 't2'], ['a','b','c']]
        labels = [[0,0,0,1,1], [0,1,2,0,1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
        df = DataFrame({'value':[1,2,3,7,8]}, index=midx)

        result = df.ix[:,'value']
        assert_series_equal(df['value'], result)

        result = df.ix[1:3,'value']
        assert_series_equal(df['value'][1:3], result)

        result = df.ix[:,:]
        assert_frame_equal(df, result)

        # NOTE(review): `result` is bound to the same object as `df` here,
        # so the two comparisons below compare the frame with itself.
        result = df
        df.ix[:, 'value'] = 10
        result['value'] = 10
        assert_frame_equal(df, result)

        df.ix[:,:] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        # GH #671
        df = DataFrame({'a' : range(10),
                        'b' : range(10),
                        'c' : np.random.randn(10),
                        'd' : np.random.randn(10)})

        idf = df.set_index(['a', 'b'])

        # (tuple, slice), bare tuple elements and xs must all agree
        result = idf.ix[(0, 0), :]
        expected = idf.ix[0, 0]
        expected2 = idf.xs((0, 0))

        assert_series_equal(result, expected)
        assert_series_equal(result, expected2)

    def test_getitem_setitem_tuple_plus_columns(self):
        # GH #1013

        df = self.ymd[:5]

        result = df.ix[(2000, 1, 6), ['A', 'B', 'C']]
        expected = df.ix[2000, 1, 6][['A', 'B', 'C']]
        assert_series_equal(result, expected)

    def test_xs(self):
        # xs with a full key equals .ix with the same key and row 4's values.
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        # xs with only the first-level label returns the sub-frame.
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_xs_level(self):
        # xs on a non-leading level drops that level from the result.
        result = self.frame.xs('two', level='second')
        expected = self.frame[self.frame.index.get_level_values(1) == 'two']
        expected.index = expected.index.droplevel(1)

        assert_frame_equal(result, expected)

        index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'),
                                        ('p', 'q', 'r')])
        df = DataFrame(np.random.randn(3, 5), index=index)
        result = df.xs('c', level=2)
        expected = df[1:2]
        expected.index = expected.index.droplevel(2)
        assert_frame_equal(result, expected)

    def test_xs_level_multiple(self):
        # xs with a tuple key and a list of levels equals chained xs calls.
        from pandas import read_table
        from StringIO import StringIO
        # NOTE(review): this table literal sits on a single line; the
        # 'one'..'four' index names and the xs calls below suggest it was
        # meant to be a multi-line whitespace-separated table - confirm the
        # original line breaks before relying on this test.
        text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        # xs with level=0 equals plain xs on the first level.
        from pandas import read_table
        from StringIO import StringIO
        # NOTE(review): same single-line table literal as in
        # test_xs_level_multiple - confirm the original line breaks.
        text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs('a', level=0)
        expected = df.xs('a')
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        s = self.frame['A']
        result = s[:, 'two']
        expected = self.frame.xs('two', level=1)['A']
        assert_series_equal(result, expected)

        s = self.ymd['A']
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]['A']
        assert_series_equal(result, expected)

        # not implementing this for now

        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))

        # result = s[2000, 3:4]
        # lv =s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)

        # can do this though

    def test_get_loc_single_level(self):
        # Smoke test: every value of a one-level MultiIndex can be looked up.
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        # A top-level column label selects its block with that level dropped.
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_setitem_slice_integers(self):
        # With integer level values, .ix[1:2] is compared against the rows
        # from position 2 onward, i.e. the slice is resolved by label.
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)

        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())

        series = Series(np.random.randn(len(index)), index=index)

        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)

        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        # integer key matches a first-level label
        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        # Partial column key (year, month) on the transposed frame.
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        # Overwriting a float column with booleans must keep the column
        # addressable by its tuple key.
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        # inspects the internal block layout via the private _data attribute
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        # Label slices on the first level, and between partial tuple keys.
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        # sortlevel on an object without a MultiIndex must raise.
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        # reset_index must give the int level an integer dtype and the
        # float level a float dtype.
        tuples = [tuple for tuple in cart_product(['foo', 'bar'],
                                                  [10, 20], [1.0, 1.1])]
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_reset_index_with_drop(self):
        # drop=True discards the levels instead of inserting them as columns.
        deleveled = self.ymd.reset_index(drop = True)
        self.assertEquals(len(deleveled.columns), len(self.ymd.columns))

        deleveled = self.series.reset_index()
        self.assert_(isinstance(deleveled, DataFrame))
        self.assert_(len(deleveled.columns) == len(self.series.index.levels)+1)

        deleveled = self.series.reset_index(drop = True)
        self.assert_(isinstance(deleveled, Series))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        # Adding a string column must not change how the other columns sort.
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        # count(level=i) must match groupby(level=i).count() on both axes.
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        # punch a few NaNs so the counts are not all equal
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        # Level values that do not occur in the labels are expected to
        # appear with a count of 0 (hence the reindex + fillna).
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        # Counting by level on empty objects yields zeros per level value.
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_unstack_multiple_no_empty_columns(self):
        # Unstacking two levels at once must not produce all-NaN columns.
        index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
                                        (1, 'baz', 1), (1, 'qux', 1)])

        s = Series(np.random.randn(4), index=index)

        unstacked = s.unstack([1, 2])
        expected = unstacked.dropna(axis=1, how='all')
        assert_frame_equal(unstacked, expected)

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        # NOTE(review): result and expected are computed but never compared;
        # an assert_frame_equal(result, expected) looks to be missing.
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)

    def test_stack_mixed_dtype(self):
        # Stacking a frame with a string column must leave the other
        # top-level groups as float.
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        # unstack/stack roundtrip on the result of a four-key groupby.
        df = DataFrame({'state': ['naive','naive','naive',
                                  'activ','activ','activ'],
                        'exp':['a','b','b','b','a','a'],
                        'barcode':[1,2,3,4,1,3],
                        'v':['hi','hi','bye','bye','bye','peace'],
                        'extra': np.arange(6.)})

        result = df.groupby(['state','exp','barcode','v']).apply(len)

        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        # A level may be given by name instead of number.
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        # Unstacking a list of levels equals unstacking them one at a time.
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names,
                          expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all')
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all')
        assert_frame_equal(unstacked, expected.ix[:, unstacked.columns])

    def test_groupby_transform(self):
        # apply and transform with the same function must agree once aligned.
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)

        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_groupby_corner(self):
        midx = MultiIndex(levels=[['foo'],['bar'],['baz']],
                          labels=[[0],[0],[0]], names=['one','two','three'])
        df = DataFrame([np.random.rand(4)], columns=['a','b','c','d'],
                       index=midx)
        # should work
        df.groupby(level='three')

    def test_join(self):
        # Outer join of two overlapping row/column slices of the frame.
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        # blank out in `expected` exactly the cells the join left missing
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        # Levels may be swapped by number or by name; swapping twice
        # restores the original index.
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA' : self.frame,
                       'ItemB' : self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        # reorder_levels by name equals the matching chain of swaplevel calls.
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        # Inserting a new tuple-keyed column keeps the columns a MultiIndex.
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        # Arithmetic aligns on the union of the two MultiIndexes.
        x = Series(data=[1,2,3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)]))

        y = Series(data=[4,5,6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        # second level out of order within the last group
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        # first level itself out of order
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1],
                                   [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        # For the all-float frame, writing through df['foo'].values is
        # visible in df, i.e. the selection behaves as a view.
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        # Top-level selection after a column was appended out of order.
        df = self.frame.T
        df['foo', 'four'] = 'foo'

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        # First-level values are listed as bar, baz, qux, foo.
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    def test_count(self):
        # count accepts a level name as well as a level number.
        frame = self.frame.copy()
        frame.index.names = ['a', 'b']

        result = frame.count(level='b')
        expect = self.frame.count(level=1)
        assert_frame_equal(result, expect)

        result = frame.count(level='a')
        expect = self.frame.count(level=0)
        assert_frame_equal(result, expect)

        series = self.series.copy()
        series.index.names = ['a', 'b']

        result = series.count(level='b')
        expect = self.series.count(level=1)
        assert_series_equal(result, expect)

        result = series.count(level='a')
        expect = self.series.count(level=0)
        assert_series_equal(result, expect)

        # unknown level names raise
        self.assertRaises(Exception, series.count, 'x')
        self.assertRaises(Exception, frame.count, level='x')

    # reduction method names exercised by the two level-aggregation tests below
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        # Each reduction called with level= must equal groupby(level).agg of
        # the same reduction, for both levels and both skipna settings.
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                              range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        # Frame version of the check above, additionally over both axes.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            # `pieces` collects every group passed to aggf; it is not
            # inspected afterwards
            pieces = []
            def aggf(x):
                pieces.append(x)
                return getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_std_var_pass_ddof(self):
        # ddof must be forwarded when var/std are computed by level.
        index = MultiIndex.from_arrays([np.arange(5).repeat(10),
                                        np.tile(np.arange(10), 5)])
        df = DataFrame(np.random.randn(len(index), 5), index=index)

        for meth in ['var', 'std']:
            ddof = 4
            alt = lambda x: getattr(x, meth)(ddof=ddof)

            result = getattr(df[0], meth)(level=0, ddof=ddof)
            expected = df[0].groupby(level=0).agg(alt)
            assert_series_equal(result, expected)

            result = getattr(df, meth)(level=0, ddof=ddof)
            expected = df.groupby(level=0).agg(alt)
            assert_frame_equal(result, expected)

    def test_frame_series_agg_multiple_levels(self):
        # level= may be a list of level names.
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        # Grouping by level numbers, by the level value arrays and by level
        # names must all give the same result.
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        # placeholder: no assertions yet
        pass

    def test_multilevel_consolidate(self):
        # Smoke test: consolidate after inserting a column keyed ('Totals', '').
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        # Partial selection keeps the names of the remaining levels.
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        # `exp` is updated by writing into .values of the selection, which
        # relies on that selection sharing memory with `exp`.
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        # the integer column is expected to come back as float64
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_getitem_lowerdim_corner(self):
        # ('bar', 'three') does not occur in the frame's index labels, so
        # both reading and writing through it must raise KeyError.
        self.assertRaises(KeyError, self.frame.ix.__getitem__,
                          (('bar', 'three'), 'B'))

        self.assertRaises(KeyError, self.frame.ix.__setitem__,
                          (('bar', 'three'), 'B'), 0)

    #----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        # Skipped unconditionally; everything after the raise is unreachable
        # until the skip is removed.
        raise nose.SkipTest

        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    #----------------------------------------------------------------------

    def test_to_html(self):
        # Smoke test: to_html must not raise with a named columns index.
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()

    def test_level_with_tuples(self):
        # A level whose values are themselves tuples: the whole tuple is
        # one label of the first level.
        index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0),
                                    ('foo', 'qux', 0)],
                                   [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar', 0)]
        result2 = series.ix[('foo', 'bar', 0)]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2))

        result = frame.ix[('foo', 'bar', 0)]
        result2 = frame.xs(('foo', 'bar', 0))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        # same checks with 2-tuples, whose length equals the number of levels
        index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'),
                                    ('foo', 'qux')],
                                   [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar')]
        result2 = series.ix[('foo', 'bar')]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        result = frame.ix[('foo', 'bar')]
        result2 = frame.xs(('foo', 'bar'))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_int_series_slicing(self):
        # Integer slices are compared against positional slices of the index.
        s = self.ymd['A']
        result = s[5:]
        expected = s.reindex(s.index[5:])
        assert_series_equal(result, expected)

        exp = self.ymd['A'].copy()
        s[5:] = 0
        exp.values[5:] = 0
        self.assert_(np.array_equal(s.values, exp.values))

        result = self.ymd[5:]
        expected = self.ymd.reindex(s.index[5:])
        assert_frame_equal(result, expected)

    def test_mixed_depth_get(self):
        # Columns padded with '' in the lower levels can be reached by the
        # shorter key.
        arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [ '', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  [ '', 'wx', 'wy', '', '', '']]

        # .sort() on the zip result requires it to be a list (Python 2)
        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df['a']
        expected = df['a','','']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, 'a')

        result = df['routine1','result1']
        expected = df['routine1','result1','']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ('routine1', 'result1'))

    def test_mixed_depth_insert(self):
        # Inserting under a short key equals inserting under the ''-padded key.
        arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [ '', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  [ '', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.copy()
        expected = df.copy()
        result['b'] = [1,2,3,4]
        expected['b','',''] = [1,2,3,4]
        assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        # drop with short keys, full tuple keys, and level= on mixed-depth
        # columns.
        arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [ '', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  [ '', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.drop('a',axis=1)
        expected = df.drop([('a','','')],axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(['top'],axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        expected = expected.drop([('top','OD','wy')], axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(('top', 'OD', 'wx'), axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        assert_frame_equal(expected, result)

        # NOTE(review): both assignments below are overwritten before
        # `expected` is used again, so neither is ever compared.
        expected = df.drop([('top','OD','wy')], axis=1)
        expected = df.drop('top', axis=1)

        result = df.drop('result1', level=1, axis=1)
        expected = df.drop([('routine1', 'result1', ''),
                            ('routine2', 'result1', '')], axis=1)
        assert_frame_equal(expected, result)

    def test_mixed_depth_pop(self):
        # pop with a short key equals pop with the ''-padded key, and
        # popping a top-level label removes its whole group.
        arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [ '', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  [ '', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop('a')
        expected = df2.pop(('a','',''))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name,'a')

        expected = df1['top']
        df1 = df1.drop(['top'],axis=1)
        result = df2.pop('top')
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)

    def test_reindex_level_partial_selection(self):
        # Selecting a list of first-level labels, via reindex(level=0) and
        # via .ix, on rows and on columns.
        result = self.frame.reindex(['foo', 'qux'], level=0)
        expected = self.frame.ix[[0, 1, 2, 7, 8, 9]]
        assert_frame_equal(result, expected)

        result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0)
        assert_frame_equal(result, expected.T)

        result = self.frame.ix[['foo', 'qux']]
        assert_frame_equal(result, expected)

        result = self.frame['A'].ix[['foo', 'qux']]
        assert_series_equal(result, expected['A'])

        result = self.frame.T.ix[:, ['foo', 'qux']]
        assert_frame_equal(result, expected.T)

    def test_setitem_multiple_partial(self):
        # Assigning through a list or a slice of first-level labels equals
        # assigning each label separately (frame first, then column 'A').
        expected = self.frame.copy()
        result = self.frame.copy()
        result.ix[['foo', 'bar']] = 0
        expected.ix['foo'] = 0
        expected.ix['bar'] = 0
        assert_frame_equal(result, expected)

        expected = self.frame.copy()
        result = self.frame.copy()
        result.ix['foo':'bar'] = 0
        expected.ix['foo'] = 0
        expected.ix['bar'] = 0
        assert_frame_equal(result, expected)

        expected = self.frame['A'].copy()
        result = self.frame['A'].copy()
        result.ix[['foo', 'bar']] = 0
        expected.ix['foo'] = 0
        expected.ix['bar'] = 0
        assert_series_equal(result, expected)

        expected = self.frame['A'].copy()
        result = self.frame['A'].copy()
        result.ix['foo':'bar'] = 0
        expected.ix['foo'] = 0
        expected.ix['bar'] = 0
        assert_series_equal(result, expected)

    def test_drop_level(self):
        # drop with level= removes every row/column carrying those labels.
        result = self.frame.drop(['bar', 'qux'], level='first')
        expected = self.frame.ix[[0, 1, 2, 5, 6]]
        assert_frame_equal(result, expected)

        result = self.frame.drop(['two'], level='second')
        expected = self.frame.ix[[0, 2, 3, 6, 7, 9]]
        assert_frame_equal(result, expected)

        result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first')
        expected = self.frame.ix[[0, 1, 2, 5, 6]].T
        assert_frame_equal(result, expected)

        result = self.frame.T.drop(['two'], axis=1, level='second')
        expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T
        assert_frame_equal(result, expected)

    def test_unicode_repr_issues(self):
        # Smoke test: repr of levels containing non-ASCII text must not raise.
        levels = [Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3']),
                  Index([0, 1])]
        labels = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
        index = MultiIndex(levels=levels, labels=labels)

        repr(index.levels)
class TestMultiLevel(unittest.TestCase): def setUp(self): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp")) self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]], labels=[[0, 1, 2, 3]], names=["first"]) # create test series object arrays = [ ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) s[3] = np.NaN self.series = s tm.N = 100 self.tdf = tm.makeTimeDataFrame() self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work self.ymd.index.levels = [lev.astype("i8") for lev in self.ymd.index.levels] self.ymd.index.names = ["year", "month", "day"] def test_append(self): a, b = self.frame[:5], self.frame[5:] result = a.append(b) tm.assert_frame_equal(result, self.frame) result = a["A"].append(b["A"]) tm.assert_series_equal(result, self.frame["A"]) def test_reindex_level(self): # axis=0 month_sums = self.ymd.sum(level="month") result = month_sums.reindex(self.ymd.index, level=1) expected = self.ymd.groupby(level="month").transform(np.sum) assert_frame_equal(result, expected) # Series result = month_sums["A"].reindex(self.ymd.index, level=1) expected = self.ymd["A"].groupby(level="month").transform(np.sum) assert_series_equal(result, expected) # axis=1 month_sums = self.ymd.T.sum(axis=1, level="month") result = month_sums.reindex(columns=self.ymd.index, level=1) expected = self.ymd.groupby(level="month").transform(np.sum).T assert_frame_equal(result, expected) def test_binops_level(self): def _check_op(opname): op = getattr(DataFrame, opname) month_sums = self.ymd.sum(level="month") result 
= op(self.ymd, month_sums, level="month") broadcasted = self.ymd.groupby(level="month").transform(np.sum) expected = op(self.ymd, broadcasted) assert_frame_equal(result, expected) # Series op = getattr(Series, opname) result = op(self.ymd["A"], month_sums["A"], level="month") broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum) expected = op(self.ymd["A"], broadcasted) assert_series_equal(result, expected) _check_op("sub") _check_op("add") _check_op("mul") _check_op("div") def test_pickle(self): import cPickle def _test_roundtrip(frame): pickled = cPickle.dumps(frame) unpickled = cPickle.loads(pickled) assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) _test_roundtrip(self.frame.T) _test_roundtrip(self.ymd) _test_roundtrip(self.ymd.T) def test_reindex(self): reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]] expected = self.frame.ix[[0, 3]] assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) self.assert_(chunk.index is new_index) chunk = self.ymd.ix[new_index] self.assert_(chunk.index is new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) self.assert_(chunk.columns is new_index) chunk = ymdT.ix[:, new_index] self.assert_(chunk.columns is new_index) def test_sort_index_preserve_levels(self): result = self.frame.sort_index() self.assertEquals(result.index.names, self.frame.index.names) def test_repr_to_string(self): repr(self.frame) repr(self.ymd) repr(self.frame.T) repr(self.ymd.T) buf = StringIO() self.frame.to_string(buf=buf) self.ymd.to_string(buf=buf) self.frame.T.to_string(buf=buf) self.ymd.T.to_string(buf=buf) def test_getitem_simple(self): df = self.frame.T col = df["foo", "one"] assert_almost_equal(col.values, df.values[:, 0]) self.assertRaises(KeyError, df.__getitem__, ("foo", "four")) self.assertRaises(KeyError, df.__getitem__, "foobar") def test_series_getitem(self): s = self.ymd["A"] result = s[2000, 3] 
result2 = s.ix[2000, 3] expected = s.reindex(s.index[42:65]) expected.index = expected.index.droplevel(0).droplevel(0) assert_series_equal(result, expected) result = s[2000, 3, 10] expected = s[49] self.assertEquals(result, expected) # fancy result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] expected = s.reindex(s.index[49:51]) assert_series_equal(result, expected) # key error self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) def test_series_getitem_corner(self): s = self.ymd["A"] # don't segfault, GH #495 # out of bounds access self.assertRaises(IndexError, s.__getitem__, len(self.ymd)) # generator result = s[(x > 0 for x in s)] expected = s[s > 0] assert_series_equal(result, expected) def test_series_setitem(self): s = self.ymd["A"] s[2000, 3] = np.nan self.assert_(isnull(s.values[42:65]).all()) self.assert_(notnull(s.values[:42]).all()) self.assert_(notnull(s.values[65:]).all()) s[2000, 3, 10] = np.nan self.assert_(isnull(s[49])) def test_series_slice_partial(self): pass def test_frame_getitem_setitem_slice(self): # getitem result = self.frame.ix[:4] expected = self.frame[:4] assert_frame_equal(result, expected) # setitem cp = self.frame.copy() cp.ix[:4] = 0 self.assert_((cp.values[:4] == 0).all()) self.assert_((cp.values[4:] != 0).all()) def test_frame_getitem_setitem_multislice(self): levels = [["t1", "t2"], ["a", "b", "c"]] labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] midx = MultiIndex(labels=labels, levels=levels, names=[None, "id"]) df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx) result = df.ix[:, "value"] assert_series_equal(df["value"], result) result = df.ix[1:3, "value"] assert_series_equal(df["value"][1:3], result) result = df.ix[:, :] assert_frame_equal(df, result) result = df df.ix[:, "value"] = 10 result["value"] = 10 assert_frame_equal(df, result) df.ix[:, :] = 10 assert_frame_equal(df, result) def test_getitem_tuple_plus_slice(self): # GH #671 df = DataFrame({"a": range(10), "b": range(10), "c": np.random.randn(10), "d": 
np.random.randn(10)}) idf = df.set_index(["a", "b"]) result = idf.ix[(0, 0), :] expected = idf.ix[0, 0] expected2 = idf.xs((0, 0)) assert_series_equal(result, expected) assert_series_equal(result, expected2) def test_xs(self): xs = self.frame.xs(("bar", "two")) xs2 = self.frame.ix[("bar", "two")] assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) def test_xs_partial(self): result = self.frame.xs("foo") result2 = self.frame.ix["foo"] expected = self.frame.T["foo"].T assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_xs_level(self): result = self.frame.xs("two", level="second") expected = self.frame[self.frame.index.get_level_values(1) == "two"] expected.index = expected.index.droplevel(1) assert_frame_equal(result, expected) index = MultiIndex.from_tuples([("x", "y", "z"), ("a", "b", "c"), ("p", "q", "r")]) df = DataFrame(np.random.randn(3, 5), index=index) result = df.xs("c", level=2) expected = df[1:2] expected.index = expected.index.droplevel(2) assert_frame_equal(result, expected) def test_xs_level_multiple(self): from pandas import read_table from StringIO import StringIO text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" df = read_table(StringIO(text), sep="\s+") result = df.xs(("a", 4), level=["one", "four"]) expected = df.xs("a").xs(4, level="four") assert_frame_equal(result, expected) def test_xs_level0(self): from pandas import read_table from StringIO import StringIO text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" df = read_table(StringIO(text), sep="\s+") result = df.xs("a", level=0) expected = df.xs("a") self.assertEqual(len(result), 2) assert_frame_equal(result, expected) def test_xs_level_series(self): s = 
self.frame["A"] result = s[:, "two"] expected = self.frame.xs("two", level=1)["A"] assert_series_equal(result, expected) s = self.ymd["A"] result = s[2000, 5] expected = self.ymd.ix[2000, 5]["A"] assert_series_equal(result, expected) # not implementing this for now self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4))) # result = s[2000, 3:4] # lv =s.index.get_level_values(1) # expected = s[(lv == 3) | (lv == 4)] # expected.index = expected.index.droplevel(0) # assert_series_equal(result, expected) # can do this though def test_get_loc_single_level(self): s = Series(np.random.randn(len(self.single_level)), index=self.single_level) for k in self.single_level.values: s[k] def test_getitem_toplevel(self): df = self.frame.T result = df["foo"] expected = df.reindex(columns=df.columns[:3]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) result = df["bar"] result2 = df.ix[:, "bar"] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_getitem_setitem_slice_integers(self): index = MultiIndex(levels=[[0, 1, 2], [0, 2]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"]) res = frame.ix[1:2] exp = frame.reindex(frame.index[2:]) assert_frame_equal(res, exp) frame.ix[1:2] = 7 self.assert_((frame.ix[1:2] == 7).values.all()) series = Series(np.random.randn(len(index)), index=index) res = series.ix[1:2] exp = series.reindex(series.index[2:]) assert_series_equal(res, exp) series.ix[1:2] = 7 self.assert_((series.ix[1:2] == 7).values.all()) def test_getitem_int(self): levels = [[0, 1], [0, 1, 2]] labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] index = MultiIndex(levels=levels, labels=labels) frame = DataFrame(np.random.randn(6, 2), index=index) result = frame.ix[1] expected = frame[-3:] expected.index = 
expected.index.droplevel(0) assert_frame_equal(result, expected) # raises exception self.assertRaises(KeyError, frame.ix.__getitem__, 3) # however this will work result = self.frame.ix[2] expected = self.frame.xs(self.frame.index[2]) assert_series_equal(result, expected) def test_getitem_partial(self): ymd = self.ymd.T result = ymd[2000, 2] expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) assert_frame_equal(result, expected) def test_getitem_slice_not_sorted(self): df = self.frame.sortlevel(1).T # buglet with int typechecking result = df.ix[:, : np.int32(3)] expected = df.reindex(columns=df.columns[:3]) assert_frame_equal(result, expected) def test_setitem_change_dtype(self): dft = self.frame.T s = dft["foo", "two"] dft["foo", "two"] = s > s.median() assert_series_equal(dft["foo", "two"], s > s.median()) self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex)) reindexed = dft.reindex(columns=[("foo", "two")]) assert_series_equal(reindexed["foo", "two"], s > s.median()) def test_frame_setitem_ix(self): self.frame.ix[("bar", "two"), "B"] = 5 self.assertEquals(self.frame.ix[("bar", "two"), "B"], 5) # with integer labels df = self.frame.copy() df.columns = range(3) df.ix[("bar", "two"), 1] = 7 self.assertEquals(df.ix[("bar", "two"), 1], 7) def test_fancy_slice_partial(self): result = self.frame.ix["bar":"baz"] expected = self.frame[3:7] assert_frame_equal(result, expected) result = self.ymd.ix[(2000, 2):(2000, 4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] assert_frame_equal(result, expected) def test_sortlevel(self): df = self.frame.copy() df.index = np.arange(len(df)) self.assertRaises(Exception, df.sortlevel, 0) # axis=1 # series a_sorted = self.frame["A"].sortlevel(0) self.assertRaises(Exception, self.frame.reset_index()["A"].sortlevel) # preserve names self.assertEquals(a_sorted.index.names, self.frame.index.names) def 
test_delevel_infer_dtype(self): tuples = [tuple for tuple in cart_product(["foo", "bar"], [10, 20], [1.0, 1.1])] index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() self.assert_(com.is_integer_dtype(deleveled["prm1"])) self.assert_(com.is_float_dtype(deleveled["prm2"])) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) self.assertEquals(len(deleveled.columns), len(self.ymd.columns)) deleveled = self.series.reset_index() self.assert_(isinstance(deleveled, DataFrame)) self.assert_(len(deleveled.columns) == len(self.series.index.levels) + 1) deleveled = self.series.reset_index(drop=True) self.assert_(isinstance(deleveled, Series)) def test_sortlevel_by_name(self): self.frame.index.names = ["first", "second"] result = self.frame.sortlevel(level="second") expected = self.frame.sortlevel(level=1) assert_frame_equal(result, expected) def test_sortlevel_mixed(self): sorted_before = self.frame.sortlevel(1) df = self.frame.copy() df["foo"] = "bar" sorted_after = df.sortlevel(1) assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) dft = self.frame.T sorted_before = dft.sortlevel(1, axis=1) dft["foo", "three"] = "bar" sorted_after = dft.sortlevel(1, axis=1) assert_frame_equal( sorted_before.drop([("foo", "three")], axis=1), sorted_after.drop([("foo", "three")], axis=1) ) def test_count_level(self): def _check_counts(frame, axis=0): index = frame._get_axis(axis) for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count(axis=axis) expected = expected.reindex_like(result).astype("i8") assert_frame_equal(result, expected) self.frame.ix[1, [1, 2]] = np.nan self.frame.ix[7, [0, 1]] = np.nan self.ymd.ix[1, [1, 2]] = np.nan self.ymd.ix[7, [0, 1]] = np.nan _check_counts(self.frame) _check_counts(self.ymd) _check_counts(self.frame.T, axis=1) 
_check_counts(self.ymd.T, axis=1) # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) self.frame["D"] = "foo" result = self.frame.count(level=0, numeric_only=True) assert_almost_equal(result.columns, ["A", "B", "C"]) def test_count_level_series(self): index = MultiIndex( levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]] ) s = Series(np.random.randn(len(index)), index=index) result = s.count(level=0) expected = s.groupby(level=0).count() assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0)) result = s.count(level=1) expected = s.groupby(level=1).count() assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0)) def test_count_level_corner(self): s = self.frame["A"][:0] result = s.count(level=0) expected = Series(0, index=s.index.levels[0]) assert_series_equal(result, expected) df = self.frame[:0] result = df.count(level=0) expected = DataFrame({}, index=s.index.levels[0], columns=df.columns).fillna(0).astype(int) assert_frame_equal(result, expected) def test_unstack(self): # just check that it works for now unstacked = self.ymd.unstack() unstacked2 = unstacked.unstack() # test that ints work unstacked = self.ymd.astype(int).unstack() def test_stack(self): # regular roundtrip unstacked = self.ymd.unstack() restacked = unstacked.stack() assert_frame_equal(restacked, self.ymd) unlexsorted = self.ymd.sortlevel(2) unstacked = unlexsorted.unstack(2) restacked = unstacked.stack() assert_frame_equal(restacked.sortlevel(0), self.ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) restacked = unstacked.stack().swaplevel(1, 2) assert_frame_equal(restacked.sortlevel(0), self.ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) restacked = unstacked.stack(0).swaplevel(1, 2) assert_frame_equal(restacked.sortlevel(0), self.ymd) # 
columns unsorted unstacked = self.ymd.unstack() unstacked = unstacked.sort(axis=1, ascending=False) restacked = unstacked.stack() assert_frame_equal(restacked, self.ymd) # more than 2 levels in the columns unstacked = self.ymd.unstack(1).unstack(1) result = unstacked.stack(1) expected = self.ymd.unstack() assert_frame_equal(result, expected) result = unstacked.stack(2) expected = self.ymd.unstack(1) assert_frame_equal(result, expected) result = unstacked.stack(0) expected = self.ymd.stack().unstack(1).unstack(1) assert_frame_equal(result, expected) # not all levels present in each echelon unstacked = self.ymd.unstack(2).ix[:, ::3] stacked = unstacked.stack().stack() ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) # stack with negative number result = self.ymd.unstack(0).stack(-2) expected = self.ymd.unstack(0).stack(0) def test_stack_mixed_dtype(self): df = self.frame.T df["foo", "four"] = "foo" df = df.sortlevel(1, axis=1) stacked = df.stack() assert_series_equal(stacked["foo"], df["foo"].stack()) self.assert_(stacked["bar"].dtype == np.float_) def test_unstack_bug(self): df = DataFrame( { "state": ["naive", "naive", "naive", "activ", "activ", "activ"], "exp": ["a", "b", "b", "b", "a", "a"], "barcode": [1, 2, 3, 4, 1, 3], "v": ["hi", "hi", "bye", "bye", "bye", "peace"], "extra": np.arange(6.0), } ) result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack() assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() self.assertEquals(unstacked.index.name, "first") self.assertEquals(unstacked.columns.names, ["exp", "second"]) restacked = unstacked.stack() self.assertEquals(restacked.index.names, self.frame.index.names) def test_unstack_level_name(self): result = self.frame.unstack("second") expected = self.frame.unstack(level=1) assert_frame_equal(result, 
expected) def test_stack_level_name(self): unstacked = self.frame.unstack("second") result = unstacked.stack("exp") expected = self.frame.unstack().stack(0) assert_frame_equal(result, expected) result = self.frame.stack("exp") expected = self.frame.stack() assert_series_equal(result, expected) def test_stack_unstack_multiple(self): unstacked = self.ymd.unstack(["year", "month"]) expected = self.ymd.unstack("year").unstack("month") assert_frame_equal(unstacked, expected) self.assertEquals(unstacked.columns.names, expected.columns.names) # series s = self.ymd["A"] s_unstacked = s.unstack(["year", "month"]) assert_frame_equal(s_unstacked, expected["A"]) restacked = unstacked.stack(["year", "month"]) restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sortlevel(0) assert_frame_equal(restacked, self.ymd) self.assertEquals(restacked.index.names, self.ymd.index.names) # GH #451 unstacked = self.ymd.unstack([1, 2]) expected = self.ymd.unstack(1).unstack(1) assert_frame_equal(unstacked, expected) unstacked = self.ymd.unstack([2, 1]) expected = self.ymd.unstack(2).unstack(1) assert_frame_equal(unstacked, expected) def test_groupby_transform(self): s = self.frame["A"] grouper = s.index.get_level_values(0) grouped = s.groupby(grouper) applied = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) assert_series_equal(applied.reindex(expected.index), expected) def test_groupby_corner(self): midx = MultiIndex(levels=[["foo"], ["bar"], ["baz"]], labels=[[0], [0], [0]], names=["one", "two", "three"]) df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx) # should work df.groupby(level="three") def test_join(self): a = self.frame.ix[:5, ["A"]] b = self.frame.ix[2:, ["B", "C"]] joined = a.join(b, how="outer").reindex(self.frame.index) expected = self.frame.copy() expected.values[np.isnan(joined.values)] = np.nan self.assert_(not np.isnan(joined.values).all()) assert_frame_equal(joined, expected) def 
test_swaplevel(self): swapped = self.frame["A"].swaplevel(0, 1) swapped2 = self.frame["A"].swaplevel("first", "second") self.assert_(not swapped.index.equals(self.frame.index)) assert_series_equal(swapped, swapped2) back = swapped.swaplevel(0, 1) back2 = swapped.swaplevel("second", "first") self.assert_(back.index.equals(self.frame.index)) assert_series_equal(back, back2) ft = self.frame.T swapped = ft.swaplevel("first", "second", axis=1) exp = self.frame.swaplevel("first", "second").T assert_frame_equal(swapped, exp) def test_swaplevel_panel(self): panel = Panel({"ItemA": self.frame, "ItemB": self.frame * 2}) result = panel.swaplevel(0, 1, axis="major") expected = panel.copy() expected.major_axis = expected.major_axis.swaplevel(0, 1) tm.assert_panel_equal(result, expected) def test_reorder_levels(self): result = self.ymd.reorder_levels(["month", "day", "year"]) expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) assert_frame_equal(result, expected) result = self.ymd["A"].reorder_levels(["month", "day", "year"]) expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2) assert_series_equal(result, expected) result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1) expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) assert_frame_equal(result, expected) self.assertRaises(Exception, self.ymd.index.reorder_levels, [1, 2, 3]) def test_insert_index(self): df = self.ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] self.assert_(isinstance(df.columns, MultiIndex)) self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all()) def test_alignment(self): x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])) y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])) res = x - y exp_index = x.index.union(y.index) exp = x.reindex(exp_index) - y.reindex(exp_index) assert_series_equal(res, exp) # hit non-monotonic code path res = x[::-1] - y[::-1] exp_index = x.index.union(y.index) exp = 
x.reindex(exp_index) - y.reindex(exp_index) assert_series_equal(res, exp) def test_is_lexsorted(self): levels = [[0, 1], [0, 1, 2]] index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) self.assert_(index.is_lexsorted()) index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) self.assert_(not index.is_lexsorted()) index = MultiIndex(levels=levels, labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) self.assert_(not index.is_lexsorted()) self.assert_(index.lexsort_depth == 0) def test_frame_getitem_view(self): df = self.frame.T df["foo"].values[:] = 0 self.assert_((df["foo"].values == 0).all()) # but not if it's mixed-type df["foo", "four"] = "foo" df = df.sortlevel(0, axis=1) df["foo"]["one"] = 2 self.assert_((df["foo", "one"] == 0).all()) def test_frame_getitem_not_sorted(self): df = self.frame.T df["foo", "four"] = "foo" arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())] result = df["foo"] result2 = df.ix[:, "foo"] expected = df.reindex(columns=df.columns[arrays[0] == "foo"]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) df = df.T result = df.xs("foo") result2 = df.ix["foo"] expected = df.reindex(df.index[arrays[0] == "foo"]) expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) def test_series_getitem_not_sorted(self): arrays = [ ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) arrays = [np.array(x) for x in zip(*index.get_tuple_index())] result = s["qux"] result2 = s.ix["qux"] expected = s[arrays[0] == "qux"] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) def test_count(self): frame = self.frame.copy() 
frame.index.names = ["a", "b"] result = frame.count(level="b") expect = self.frame.count(level=1) assert_frame_equal(result, expect) result = frame.count(level="a") expect = self.frame.count(level=0) assert_frame_equal(result, expect) series = self.series.copy() series.index.names = ["a", "b"] result = series.count(level="b") expect = self.series.count(level=1) assert_series_equal(result, expect) result = series.count(level="a") expect = self.series.count(level=0) assert_series_equal(result, expect) self.assertRaises(Exception, series.count, "x") self.assertRaises(Exception, frame.count, level="x") AGG_FUNCTIONS = ["sum", "prod", "min", "max", "median", "mean", "skew", "mad", "std", "var"] def test_series_group_min_max(self): for op, level, skipna in cart_product(self.AGG_FUNCTIONS, range(2), [False, True]): grouped = self.series.groupby(level=level) aggf = lambda x: getattr(x, op)(skipna=skipna) # skipna=True leftside = grouped.agg(aggf) rightside = getattr(self.series, op)(level=level, skipna=skipna) assert_series_equal(leftside, rightside) def test_frame_group_ops(self): self.frame.ix[1, [1, 2]] = np.nan self.frame.ix[7, [0, 1]] = np.nan for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, range(2), range(2), [False, True]): if axis == 0: frame = self.frame else: frame = self.frame.T grouped = frame.groupby(level=level, axis=axis) aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis) leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] self.assert_(leftside._get_axis(axis).equals(level_index)) self.assert_(rightside._get_axis(axis).equals(level_index)) assert_frame_equal(leftside, rightside) def test_frame_series_agg_multiple_levels(self): result = self.ymd.sum(level=["year", "month"]) expected = self.ymd.groupby(level=["year", "month"]).sum() assert_frame_equal(result, expected) result = 
self.ymd["A"].sum(level=["year", "month"]) expected = self.ymd["A"].groupby(level=["year", "month"]).sum() assert_series_equal(result, expected) def test_groupby_multilevel(self): result = self.ymd.groupby(level=[0, 1]).mean() k1 = self.ymd.index.get_level_values(0) k2 = self.ymd.index.get_level_values(1) expected = self.ymd.groupby([k1, k2]).mean() assert_frame_equal(result, expected) self.assertEquals(result.index.names, self.ymd.index.names[:2]) result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() assert_frame_equal(result, result2) def test_groupby_multilevel_with_transform(self): pass def test_multilevel_consolidate(self): index = MultiIndex.from_tuples([("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) df["Totals", ""] = df.sum(1) df = df.consolidate() def test_ix_preserve_names(self): result = self.ymd.ix[2000] result2 = self.ymd["A"].ix[2000] self.assertEquals(result.index.names, self.ymd.index.names[1:]) self.assertEquals(result2.index.names, self.ymd.index.names[1:]) result = self.ymd.ix[2000, 2] result2 = self.ymd["A"].ix[2000, 2] self.assertEquals(result.index.name, self.ymd.index.names[2]) self.assertEquals(result2.index.name, self.ymd.index.names[2]) def test_partial_set(self): # GH #397 df = self.ymd.copy() exp = self.ymd.copy() df.ix[2000, 4] = 0 exp.ix[2000, 4].values[:] = 0 assert_frame_equal(df, exp) df["A"].ix[2000, 4] = 1 exp["A"].ix[2000, 4].values[:] = 1 assert_frame_equal(df, exp) df.ix[2000] = 5 exp.ix[2000].values[:] = 5 assert_frame_equal(df, exp) # this works...for now df["A"].ix[14] = 5 self.assertEquals(df["A"][14], 5) def test_unstack_preserve_types(self): # GH #403 self.ymd["E"] = "foo" self.ymd["F"] = 2 unstacked = self.ymd.unstack("month") self.assert_(unstacked["A", 1].dtype == np.float64) self.assert_(unstacked["E", 1].dtype == np.object_) self.assert_(unstacked["F", 1].dtype == np.float64) def test_getitem_lowerdim_corner(self): 
self.assertRaises(KeyError, self.frame.ix.__getitem__, (("bar", "three"), "B")) self.assertRaises(KeyError, self.frame.ix.__setitem__, (("bar", "three"), "B"), 0) # ---------------------------------------------------------------------- # AMBIGUOUS CASES! def test_partial_ix_missing(self): raise nose.SkipTest result = self.ymd.ix[2000, 0] expected = self.ymd.ix[2000]["A"] assert_series_equal(result, expected) # need to put in some work here # self.ymd.ix[2000, 0] = 0 # self.assert_((self.ymd.ix[2000]['A'] == 0).all()) self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6)) self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0) def test_fancy_2d(self): raise nose.SkipTest result = self.frame.ix["foo", "B"] expected = self.frame.xs("foo")["B"] assert_series_equal(result, expected) ft = self.frame.T result = ft.ix["B", "foo"] expected = ft.xs("B")["foo"] assert_series_equal(result, expected) # ---------------------------------------------------------------------- def test_to_html(self): self.ymd.columns.name = "foo" self.ymd.to_html() self.ymd.T.to_html() def test_level_with_tuples(self): index = MultiIndex( levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[("foo", "bar", 0)] result2 = series.ix[("foo", "bar", 0)] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) self.assertRaises(KeyError, series.__getitem__, (("foo", "bar", 0), 2)) result = frame.ix[("foo", "bar", 0)] result2 = frame.xs(("foo", "bar", 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) index = MultiIndex( levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]], labels=[[0, 0, 1, 1, 2, 2], [0, 
1, 0, 1, 0, 1]], ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[("foo", "bar")] result2 = series.ix[("foo", "bar")] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) result = frame.ix[("foo", "bar")] result2 = frame.xs(("foo", "bar")) expected = frame[:2] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) def test_int_series_slicing(self): s = self.ymd["A"] result = s[5:] expected = s.reindex(s.index[5:]) assert_series_equal(result, expected) exp = self.ymd["A"].copy() s[5:] = 0 exp.values[5:] = 0 self.assert_(np.array_equal(s.values, exp.values)) result = self.ymd[5:] expected = self.ymd.reindex(s.index[5:]) assert_frame_equal(result, expected) def test_mixed_depth_get(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], ["", "OD", "OD", "result1", "result2", "result1"], ["", "wx", "wy", "", "", ""], ] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) result = df["a"] expected = df["a", "", ""] assert_series_equal(result, expected) self.assertEquals(result.name, "a") result = df["routine1", "result1"] expected = df["routine1", "result1", ""] assert_series_equal(result, expected) self.assertEquals(result.name, ("routine1", "result1")) def test_mixed_depth_insert(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], ["", "OD", "OD", "result1", "result2", "result1"], ["", "wx", "wy", "", "", ""], ] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) result = df.copy() expected = df.copy() result["b"] = [1, 2, 3, 4] expected["b", "", ""] = [1, 2, 3, 4] assert_frame_equal(result, expected) def test_mixed_depth_drop(self): arrays = [ ["a", "top", "top", 
"routine1", "routine1", "routine2"], ["", "OD", "OD", "result1", "result2", "result1"], ["", "wx", "wy", "", "", ""], ] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) result = df.drop("a", axis=1) expected = df.drop([("a", "", "")], axis=1) assert_frame_equal(expected, result) result = df.drop(["top"], axis=1) expected = df.drop([("top", "OD", "wx")], axis=1) expected = expected.drop([("top", "OD", "wy")], axis=1) assert_frame_equal(expected, result) def test_mixed_depth_pop(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], ["", "OD", "OD", "result1", "result2", "result1"], ["", "wx", "wy", "", "", ""], ] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) df1 = df.copy() df2 = df.copy() result = df1.pop("a") expected = df2.pop(("a", "", "")) assert_series_equal(expected, result) assert_frame_equal(df1, df2) self.assertEquals(result.name, "a") expected = df1["top"] df1 = df1.drop(["top"], axis=1) result = df2.pop("top") assert_frame_equal(expected, result) assert_frame_equal(df1, df2)
class Scores(object):
    """Scores attached to (segment, track, label) triples.

    Values are stored in `dataframe_`, a pandas `DataFrame` with one row per
    (segment, track) pair and one column per label.  The (segment, track)
    pairs are mirrored in `annotation_`.  `hasChanged_` records that the
    frame's row order may no longer match the annotation; methods that
    depend on that order call `_reindexIfNeeded` first.

    Parameters
    ----------
    uri : str, optional
    modality : str, optional
    annotation : `Annotation`, optional
        Tracks to create rows for.  It is copied.  A missing or empty
        annotation yields an empty `Scores`.
    labels : iterable, optional
        Column labels.
    values : array-like, optional
        Score values, one row per track and one column per label.
    dtype : optional
        Data type of the underlying frame.  Defaults to the dtype of
        `values`, or `float` when `values` is not provided.

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """Build scores from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate
            (segment, track, label) tuples

        Returns
        -------
        scores : `Scores`
            One row per (segment, track) pair and one column per label.

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
            columns=PYANNOTE_LABEL, aggfunc=aggfunc)

        # register every (segment, track) row of the pivot in an annotation
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=dataframe.columns,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):
        super(Scores, self).__init__()

        names = self._index_names()

        if annotation:
            annotation = annotation.copy()
            index = self._build_index(annotation)
        else:
            annotation = Annotation(uri=uri, modality=modality)
            # empty index with one (empty) level per name; `from_arrays`
            # avoids the `labels=`/`codes=` keyword that differs between
            # pandas versions
            index = MultiIndex.from_arrays([[] for _ in names], names=names)

        self.annotation_ = annotation

        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        if dtype is None:
            # read the dtype from the converted array so that plain lists
            # (which have no `.dtype`) are accepted as `values`
            dtype = float if data is None else data.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    @staticmethod
    def _index_names():
        """Row index level names: one per `Segment` field, then the track."""
        return [PYANNOTE_SEGMENT + '_' + field
                for field in Segment._fields] + [PYANNOTE_TRACK]

    @classmethod
    def _build_index(cls, annotation):
        """Row index with one entry per track of `annotation`, in its order."""
        return Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=cls._index_names())

    def copy(self):
        """Return a copy that owns its own dataframe and annotation."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):
        """Remove the rows of a segment, or of one (segment, track) pair.

        Raises
        ------
        KeyError
            If `key` is neither a `Segment` nor a 2-tuple.
        """
        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError(key)

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        """Return the score stored for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        """Store `value` for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        Nothing is stored when `segment` is falsy.
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 name of `__bool__`
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return bool(self.annotation_)

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        """Iterate over segments (same as iterating over the instance)."""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """
        # forward the caller's choices to the annotation
        return self.annotation_.new_track(
            segment, candidate=candidate, prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple

        NaN values are skipped.
        """
        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # one row per (segment, track); yield one tuple per non-NaN label
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        return [label for label in labels if not isinstance(label, Unknown)]

    def _reindexIfNeeded(self):
        """Reorder the rows of `dataframe_` to follow `annotation_` tracks.

        Does nothing unless `hasChanged_` is set; clears the flag afterwards.
        """
        if not self.hasChanged_:
            return

        new_index = self._build_index(self.annotation_)
        self.dataframe_ = self.dataframe_.reindex(new_index)
        self.hasChanged_ = False

    def retrack(self):
        """Return a copy whose tracks are those of the retracked annotation."""
        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        # rows keep their position; only their index entries are replaced
        retracked.dataframe_.index = self._build_index(annotation)

        return retracked

    def apply(self, func, axis=0):
        """Return a copy whose dataframe is `dataframe_.apply(func, axis)`."""
        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True
        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """
        ranked = self.copy()
        # pandas ranks start at 1; shift them so that the best rank is 0
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # values ranked n or worse are replaced by NaN
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.

        Raises
        ------
        TypeError
            If `labels` is not a set.
        """
        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        # copy so that editing the subset cannot alter this instance's tracks
        subset.annotation_ = self.annotation_.copy()
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.

        Returns
        -------
        annotation : `Annotation`
        """
        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        # boolean frame: does the best score pass the test(s)?
        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:
            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        # keep a value only where `best` has one, NaN everywhere else
        large_enough.dataframe_ = large_enough.dataframe_.where(
            best.dataframe_.notnull(), other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.  None is returned for any other
            value of `mode`.

        Raises
        ------
        NotImplementedError
            In 'intersection' mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """
        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            # boolean row mask, in the same order as the (reindexed) frame
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:
            # TODO: two original segments may be cropped into the same
            # resulting segment, so an implementation has to keep the
            # original -> cropped mapping, create new track names on
            # conflict and copy values column by column.
            raise NotImplementedError('')

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        from .notebook import repr_scores
        return repr_scores(self)
#!/usr/bin/env python
"""
A basic demo of pandas: looking up a row whose index label is a tuple.
"""
from pandas import DataFrame

# one column of letters, indexed by tuples of strings
df = DataFrame(["a", "b", "c"], index=[("0", "1"), ("1", "2"), ("2", "3")])

# `.values` is the plain ndarray of the frame
print(df.values)

# first attempt: look the tuple label up through `.ix`
try:
    print(df.ix[("0", "1")])
except Exception:
    # only the failed lookup is reported; Ctrl-C and SystemExit still propagate
    print("yes, accessing .ix with tuple does not work")

# second attempt: cross-section on the same tuple label
print(df.xs(("0", "1")))
# Add the series without its first element to the series without its last
# element.
pprint( s[ 1 : ] + s[ : -1 ] )

# <demo> --- stop ---

# 'DataFrame' objects are 2D.
# They can be created from a 'dict' of Series objects, a 2D array, etc...
df = DataFrame( { "col1": randn( 4 ), "col2": randn( 4 ) } )
pprint( df )
df = DataFrame( randn( 6, 4 ), columns=[ 'A', 'B', 'C', 'D' ] )
pprint( df )

# A number of selection operations are available.
# For example, you can select a particular row across columns.
pprint( df.xs( 3 ) )

# Series can be used to perform vectorized ("broadcasted") operations
# against DataFrames.
# The row is selected by label with `.loc`; the frame has the default
# integer index, so label 2 is the third row.
pprint( df - df.loc[ 2 ] )

# The transpose of a DataFrame can easily be found.
pprint( df.T )

# Matrix multiplication can be performed on DataFrames.
pprint( df.T.dot(df) )

# <demo> --- stop ---

# Pandas can directly read CSV files into data frames.
df = pd.read_csv( "demo_data_1.csv" )
def f(s):
    """Return a Series indexed like `s`: 0 everywhere, 1 at the last position."""
    s2 = pd.Series(0, index=s.index)
    s2.iloc[-1] = 1
    return s2
# Apply `f` to the 'case' column of every ('Primdx_grpdsm', 'case') group and
# store the result as a new column.
df["PrimDSMgrpCase1"] = df.groupby(['Primdx_grpdsm','case'])['case'].apply(f)
df
# Index of the rows that remain after sorting and then dropping duplicates on
# ('case', 'ru', 'opdate') with take_last=True.
ep1=df.sort(['ru','case','opdate','lst_svc'],ascending=True).drop_duplicates(['case','ru','opdate'],take_last=True).index
df['ep1']=0
# NOTE(review): `ep1` holds index labels but is used positionally through
# `.iloc`, and the assignment is chained on a column selection -- confirm
# that it writes into `df` as intended.
df['ep1'].iloc[ep1] =1
# 'opdate' value of the row where 'lst_svc' is smallest
df.opdate[df.lst_svc.idxmin()]
# row where 'ru' is largest
df.xs(df.ru.idxmax())
# rows whose 'ru' is 1 or 2
df[df.ru.isin([1,2])]
# rows whose 'lst_svc' is later than 2011-06-05
df[df.lst_svc > datetime(2011, 6, 5)]
# NOTE(review): the next two lines look like text pasted from documentation
# or an interactive session rather than code -- confirm before running.
To return a Series of the same shape as the original
In [102]: s.where(s > 0)
# keep 'ru' where it is greater than 1, use 9 elsewhere
df.ru=df.ru.where(df.ru > 1,9)
# overwrite with 1 every row whose 'ru' is greater than 5
df[df.ru >5]=1
#nulls
# 'Provname ' values (note the trailing space in the column name) of the rows
# whose 'ru' is null
df['Provname '][pd.isnull(df.ru)]
def matrix(self, domain, independent,
           domainb=(('without value', 'withoutvalue'),)):
    """Count co-occurrences of domain words and independent words.

    Creates the matrix M from the paper on cross-domain sentiment
    classification.  Every document of the joined data set is tokenised,
    lower-cased, stripped of non-letters and stemmed.  For each document,
    every (domain word, independent word) pair found in it adds 1 to the
    corresponding cell.

    Parameters
    ----------
    domain : iterable of str
        Row labels of the matrix.  They are compared with stemmed tokens,
        so they are presumably already stemmed -- TODO confirm.
    independent : iterable of str
        Column labels of the matrix (same remark as for `domain`).
    domainb : sequence of 2-tuples of str, optional
        Two-part expressions.  Both parts are stemmed; when a stemmed first
        part and a stemmed second part both occur in a document and their
        concatenation is a `domain` word, that word is counted as well.

    Returns
    -------
    matrixM : DataFrame
        Integer counts, indexed by `domain`, with `independent` as columns.
    """
    stemmer = SnowballStemmer("english")

    # stemmed first / second parts of the two-part expressions
    if domainb:
        first_parts, second_parts = zip(*domainb)
    else:
        first_parts, second_parts = (), ()
    domain1 = set(map(stemmer.stem, first_parts))
    domain2 = set(map(stemmer.stem, second_parts))

    # loop invariants: build the lookup sets once
    domain_words = set(domain)
    independent_words = set(independent)

    matrixM = DataFrame(0, index=domain, columns=independent)

    # NOTE(review): `df1` and `df2` are not defined in this method; they are
    # presumably module-level names (or were meant to be self.df1 / self.df2)
    # -- confirm.
    joinf = joindocuments(df1, df2)
    undersampleddf = joinf.join(self.df1, self.df2)

    non_letters = re.compile('[^A-Za-z]+')

    for text in undersampleddf[self.column].values:
        # stems of the document's non-empty, letters-only, lower-cased tokens
        stemmed = set()
        for token in text.split():
            cleaned = non_letters.sub('', token.lower())
            if not cleaned:
                continue
            try:
                stemmed.add(str(stemmer.stem(cleaned)))
            except Exception:
                # best effort: a token that cannot be stemmed is ignored
                continue

        independent_hits = independent_words & stemmed
        if not independent_hits:
            # both kinds of update need at least one independent word
            continue

        # `.loc` writes into the frame itself; the result of `xs` may be a
        # copy, in which case assigning into it would leave the counts at 0
        for word in domain_words & stemmed:
            for column in independent_hits:
                matrixM.loc[word, column] += 1

        for first in domain1 & stemmed:
            for second in domain2 & stemmed:
                label = first + second
                if label in domain_words:
                    for column in independent_hits:
                        matrixM.loc[label, column] += 1

    return matrixM
def main():
    """Print a tour of basic pandas Series/DataFrame operations.

    The function takes no argument and returns nothing; every step prints
    its result.  It is written with Python 2 `print` statements.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Replace the NaN of the new label with a fill value
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # labels not found: forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill
    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    # swap the first two entries: states becomes ["Texas", "Utah", "California"]
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]
    # Delete an entry
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj
    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj
    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # column (axis=1)
    print data.xs("two", axis=1)
    # icol/irow: the "i" stands for integer index
    print data.icol(1)
    print data.irow(1)
    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2
    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has a fill_value argument too
    # the other arithmetic methods are sub/div/mul
    # Calculation between a DataFrame and a Series
    print "", ""
    # subtract from each row (broadcast)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series
    series2 = Series(range(3), index=list("bef"))
    print frame + series2
    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)
    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    # NOTE(review): this local name shadows the builtin `format`
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series
    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                      columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)
    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)
    # order by multiple columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])
    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # no duplicated ranks
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge on the index of both frames
    print pd.merge(f1, f2, left_index=True, right_index=True)
    # Axis index with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
class TestMultiLevel(unittest.TestCase):
    """Tests for DataFrame / Series objects indexed by a MultiIndex.

    ``setUp`` builds the shared fixtures:

    * ``self.frame`` -- 10x3 random frame with a two-level
      ('first', 'second') row index and columns named 'exp'.
    * ``self.single_level`` -- a MultiIndex that has only one level.
    * ``self.series`` -- 8-element random Series on a two-level index with
      one NaN at position 3.
    * ``self.ymd`` -- a time frame summed by (year, month, day), with integer
      levels named 'year', 'month', 'day'.
    """

    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        # Appending the two halves must reproduce the original object.
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)

        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        # A binary op with ``level=`` must match the same op applied to the
        # group-transformed (broadcast) operand.
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(
                level='month').transform(np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        # Selecting by a list of tuples must match positional selection.
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        # The index object passed in must be reused as-is (identity check).
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        # Smoke test: only checks that rendering does not raise.
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        # Not compared; evaluated only to check that .ix accepts the same
        # partial key without raising.
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    def test_get_loc_single_level(self):
        # Smoke test: every key of a one-level MultiIndex must be retrievable.
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame[2:]
        assert_frame_equal(res, exp)

        series = Series(np.random.randn(len(index)), index=index)

        res = series.ix[1:2]
        exp = series[2:]
        assert_series_equal(res, exp)

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000, 2):(2000, 4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        # sortlevel must fail on an object whose index is not a MultiIndex.
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        # list() replaces a no-op comprehension that rebound the builtin
        # name ``tuple``.
        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        # Adding a string column must not change the resulting sort order.
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                # Align label order and dtype before comparing.
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        # Inject missing values so the counts are not trivially uniform.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        # Empty slices: expected counts are all zero.
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        # Both values were previously computed and then discarded, so the
        # negative-level case was never actually verified.
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame({'state': ['naive', 'naive', 'naive',
                                  'activ', 'activ', 'activ'],
                        'exp': ['a', 'b', 'b', 'b', 'a', 'a'],
                        'barcode': [1, 2, 3, 4, 1, 3],
                        'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'],
                        'extra': np.arange(6.)})

        result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len)

        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names,
                          expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)

        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_join(self):
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA': self.frame,
                       'ItemB': self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1, 2, 3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2),
                                                 ("B", 3)]))

        y = Series(data=[4, 5, 6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2),
                                                 ("B", 3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1],
                                   [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    # Reduction method names exercised by the level-wise aggregation tests.
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                              range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            # The lambda is consumed within the same iteration, so capturing
            # the loop variables by reference is safe here.
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        # Smoke test: consolidate() must not raise once a column has been
        # added to a frame whose columns are a MultiIndex.
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_partial_ix_missing(self):
        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        # The (row key, column) pair must be ONE argument. Passing
        # ``(2000, 6), 0`` as two positional arguments made __getitem__ fail
        # on its argument count, so the check passed without ever performing
        # the lookup.
        self.assertRaises(Exception, self.ymd.ix.__getitem__,
                          ((2000, 6), 0))

    def test_to_html(self):
        # Smoke test: HTML rendering must not raise in either orientation.
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()
class TestMultiLevel(unittest.TestCase):
    """Tests for DataFrame / Series objects indexed by a MultiIndex.

    ``setUp`` builds the shared fixtures:

    * ``self.frame`` -- 10x3 random frame with a two-level
      ("first", "second") row index and columns A, B, C.
    * ``self.single_level`` -- a MultiIndex that has only one level.
    * ``self.ymd`` -- a time frame summed by (year, month, day).
    """

    def setUp(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])

        self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]], labels=[[0, 1, 2, 3]], names=["first"])

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()

    def test_append(self):
        # Appending the two halves must reproduce the original object.
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a["A"].append(b["A"])
        tm.assert_series_equal(result, self.frame["A"])

    def test_pickle(self):
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        # Selecting by a list of tuples must match positional selection.
        reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        # The index object passed in must be reused as-is (identity check).
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_repr_to_string(self):
        # Smoke test: only checks that rendering does not raise.
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df["foo", "one"]
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ("foo", "four"))
        self.assertRaises(KeyError, df.__getitem__, "foobar")

    def test_series_getitem(self):
        s = self.ymd["A"]

        result = s[2000, 3]
        # Not compared; evaluated only to check that .ix accepts the same
        # partial key without raising.
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        s = self.ymd["A"]

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(("bar", "two"))
        xs2 = self.frame.ix[("bar", "two")]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs("foo")
        result2 = self.frame.ix["foo"]
        expected = self.frame.T["foo"].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix["foo", "B"]
        expected = self.frame.xs("foo")["B"]
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix["B", "foo"]
        expected = ft.xs("B")["foo"]
        assert_series_equal(result, expected)

    def test_get_loc_single_level(self):
        # Smoke test: every key of a one-level MultiIndex must be retrievable.
        s = Series(np.random.randn(len(self.single_level)), index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df["foo"]
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df["bar"]
        result2 = df.ix[:, "bar"]

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft["foo", "two"]
        dft["foo", "two"] = s > s.median()
        assert_series_equal(dft["foo", "two"], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[("foo", "two")])
        assert_series_equal(reindexed["foo", "two"], s > s.median())

    def test_fancy_slice_partial(self):
        result = self.frame.ix["bar":"baz"]
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000, 2):(2000, 4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        # sortlevel must fail on an object whose index is not a MultiIndex.
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame["A"].sortlevel(0)
        self.assertRaises(Exception, self.frame.delevel()["A"].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_sortlevel_by_name(self):
        self.frame.index.names = ["first", "second"]
        result = self.frame.sortlevel(level="second")
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        # Adding a string column must not change the resulting sort order.
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df["foo"] = "bar"
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft["foo", "three"] = "bar"

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(
            sorted_before.drop([("foo", "three")], axis=1), sorted_after.drop([("foo", "three")], axis=1)
        )

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                # Both values were previously computed and then discarded, so
                # the helper verified nothing. Align label order and dtype,
                # then compare.
                # NOTE(review): confirm this normalisation is sufficient for
                # the pandas version this file targets.
                expected = expected.reindex_like(result).astype("i8")
                assert_frame_equal(result, expected)

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

    def test_count_level_corner(self):
        # Empty slices: expected counts are all zero.
        s = self.frame["A"][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0], columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df["foo", "four"] = "foo"
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked["foo"], df["foo"].stack())
        self.assert_(stacked["bar"].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame(
            {
                "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
                "exp": ["a", "b", "b", "b", "a", "a"],
                "barcode": [1, 2, 3, 4, 1, 3],
                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
                "extra": np.arange(6.0),
            }
        )

        result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)

        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked, result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, "first")
        self.assertEquals(unstacked.columns.names, [None, "second"])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack("second")
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_groupby_transform(self):
        s = self.frame["A"]
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)

        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_join(self):
        a = self.frame.ix[:5, ["A"]]
        b = self.frame.ix[2:, ["B", "C"]]

        joined = a.join(b, how="outer").reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame["A"].swaplevel(0, 1)
        self.assert_(not swapped.index.equals(self.frame.index))

        back = swapped.swaplevel(0, 1)
        self.assert_(back.index.equals(self.frame.index))

    def test_swaplevel_panel(self):
        panel = Panel({"ItemA": self.frame, "ItemB": self.frame * 2})

        result = panel.swaplevel(0, 1, axis="major")
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        # ``expected`` was previously built and then discarded; compare it
        # with the result so the test can actually fail.
        tm.assert_panel_equal(result, expected)

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        pass

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels, labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df["foo"].values[:] = 0
        self.assert_((df["foo"].values == 0).all())

        # but not if it's mixed-type
        df["foo", "four"] = "foo"
        df = df.sortlevel(0, axis=1)
        df["foo"]["one"] = 2
        self.assert_((df["foo", "one"] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df["foo", "four"] = "foo"

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df["foo"]
        result2 = df.ix[:, "foo"]
        expected = df.reindex(columns=df.columns[arrays[0] == "foo"])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs("foo")
        result2 = df.ix["foo"]
        expected = df.reindex(df.index[arrays[0] == "foo"])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s["qux"]
        result2 = s.ix["qux"]
        expected = s[arrays[0] == "qux"]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)