def test_hash_pandas_object(self):
    """Run both hash checks against a broad sample of pandas objects.

    Every sample is passed to ``self.check_equal`` and then to
    ``self.check_not_equal_with_index``.
    """
    samples = [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Series(),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
        MultiIndex.from_product(
            [range(5),
             ['foo', 'bar', 'baz'],
             pd.date_range('20130101', periods=2)]),
        MultiIndex.from_product(
            [pd.CategoricalIndex(list('aabc')), range(3)]),
    ]

    for sample in samples:
        self.check_equal(sample)
        self.check_not_equal_with_index(sample)
def __init__(self, node_name, parents=None, node_domain=None):
    """Initialise the node and allocate a zero-filled table for it.

    Parameters
    ----------
    node_name : forwarded unchanged to the parent class initialiser.
    parents : optional collection of objects exposing ``name`` and
        ``domain``; ``None`` or an empty collection means "no parents".
    node_domain : optional sequence of domain values (copied by slicing);
        ``None`` or an empty sequence falls back to ``['T', 'F']``.
    """
    super(CPT, self).__init__(node_name)

    # Domain of this node: copy the caller's sequence or use the default.
    if node_domain is not None and len(node_domain) != 0:
        self._domain = node_domain[:]
    else:
        self._domain = ['T', 'F']

    # Table shape: m rows (product of parent domain sizes), n columns.
    self.m = 1
    self.n = len(self._domain)

    if parents is not None and len(parents) != 0:
        names = []
        domains = []
        for parent in parents:
            names.append(parent.name)
            domains.append(parent.domain)
            self.m = self.m * len(parent.domain)
        self.cols = MultiIndex.from_product([self._domain],
                                            names=[self._name])
        self.rows = MultiIndex.from_product(domains, names=names)
    else:
        # No parents: a single row labelled with the node's own name.
        # (self._name is presumably set by the parent initialiser.)
        self.rows = [self._name]
        self.cols = MultiIndex.from_product([self._domain])

    self._values = np.zeros((self.m, self.n))
    self._table = DataFrame(self._values,
                            index=self.rows,
                            columns=self.cols)
def setup_method(self, method):
    """Create Series/DataFrame fixtures for each index flavour.

    Attributes are named ``<kind>_<type>`` (for example ``series_ints``);
    afterwards, for every name in ``self._objs`` a dict keyed by
    ``self._typs`` is stored on ``self`` (missing combinations map to None).
    """
    # integer labels
    self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=lrange(0, 8, 2),
                                columns=lrange(0, 12, 3))

    # unsigned integer labels
    self.series_uints = Series(np.random.rand(4),
                               index=UInt64Index(lrange(0, 8, 2)))
    self.frame_uints = DataFrame(
        np.random.randn(4, 4),
        index=UInt64Index(lrange(0, 8, 2)),
        columns=UInt64Index(lrange(0, 12, 3)))

    # float labels
    self.series_floats = Series(np.random.rand(4),
                                index=Float64Index(range(0, 8, 2)))
    self.frame_floats = DataFrame(
        np.random.randn(4, 4),
        index=Float64Index(range(0, 8, 2)),
        columns=Float64Index(range(0, 12, 3)))

    # MultiIndex labels
    m_idces = [
        MultiIndex.from_product([[1, 2], [3, 4]]),
        MultiIndex.from_product([[5, 6], [7, 8]]),
        MultiIndex.from_product([[9, 10], [11, 12]]),
    ]
    self.series_multi = Series(np.random.rand(4), index=m_idces[0])
    self.frame_multi = DataFrame(np.random.randn(4, 4),
                                 index=m_idces[0],
                                 columns=m_idces[1])

    # string labels
    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4),
                                  index=list('abcd'),
                                  columns=list('ABCD'))

    # mixed integer/string labels
    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                 index=[2, 4, 'null', 8])

    # datetime labels, ascending
    self.series_ts = Series(np.random.randn(4),
                            index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4),
                              index=date_range('20130101', periods=4))

    # datetime labels, descending
    dates_rev = date_range('20130101', periods=4)
    dates_rev = dates_rev.sort_values(ascending=False)
    self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
    self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)

    # empty containers
    self.frame_empty = DataFrame()
    self.series_empty = Series()

    # form agglomerates
    for obj_kind in self._objs:
        by_type = dict(
            (typ, getattr(self, '%s_%s' % (obj_kind, typ), None))
            for typ in self._typs)
        setattr(self, obj_kind, by_type)
def test_join_multi_to_multi(self, join_type):
    """Join a three-level frame with a two-level frame on shared levels.

    The join result is compared with the equivalent reset_index/merge/
    set_index pipeline; joins whose ``on`` does not match the number of
    levels of the other frame's index must raise ValueError.
    """
    # GH 20475
    left_index = MultiIndex.from_product(
        [list('abc'), list('xy'), [1, 2]],
        names=['abc', 'xy', 'num'])
    left = DataFrame({'v1': range(12)}, index=left_index)

    right_index = MultiIndex.from_product(
        [list('abc'), list('xy')],
        names=['abc', 'xy'])
    right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                      index=right_index)

    result = left.join(right, on=['abc', 'xy'], how=join_type)

    merged = left.reset_index().merge(right.reset_index(),
                                      on=['abc', 'xy'],
                                      how=join_type)
    expected = merged.set_index(['abc', 'xy', 'num'])
    assert_frame_equal(expected, result)

    msg = (r'len\(left_on\) must equal the number of levels in the index'
           ' of "right"')

    # a single key against a two-level right index
    with pytest.raises(ValueError, match=msg):
        left.join(right, on='xy', how=join_type)

    # two keys against a three-level right index
    with pytest.raises(ValueError, match=msg):
        right.join(left, on=['abc', 'xy'], how=join_type)
def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): # GH #19686 # .loc should work with nested indexers which can be # any list-like objects (see `pandas.api.types.is_list_like`) or slices def convert_nested_indexer(indexer_type, keys): if indexer_type == np.ndarray: return np.array(keys) if indexer_type == slice: return slice(*keys) return indexer_type(keys) a = [10, 20, 30] b = [1, 2, 3] index = MultiIndex.from_product([a, b]) df = DataFrame( np.arange(len(index), dtype='int64'), index=index, columns=['Data']) keys = ([10, 20], [2, 3]) types = (indexer_type_1, indexer_type_2) # check indexers with all the combinations of nested objects # of all the valid types indexer = tuple( convert_nested_indexer(indexer_type, k) for indexer_type, k in zip(types, keys)) result = df.loc[indexer, 'Data'] expected = Series( [1, 2, 4, 5], name='Data', index=MultiIndex.from_product(keys)) tm.assert_series_equal(result, expected)
def test_apply_categorical_data(self):
    """Group by two categoricals, one of which has an unused category.

    Runs once with ordered and once with unordered categoricals.
    """
    # GH 10138
    for ordered in (True, False):
        dense = Categorical(list('abc'), ordered=ordered)

        # 'b' is in the categories but not in the list
        missing = Categorical(list('aaa'),
                              categories=['a', 'b'],
                              ordered=ordered)
        values = np.arange(len(dense))
        df = DataFrame({'missing': missing,
                        'dense': dense,
                        'values': values})
        grouped = df.groupby(['missing', 'dense'])

        # missing category 'b' should still exist in the output index
        missing_level = Categorical(['a', 'b'], ordered=ordered)
        dense_level = Categorical(['a', 'b', 'c'], ordered=ordered)
        idx = MultiIndex.from_product([missing_level, dense_level],
                                      names=['missing', 'dense'])
        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                             index=idx,
                             columns=['values'])

        applied = grouped.apply(lambda x: np.mean(x))
        assert_frame_equal(applied, expected)

        averaged = grouped.mean()
        assert_frame_equal(averaged, expected)

        aggregated = grouped.agg(np.mean)
        assert_frame_equal(aggregated, expected)

        # but for transform we should still get back the original index
        idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
                                      names=['missing', 'dense'])
        expected = Series(1, index=idx)
        constant = grouped.apply(lambda x: 1)
        assert_series_equal(constant, expected)
def test_loc_getitem_series(self):
    """Check ``Series.loc`` with a Series (or list) key on a MultiIndex.

    Covers a default-indexed key, a plain list, a key with a custom
    index, and an empty float64 key.
    """
    # GH14730
    # passing a series as a key with a MultiIndex
    index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
    x = Series(index=index, data=range(9), dtype=np.float64)
    y = Series([1, 3])
    expected = Series(
        data=[0, 1, 2, 6, 7, 8],
        index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
        dtype=np.float64)
    result = x.loc[y]
    tm.assert_series_equal(result, expected)

    result = x.loc[[1, 3]]
    tm.assert_series_equal(result, expected)

    # GH15424
    y1 = Series([1, 3], index=[1, 2])
    result = x.loc[y1]
    tm.assert_series_equal(result, expected)

    empty = Series(data=[], dtype=np.float64)
    # dtype belongs to the expected Series: it was previously handed to
    # MultiIndex(...), which left the Series dtype to the library default.
    expected = Series(
        [],
        index=MultiIndex(levels=index.levels, labels=[[], []]),
        dtype=np.float64)
    result = x.loc[empty]
    tm.assert_series_equal(result, expected)
def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer,
                                                            is_level1,
                                                            expected_error):
    """Slice a sorted MultiIndex Series with possibly-missing labels.

    When ``expected_error`` is given, ``.loc`` must raise a KeyError
    matching it; otherwise the result is compared with the expected
    selection.
    """
    # GH 7866
    # multi-index slicing with missing indexers
    level_names = ['one', 'two']
    idx = MultiIndex.from_product([['A', 'B', 'C'],
                                   ['foo', 'bar', 'baz']],
                                  names=level_names)
    s = Series(np.arange(9, dtype='int64'), index=idx).sort_index()

    if indexer == []:
        expected = s.iloc[[]]
    elif is_level1:
        level1_idx = MultiIndex.from_product([['A', 'B', 'C'], ['foo']],
                                             names=['one', 'two'])
        expected = Series([0, 3, 6], index=level1_idx).sort_index()
    else:
        exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
                                          names=['one', 'two'])
        expected = Series(np.arange(3, dtype='int64'),
                          index=exp_idx).sort_index()

    if expected_error is None:
        result = s.loc[indexer]
        tm.assert_series_equal(result, expected)
    else:
        with pytest.raises(KeyError, match=expected_error):
            s.loc[indexer]
def test_loc_getitem_array(self):
    """Check ``Series.loc`` with numpy keys on a MultiIndex.

    Covers a 1-d array, an empty array and a numpy integer scalar.
    """
    # GH15434
    # passing an array as a key with a MultiIndex
    index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
    x = Series(index=index, data=range(9), dtype=np.float64)
    y = np.array([1, 3])
    expected = Series(
        data=[0, 1, 2, 6, 7, 8],
        index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
        dtype=np.float64)
    result = x.loc[y]
    tm.assert_series_equal(result, expected)

    # empty array:
    empty = np.array([])
    # dtype belongs to the expected Series: it was previously handed to
    # MultiIndex(...), which left the Series dtype to the library default.
    expected = Series(
        [],
        index=MultiIndex(levels=index.levels, labels=[[], []]),
        dtype=np.float64)
    result = x.loc[empty]
    tm.assert_series_equal(result, expected)

    # 0-dim array (scalar):
    scalar = np.int64(1)
    expected = Series(
        data=[0, 1, 2],
        index=['A', 'B', 'C'],
        dtype=np.float64)
    result = x.loc[scalar]
    tm.assert_series_equal(result, expected)
def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers s = Series(np.arange(15, dtype='int64'), MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.loc[:, 'a':'c'] result = s.loc[0:4, 'a':'c'] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) result = s.loc[:4, 'a':'c'] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) result = s.loc[0:, 'a':'c'] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) # GH 7400 # multiindexer gettitem with list of indexers skips wrong element s = Series(np.arange(15, dtype='int64'), MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.iloc[[6, 7, 8, 12, 13, 14]] result = s.loc[2:4:2, 'a':'c'] tm.assert_series_equal(result, expected)
def test_from_product_empty():
    """Check ``MultiIndex.from_product`` with zero, one or more empty levels."""
    # 0 levels
    with tm.assert_raises_regex(
            ValueError, "Must pass non-zero number of levels/labels"):
        MultiIndex.from_product([])

    # 1 level
    single = MultiIndex.from_product([[]], names=['A'])
    tm.assert_index_equal(single.levels[0], pd.Index([], name='A'))

    # 2 levels
    firsts = [[], ['foo', 'bar', 'baz'], []]
    seconds = [[], [], ['a', 'b', 'c']]
    names = ['A', 'B']
    for first, second in zip(firsts, seconds):
        result = MultiIndex.from_product([first, second], names=names)
        expected = MultiIndex(levels=[first, second],
                              labels=[[], []],
                              names=names)
        tm.assert_index_equal(result, expected)

    # GH12258
    names = ['A', 'B', 'C']
    for size in range(4):
        middle = lrange(size)
        result = MultiIndex.from_product([[], middle, []], names=names)
        expected = MultiIndex(levels=[[], middle, []],
                              labels=[[], [], []],
                              names=names)
        tm.assert_index_equal(result, expected)
def test_binary_ops_align(self): # test aligning binary ops # GH 6681 index = MultiIndex.from_product( [list("abc"), ["one", "two", "three"], [1, 2, 3]], names=["first", "second", "third"] ) df = DataFrame( np.arange(27 * 3).reshape(27, 3), index=index, columns=["value1", "value2", "value3"] ).sortlevel() idx = pd.IndexSlice for op in ["add", "sub", "mul", "div", "truediv"]: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) result = getattr(df, op)(x, level="third", axis=0) expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()]).sortlevel() assert_frame_equal(result, expected) x = Series([1.0, 10.0], ["two", "three"]) result = getattr(df, op)(x, level="second", axis=0) expected = pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]).reindex_like(df).sortlevel() assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) s = pd.Series({"a": 1, "b": 2}) df2 = df.copy() df2.columns.names = ["lvl0", "lvl1"] s2 = s.copy() s2.index.name = "lvl1" # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) res5 = df2.mul(s, axis=1, level="lvl1") res6 = df2.mul(s2, axis=1, level="lvl1") exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx) for res in [res1, res2]: assert_frame_equal(res, exp) exp.columns.names = ["lvl0", "lvl1"] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp)
def test_to_html_multi_indexes_index_false(datapath):
    """Render a frame with MultiIndex rows and columns using ``index=False``."""
    # GH 22579
    data = {'a': range(10),
            'b': range(10, 20),
            'c': range(10, 20),
            'd': range(10, 20)}
    df = DataFrame(data)

    column_levels = [['a', 'b'], ['c', 'd']]
    row_levels = [['a', 'b'], ['c', 'd', 'e', 'f', 'g']]
    df.columns = MultiIndex.from_product(column_levels)
    df.index = MultiIndex.from_product(row_levels)

    result = df.to_html(index=False)
    expected = expected_html(datapath, 'gh22579_expected_output')
    assert result == expected
def setup(self): self.mi_large = MultiIndex.from_product( [np.arange(1000), np.arange(20), list(string.ascii_letters)], names=['one', 'two', 'three']) self.mi_med = MultiIndex.from_product( [np.arange(1000), np.arange(10), list('A')], names=['one', 'two', 'three']) self.mi_small = MultiIndex.from_product( [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three'])
def test_repeat():
    """``MultiIndex.repeat`` must match a product over a repeated level."""
    reps = 2
    numbers = [1, 2, 3]
    names = np.array(['foo', 'bar'])

    base_levels = [numbers, names]
    repeated_levels = [numbers, names.repeat(reps)]

    m = MultiIndex.from_product(base_levels, names=names)
    expected = MultiIndex.from_product(repeated_levels, names=names)

    tm.assert_index_equal(m.repeat(reps), expected)
def test_conversion_multiindex(self):
    """Exercise ``StrToComposition`` with ``multiindex=True``.

    Covers a flat input frame, a two-level input frame, a custom target
    column id, overwriting the source column, and returned error columns.
    """
    d = {'comp_str': ["Fe2", "MnO2"]}

    def two_level_frame():
        # Fresh frame whose columns sit under a "custom" top level.
        frame = DataFrame(data=d)
        frame.columns = MultiIndex.from_product(
            (["custom"], frame.columns.values))
        return frame

    def compositions():
        return [Composition("Fe2"), Composition("MnO2")]

    # single-level input columns
    df_1lvl = DataFrame(data=d)
    df_1lvl = StrToComposition().featurize_dataframe(
        df_1lvl, 'comp_str', multiindex=True)
    self.assertEqual(df_1lvl[("StrToComposition", "composition")].tolist(),
                     compositions())

    # two-level input columns, default target column
    df_2lvl = two_level_frame()
    df_2lvl = StrToComposition().featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True)
    self.assertEqual(df_2lvl[("StrToComposition", "composition")].tolist(),
                     compositions())

    # two-level input columns, custom target column id
    df_2lvl = two_level_frame()
    sto = StrToComposition(target_col_id='test')
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True)
    self.assertEqual(df_2lvl[("StrToComposition", "test")].tolist(),
                     compositions())

    # if two level multiindex provided as target, it should be written there
    # here we test converting multiindex in place
    df_2lvl = two_level_frame()
    sto = StrToComposition(target_col_id=None, overwrite_data=True)
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True, inplace=False)
    self.assertEqual(df_2lvl[("custom", "comp_str")].tolist(),
                     compositions())

    # Try inplace multiindex conversion with return errors
    df_2lvl = two_level_frame()
    sto = StrToComposition(target_col_id=None, overwrite_data=True)
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True,
        return_errors=True, ignore_errors=True)
    errors = df_2lvl[("custom", "StrToComposition Exceptions")]
    self.assertTrue(all(errors.isnull()))
def test_aggregate_api_consistency():
    """Check that list- and dict-based groupby aggregations agree.

    Each aggregation spelling is compared with a frame assembled from the
    individually computed per-column sums and means.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    grouped = df.groupby(['A', 'B'])
    c_mean = grouped['C'].mean()
    c_sum = grouped['C'].sum()
    d_mean = grouped['D'].mean()
    d_sum = grouped['D'].sum()

    # list of functions on a single column
    result = grouped['D'].agg(['sum', 'mean'])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ['sum', 'mean']
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on every column
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([['C', 'D'],
                                                ['sum', 'mean']])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on a column selection
    result = grouped[['D', 'C']].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([['D', 'C'],
                                                ['sum', 'mean']])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> function name
    result = grouped.agg({'C': 'mean', 'D': 'sum'})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> list of function names
    result = grouped.agg({'C': ['mean', 'sum'],
                          'D': ['mean', 'sum']})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([['C', 'D'],
                                                ['mean', 'sum']])
    # This comparison was missing: result/expected were built and dropped.
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict used to rename outputs; expected to emit a FutureWarning
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = grouped[['D', 'C']].agg({'r': np.sum,
                                          'r2': np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([['r', 'r2'],
                                                ['D', 'C']])
    tm.assert_frame_equal(result, expected, check_like=True)
def test_repeat():
    """Check ``MultiIndex.repeat`` positionally and via the ``n`` keyword.

    The keyword spelling is expected to emit a FutureWarning.
    """
    reps = 2
    numbers = [1, 2, 3]
    names = np.array(['foo', 'bar'])

    base_levels = [numbers, names]
    repeated_levels = [numbers, names.repeat(reps)]

    m = MultiIndex.from_product(base_levels, names=names)
    expected = MultiIndex.from_product(repeated_levels, names=names)

    tm.assert_index_equal(m.repeat(reps), expected)

    with tm.assert_produces_warning(FutureWarning):
        result = m.repeat(n=reps)
        tm.assert_index_equal(result, expected)
def test_numpy_repeat():
    """``np.repeat`` on a MultiIndex works, but rejects an ``axis`` argument."""
    reps = 2
    numbers = [1, 2, 3]
    names = np.array(['foo', 'bar'])

    base_levels = [numbers, names]
    repeated_levels = [numbers, names.repeat(reps)]

    m = MultiIndex.from_product(base_levels, names=names)
    expected = MultiIndex.from_product(repeated_levels, names=names)

    tm.assert_index_equal(np.repeat(m, reps), expected)

    msg = "the 'axis' parameter is not supported"
    with pytest.raises(ValueError, match=msg):
        np.repeat(m, reps, axis=1)
def test_duplicate_level_names(names): # GH18872, GH19029 mi = MultiIndex.from_product([[0, 1]] * 3, names=names) assert mi.names == names # With .rename() mi = MultiIndex.from_product([[0, 1]] * 3) mi = mi.rename(names) assert mi.names == names # With .rename(., level=) mi.rename(names[1], level=1, inplace=True) mi = mi.rename([names[0], names[2]], level=[0, 2]) assert mi.names == names
def test_delitem_multiindex(self):
    """Delete a top-level key from MultiIndex columns and probe membership."""
    midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
    df = DataFrame(np.random.randn(4, 4), columns=midx)
    assert len(df.columns) == 4

    # before deletion: the key is found as a 1-tuple and as a bare label
    assert ('A', ) in df.columns
    assert 'A' in df.columns

    selected = df['A']
    assert isinstance(selected, DataFrame)

    del df['A']
    assert len(df.columns) == 2

    # A still in the levels, BUT get a KeyError if trying
    # to delete
    assert ('A', ) not in df.columns
    with pytest.raises(KeyError):
        del df[('A',)]

    # behavior of dropped/deleted MultiIndex levels changed from
    # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
    # levels which are dropped/deleted
    assert 'A' not in df.columns
    with pytest.raises(KeyError):
        del df['A']
def factor_matrix(self, terms, start_date, end_date):
    """Return an empty frame shaped like a factor matrix.

    The row index is the product of the daily dates from ``start_date``
    to ``end_date`` with an empty second level, so it has no rows; the
    columns are the sorted keys of ``terms``.
    """
    days = date_range(start=start_date, end=end_date, freq='D')
    empty_rows = MultiIndex.from_product([days, ()])
    column_names = sorted(terms.keys())
    return DataFrame(index=empty_rows, columns=column_names)
def test_get_indexer_categorical_time():
    """``get_indexer`` of a categorical/datetime MultiIndex on itself is 0..8."""
    # https://github.com/pandas-dev/pandas/issues/21390
    letters = Categorical(['a', 'b', 'c'])
    hours = Categorical(date_range("2012-01-01", periods=3, freq='H'))
    midx = MultiIndex.from_product([letters, hours])

    result = midx.get_indexer(midx)

    expected = np.arange(9, dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
def analyze():
    """Suggest a frequency from recorded signals and plot the spectrum.

    Reads ``FILE_SIGNALS``, counts signals per (frequency, id) pair over
    every frequency in ``SPECTRUM``, prints the suggested frequency,
    writes the count table to ``spectrum.csv`` and saves an area plot to
    ``FILE_SPECTRUM``.
    """
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()

    print("got %d signals from %d devices" % (len(signals), len(devices)))

    # One count per (frequency, device); combinations never seen get 0.
    signals = signals.groupby(["frequency", "id"]).size()
    full_index = MultiIndex.from_product([SPECTRUM, devices],
                                         names=signals.index.names)
    signals = signals.reindex(full_index, fill_value=0)
    signals = signals.unstack("id")

    # let's only keep frequencies with all signals present
    # NOTE(review): the reindex above fills absent combinations with 0, so
    # this dropna() probably removes nothing -- confirm the intended filter.
    candidates = signals.dropna()

    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total": candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    # DataFrame.sort is a removed API; sort_values is its replacement.
    ranked = candidates.sort_values(["weakest", "total"], ascending=False)
    appropriate_freq = ranked.index[0]

    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")

    # Imported here, as before, so matplotlib is only needed for plotting.
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    # Plot kinds are lower-case; "Area" is rejected by current pandas.
    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
def test_multi_index_parse_dates(all_parsers, index_col):
    """Parse a two-column index with ``parse_dates=True``.

    ``index_col`` chooses the index columns; ``[1, 0]`` swaps the two
    levels of the expected index.
    """
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    parser = all_parsers

    days = (datetime(2009, 1, 1),
            datetime(2009, 1, 2),
            datetime(2009, 1, 3))
    labels = ("one", "two", "three")
    index = MultiIndex.from_product([days, labels],
                                    names=["index1", "index2"])

    # Out of order.
    if index_col == [1, 0]:
        index = index.swaplevel(0, 1)

    # The same three data rows repeat once per day.
    rows = [["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
            ["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
            ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]]
    expected = DataFrame(rows, columns=["A", "B", "C"], index=index)

    result = parser.read_csv(StringIO(data),
                             index_col=index_col,
                             parse_dates=True)
    tm.assert_frame_equal(result, expected)
def run_pipeline(self, pipeline, start_date, end_date):
    """Return an empty frame shaped like a pipeline result.

    The row index is the product of the daily dates from ``start_date``
    to ``end_date`` with an empty second level, so it has no rows; the
    columns are the sorted keys of ``pipeline.columns``.
    """
    days = date_range(start=start_date, end=end_date, freq='D')
    empty_rows = MultiIndex.from_product([days, ()])
    column_names = sorted(pipeline.columns.keys())
    return DataFrame(index=empty_rows, columns=column_names)
def setup(self):
    """Build a mostly-NaN sparse Series on a four-level MultiIndex."""
    dense = Series([np.nan] * 10000)

    # Only three positions hold real values.
    for position, value in ((0, 3.0), (100, -1.0), (999, 12.1)):
        dense[position] = value

    # 10 ** 4 index entries, one per element.
    dense.index = MultiIndex.from_product([range(10)] * 4)
    self.ss = dense.to_sparse()
def test_multiindex_label_slicing_with_negative_step(self):
    """Negative-step label slices must match their positional equivalents.

    Each label slice is checked through ``.loc``, ``[]`` and ``.ix``
    against the same ``.iloc`` slice.
    """
    s = Series(np.arange(20),
               MultiIndex.from_product([list('abcde'), np.arange(4)]))
    SLC = pd.IndexSlice

    def assert_slices_equivalent(l_slc, i_slc):
        positional = s.iloc[i_slc]
        tm.assert_series_equal(s.loc[l_slc], positional)
        tm.assert_series_equal(s[l_slc], s.iloc[i_slc])
        with catch_warnings(record=True):
            tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc])

    # whole index, reversed
    assert_slices_equivalent(SLC[::-1], SLC[::-1])

    # open-ended on one side, scalar and 1-tuple labels
    assert_slices_equivalent(SLC['d'::-1], SLC[15::-1])
    assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1])

    assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1])
    assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1])

    # both ends on the first level
    assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1])
    assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1])
    assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1])
    assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1])
    assert_slices_equivalent(SLC['b':'d':-1], SLC[:0])

    # full two-level keys
    assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1])
    assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1])
    assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1])
def setup(self): self.mi_int = MultiIndex.from_product([np.arange(1000), np.arange(1000)], names=['one', 'two']) self.obj_index = np.array([(0, 10), (0, 11), (0, 12), (0, 13), (0, 14), (0, 15), (0, 16), (0, 17), (0, 18), (0, 19)], dtype=object)
def test_get_loc_nan(level, nulls_fixture): # GH 18485 : NaN in MultiIndex levels = [['a', 'b'], ['c', 'd']] key = ['b', 'd'] levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture)) key[level] = nulls_fixture idx = MultiIndex.from_product(levels) assert idx.get_loc(tuple(key)) == 3
def test_binary_ops_align(self): # test aligning binary ops # GH 6681 index = MultiIndex.from_product( [list("abc"), ["one", "two", "three"], [1, 2, 3]], names=["first", "second", "third"], ) df = DataFrame( np.arange(27 * 3).reshape(27, 3), index=index, columns=["value1", "value2", "value3"], ).sort_index() idx = pd.IndexSlice for op in ["add", "sub", "mul", "div", "truediv"]: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) result = getattr(df, op)(x, level="third", axis=0) expected = pd.concat([ opa(df.loc[idx[:, :, i], :], v) for i, v in x.items() ]).sort_index() tm.assert_frame_equal(result, expected) x = Series([1.0, 10.0], ["two", "three"]) result = getattr(df, op)(x, level="second", axis=0) expected = (pd.concat([ opa(df.loc[idx[:, i], :], v) for i, v in x.items() ]).reindex_like(df).sort_index()) tm.assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) s = pd.Series({"a": 1, "b": 2}) df2 = df.copy() df2.columns.names = ["lvl0", "lvl1"] s2 = s.copy() s2.index.name = "lvl1" # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) res5 = df2.mul(s, axis=1, level="lvl1") res6 = df2.mul(s2, axis=1, level="lvl1") exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx) for res in [res1, res2]: tm.assert_frame_equal(res, exp) exp.columns.names = ["lvl0", "lvl1"] for res in [res3, res4, res5, res6]: tm.assert_frame_equal(res, exp)
def test_from_product_invalid_input(invalid_input):
    """``MultiIndex.from_product`` must raise TypeError for ``invalid_input``."""
    expected_message = (
        r"Input must be a list / sequence of iterables|Input must be list-like"
    )
    with pytest.raises(TypeError, match=expected_message):
        MultiIndex.from_product(iterables=invalid_input)
def test_groupby_categorical_two_columns():
    """Group by a categorical alone and together with a second key.

    Also checks ``get_group`` for two-key groups and grouping on the
    interval categorical returned by ``pd.cut``.
    """
    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        'cat': pd.Categorical(["a", "b", "a", "b"],
                              categories=["a", "b", "c"],
                              ordered=True),
        'ints': [1, 1, 2, 2],
        'val': [10, 20, 30, 40]
    }
    test = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = test.groupby("cat")
    res = groups_single_key.agg('mean')

    exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat",
                                    ordered=True)
    exp = DataFrame({
        "ints": [1.5, 1.5, np.nan],
        "val": [20, 30, np.nan]
    }, index=exp_index)
    tm.assert_frame_equal(res, exp)

    # Grouping on two columns
    groups_double_key = test.groupby(["cat", "ints"])
    res = groups_double_key.agg('mean')
    exp = DataFrame({
        "val": [10, 30, 20, 40, np.nan, np.nan],
        "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"],
                              ordered=True),
        "ints": [1, 2, 1, 2, 1, 2]
    }).set_index(["cat", "ints"])
    tm.assert_frame_equal(res, exp)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = test[(test.cat == c) & (test.ints == i)]
        # Use the tm-qualified helper like every other check in this test.
        tm.assert_frame_equal(result, expected)

    d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
    test = pd.DataFrame(d)
    values = pd.cut(test['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = test.groupby([values, 'C2'])

    res = groups_double_key.agg('mean')
    nan = np.nan
    idx = MultiIndex.from_product([
        Categorical(
            [Interval(1, 2), Interval(2, 3), Interval(3, 6)],
            ordered=True),
        [1, 2, 3, 4]
    ], names=["cat", "C2"])
    exp = DataFrame(
        {
            "C1": [nan, nan, nan, nan, 3, 3, nan, nan, nan, nan, 4, 5],
            "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]
        },
        index=idx)
    tm.assert_frame_equal(res, exp)
def setup_method(self, method):
    """Create Series/DataFrame/Panel fixtures for each index flavour.

    Attributes are named ``<kind>_<type>`` (for example ``panel_ints``);
    afterwards, for every name in ``self._objs`` a dict keyed by
    ``self._typs`` is stored on ``self`` (missing combinations map to None).
    """
    self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=lrange(0, 8, 2),
                                columns=lrange(0, 12, 3))

    # Every Panel below is built under the same warning guard as
    # ``panel_ints`` (presumably Panel construction warns).  The rest of
    # the fixtures stay in their original order inside the guard so the
    # sequence of random draws is unchanged.
    with catch_warnings(record=True):
        self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                items=lrange(0, 8, 2),
                                major_axis=lrange(0, 12, 3),
                                minor_axis=lrange(0, 16, 4))

        # unsigned integer labels
        self.series_uints = Series(np.random.rand(4),
                                   index=UInt64Index(lrange(0, 8, 2)))
        self.frame_uints = DataFrame(
            np.random.randn(4, 4),
            index=UInt64Index(lrange(0, 8, 2)),
            columns=UInt64Index(lrange(0, 12, 3)))
        self.panel_uints = Panel(
            np.random.rand(4, 4, 4),
            items=UInt64Index(lrange(0, 8, 2)),
            major_axis=UInt64Index(lrange(0, 12, 3)),
            minor_axis=UInt64Index(lrange(0, 16, 4)))

        # float labels
        self.series_floats = Series(np.random.rand(4),
                                    index=Float64Index(range(0, 8, 2)))
        self.frame_floats = DataFrame(
            np.random.randn(4, 4),
            index=Float64Index(range(0, 8, 2)),
            columns=Float64Index(range(0, 12, 3)))
        self.panel_floats = Panel(
            np.random.rand(4, 4, 4),
            items=Float64Index(range(0, 8, 2)),
            major_axis=Float64Index(range(0, 12, 3)),
            minor_axis=Float64Index(range(0, 16, 4)))

        # MultiIndex labels
        m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]),
                   MultiIndex.from_product([[5, 6], [7, 8]]),
                   MultiIndex.from_product([[9, 10], [11, 12]])]

        self.series_multi = Series(np.random.rand(4), index=m_idces[0])
        self.frame_multi = DataFrame(np.random.randn(4, 4),
                                     index=m_idces[0],
                                     columns=m_idces[1])
        self.panel_multi = Panel(np.random.rand(4, 4, 4),
                                 items=m_idces[0],
                                 major_axis=m_idces[1],
                                 minor_axis=m_idces[2])

        # string labels
        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels = DataFrame(np.random.randn(4, 4),
                                      index=list('abcd'),
                                      columns=list('ABCD'))
        self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                  items=list('abcd'),
                                  major_axis=list('ABCD'),
                                  minor_axis=list('ZYXW'))

        # mixed integer/string labels
        self.series_mixed = Series(np.random.randn(4),
                                   index=[2, 4, 'null', 8])
        self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                     index=[2, 4, 'null', 8])
        self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                 items=[2, 4, 'null', 8])

        # datetime labels, ascending
        self.series_ts = Series(np.random.randn(4),
                                index=date_range('20130101', periods=4))
        self.frame_ts = DataFrame(np.random.randn(4, 4),
                                  index=date_range('20130101', periods=4))
        self.panel_ts = Panel(np.random.randn(4, 4, 4),
                              items=date_range('20130101', periods=4))

        # datetime labels, descending
        dates_rev = (date_range('20130101', periods=4)
                     .sort_values(ascending=False))
        self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
        self.frame_ts_rev = DataFrame(np.random.randn(4, 4),
                                      index=dates_rev)
        self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                  items=dates_rev)

        # empty containers
        self.frame_empty = DataFrame({})
        self.series_empty = Series({})
        self.panel_empty = Panel({})

    # form agglomerates
    for o in self._objs:
        d = dict()
        for t in self._typs:
            d[t] = getattr(self, '%s_%s' % (o, t), None)
        setattr(self, o, d)
def test_per_axis_per_level_getitem(self):
    """Exercise per-axis, per-level ``.loc`` selection on MultiIndexes.

    Covers slices and lists per level on a four-level index, row and
    column selection on a small two-level frame, boolean indexers, an
    ambiguous two-argument form, and the error raised when slicing an
    index that is not lexsorted.
    """
    everything = slice(None)
    a_labels = ("A1", "A2", "A3")

    # GH6134
    # example test case
    ix = MultiIndex.from_product(
        [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)])
    df = DataFrame(np.arange(len(ix.to_numpy())), index=ix)

    result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
    wanted = [
        (a, b, c, d)
        for a, b, c, d in df.index.values
        if a in a_labels and c in ("C1", "C3")
    ]
    expected = df.loc[wanted]
    tm.assert_frame_equal(result, expected)

    wanted = [
        (a, b, c, d)
        for a, b, c, d in df.index.values
        if a in a_labels and c in ("C1", "C2", "C3")
    ]
    expected = df.loc[wanted]
    result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)],
        names=["one", "two"])
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )

    df = DataFrame(np.arange(16, dtype="int64").reshape(4, 4),
                   index=index, columns=columns)
    df = df.sort_index(axis=0).sort_index(axis=1)

    # identity
    result = df.loc[(everything, everything), :]
    tm.assert_frame_equal(result, df)
    result = df.loc[(everything, everything), (everything, everything)]
    tm.assert_frame_equal(result, df)
    result = df.loc[:, (everything, everything)]
    tm.assert_frame_equal(result, df)

    # index
    result = df.loc[(everything, [1]), :]
    tm.assert_frame_equal(result, df.iloc[[0, 3]])

    result = df.loc[(everything, 1), :]
    tm.assert_frame_equal(result, df.iloc[[0, 3]])

    # columns
    result = df.loc[:, (everything, ["foo"])]
    tm.assert_frame_equal(result, df.iloc[:, [1, 3]])

    # both
    result = df.loc[(everything, 1), (everything, ["foo"])]
    tm.assert_frame_equal(result, df.iloc[[0, 3], [1, 3]])

    result = df.loc["A", "a"]
    expected = DataFrame(
        {"bar": [1, 5, 9], "foo": [0, 4, 8]},
        index=Index([1, 2, 3], name="two"),
        columns=Index(["bar", "foo"], name="lvl1"),
    )
    tm.assert_frame_equal(result, expected)

    result = df.loc[(everything, [1, 2]), :]
    tm.assert_frame_equal(result, df.iloc[[0, 1, 3]])

    # multi-level series
    s = Series(np.arange(len(ix.to_numpy())), index=ix)
    result = s.loc["A1":"A3", :, ["C1", "C3"]]
    wanted = [
        (a, b, c, d)
        for a, b, c, d in s.index.values
        if a in a_labels and c in ("C1", "C3")
    ]
    expected = s.loc[wanted]
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = df.loc[(everything, df.loc[:, ("a", "bar")] > 5), :]
    tm.assert_frame_equal(result, df.iloc[[2, 3]])

    with pytest.raises(ValueError):
        df.loc[(everything, np.array([True, False])), :]

    # ambiguous notation
    # this is interpreted as slicing on both axes (GH #16396)
    result = df.loc[slice(None), [1]]
    tm.assert_frame_equal(result, df.iloc[:, []])

    result = df.loc[(everything, [1]), :]
    tm.assert_frame_equal(result, df.iloc[[0, 3]])

    # not lexsorted
    assert df.index.lexsort_depth == 2
    df = df.sort_index(level=1, axis=0)
    assert df.index.lexsort_depth == 0

    msg = ("MultiIndex slicing requires the index to be "
           r"lexsorted: slicing on levels \[1\], lexsort depth 0")
    with pytest.raises(UnsortedIndexError, match=msg):
        df.loc[(everything, slice("bar")), :]

    # GH 16734: not sorted, but no real slicing
    result = df.loc[(everything, df.loc[:, ("a", "bar")] > 5), :]
    tm.assert_frame_equal(result, df.iloc[[1, 3], :])
def setup_method(self, method):
    """Create the Series/DataFrame fixtures and group them per kind.

    One ``<kind>_<typ>`` attribute is created for every fixture; afterwards
    each name in ``self._kinds`` is bound to a dict that maps every entry
    of ``self._typs`` to the matching fixture attribute.
    """
    # integer-labelled objects
    self.series_ints = Series(np.random.rand(4), index=np.arange(0, 8, 2))
    self.frame_ints = DataFrame(
        np.random.randn(4, 4),
        index=np.arange(0, 8, 2),
        columns=np.arange(0, 12, 3),
    )

    # unsigned-integer-labelled objects
    self.series_uints = Series(
        np.random.rand(4), index=UInt64Index(np.arange(0, 8, 2))
    )
    self.frame_uints = DataFrame(
        np.random.randn(4, 4),
        index=UInt64Index(range(0, 8, 2)),
        columns=UInt64Index(range(0, 12, 3)),
    )

    # float-labelled objects
    self.series_floats = Series(
        np.random.rand(4), index=Float64Index(range(0, 8, 2))
    )
    self.frame_floats = DataFrame(
        np.random.randn(4, 4),
        index=Float64Index(range(0, 8, 2)),
        columns=Float64Index(range(0, 12, 3)),
    )

    # MultiIndex-labelled objects
    multi_indexes = [
        MultiIndex.from_product([[1, 2], [3, 4]]),
        MultiIndex.from_product([[5, 6], [7, 8]]),
        MultiIndex.from_product([[9, 10], [11, 12]]),
    ]
    self.series_multi = Series(np.random.rand(4), index=multi_indexes[0])
    self.frame_multi = DataFrame(
        np.random.randn(4, 4),
        index=multi_indexes[0],
        columns=multi_indexes[1],
    )

    # string labels
    self.series_labels = Series(np.random.randn(4), index=list("abcd"))
    self.frame_labels = DataFrame(
        np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD")
    )

    # mixed integer / string labels
    self.series_mixed = Series(np.random.randn(4), index=[2, 4, "null", 8])
    self.frame_mixed = DataFrame(
        np.random.randn(4, 4), index=[2, 4, "null", 8]
    )

    # datetime labels, ascending
    self.series_ts = Series(
        np.random.randn(4), index=date_range("20130101", periods=4)
    )
    self.frame_ts = DataFrame(
        np.random.randn(4, 4), index=date_range("20130101", periods=4)
    )

    # datetime labels, descending
    dates_rev = date_range("20130101", periods=4).sort_values(ascending=False)
    self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
    self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)

    # empty objects
    self.frame_empty = DataFrame()
    self.series_empty = Series()

    # form agglomerates
    for kind in self._kinds:
        by_typ = {
            typ: getattr(self, "{kind}_{typ}".format(kind=kind, typ=typ))
            for typ in self._typs
        }
        setattr(self, kind, by_typ)
class PerformanceTestCase(TestCase):
    """Parameterized tests for the factor performance helpers.

    The class-level ``dr``, ``tickers``, ``factor`` and ``factor_data``
    objects are shared fixtures: two dates, four assets, one factor value
    per (date, asset) pair and a categorical ``group`` column.  Tests must
    work on a copy of ``factor_data`` so no state leaks between cases.
    """

    dr = date_range(start="2015-1-1", end="2015-1-2", name="date")
    tickers = ["A", "B", "C", "D"]
    factor = (
        DataFrame(index=dr, columns=tickers, data=[[1, 2, 3, 4], [4, 3, 2, 1]])
        .stack()
        .rename("factor")
    )
    factor.index = factor.index.set_names(["date", "asset"])
    factor_data = DataFrame(
        {
            "factor": factor,
            "group": Series(
                index=factor.index,
                data=[1, 1, 2, 2, 1, 1, 2, 2],
                dtype="category",
            ),
        }
    )

    @parameterized.expand(
        [
            (
                factor_data,
                [4, 3, 2, 1, 1, 2, 3, 4],
                False,
                False,
                dr,
                [-1.0, -1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                False,
                False,
                dr,
                [1.0, 1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                False,
                True,
                MultiIndex.from_product(
                    [dr, Categorical([1, 2])], names=["date", "group"]
                ),
                [1.0, 1.0, 1.0, 1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                True,
                True,
                MultiIndex.from_product(
                    [dr, Categorical([1, 2])], names=["date", "group"]
                ),
                [1.0, 1.0, 1.0, 1.0],
            ),
        ]
    )
    def test_information_coefficient(
        self,
        factor_data,
        forward_returns,
        group_adjust,
        by_group,
        expected_ix,
        expected_ic_val,
    ):
        """Compare ``factor_information_coefficient`` with the expected frame."""
        # The parameter is the shared class-level frame; copy it so adding
        # the "1D" column does not leak into other test cases.
        factor_data = factor_data.copy()
        factor_data["1D"] = Series(
            index=factor_data.index, data=forward_returns
        )

        ic = factor_information_coefficient(
            factor_data=factor_data,
            group_adjust=group_adjust,
            by_group=by_group,
        )

        expected_ic_df = DataFrame(
            index=expected_ix,
            columns=Index(["1D"], dtype="object"),
            data=expected_ic_val,
        )
        assert_frame_equal(ic, expected_ic_df)

    @parameterized.expand(
        [
            (
                factor_data,
                [4, 3, 2, 1, 1, 2, 3, 4],
                False,
                False,
                "D",
                dr,
                [-1.0, -1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                False,
                False,
                "W",
                DatetimeIndex(["2015-01-04"], name="date", freq="W-SUN"),
                [1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                False,
                True,
                None,
                CategoricalIndex([1, 2], name="group"),
                [1.0, 1.0],
            ),
            (
                factor_data,
                [1, 2, 3, 4, 4, 3, 2, 1],
                False,
                True,
                "W",
                MultiIndex.from_product(
                    [
                        DatetimeIndex(
                            ["2015-01-04"], name="date", freq="W-SUN"
                        ),
                        Categorical([1, 2]),
                    ],
                    names=["date", "group"],
                ),
                [1.0, 1.0],
            ),
        ]
    )
    def test_mean_information_coefficient(
        self,
        factor_data,
        forward_returns,
        group_adjust,
        by_group,
        by_time,
        expected_ix,
        expected_ic_val,
    ):
        """Compare ``mean_information_coefficient`` with the expected frame."""
        # Copy for the same reason as in test_information_coefficient: the
        # argument is the shared class-level frame.
        factor_data = factor_data.copy()
        factor_data["1D"] = Series(
            index=factor_data.index, data=forward_returns
        )

        ic = mean_information_coefficient(
            factor_data,
            group_adjust=group_adjust,
            by_group=by_group,
            by_time=by_time,
        )

        expected_ic_df = DataFrame(
            index=expected_ix,
            columns=Index(["1D"], dtype="object"),
            data=expected_ic_val,
        )
        assert_frame_equal(ic, expected_ic_df)

    @parameterized.expand(
        [
            (
                [1.1, 1.2, 1.1, 1.2, 1.1, 1.2],
                [[1, 2, 1, 2, 1, 2], [1, 2, 1, 2, 1, 2], [1, 2, 1, 2, 1, 2]],
                2,
                False,
                [0.1, 0.2],
            ),
            (
                [1.1, 1.2, 1.1, 1.2, 1.1, 1.2],
                [[1, 2, 1, 2, 1, 2], [1, 2, 1, 2, 1, 2], [1, 2, 1, 2, 1, 2]],
                2,
                True,
                [0.1, 0.1, 0.2, 0.2],
            ),
            (
                [1.1, 1.1, 1.1, 1.2, 1.2, 1.2],
                [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]],
                3,
                False,
                [0.15, 0.15, 0.15],
            ),
            (
                [1.1, 1.1, 1.1, 1.2, 1.2, 1.2],
                [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]],
                3,
                True,
                [0.1, 0.2, 0.1, 0.2, 0.1, 0.2],
            ),
            (
                [1.5, 1.5, 1.2, 1.0, 1.0, 1.0],
                [[1, 1, 2, 2, 2, 2], [2, 2, 1, 2, 2, 2], [2, 2, 1, 2, 2, 2]],
                2,
                False,
                [0.3, 0.15],
            ),
            (
                [1.5, 1.5, 1.2, 1.0, 1.0, 1.0],
                [[1, 1, 3, 2, 2, 2], [3, 3, 1, 2, 2, 2], [3, 3, 1, 2, 2, 2]],
                3,
                False,
                [0.3, 0.0, 0.4],
            ),
            (
                [1.6, 1.6, 1.0, 1.0, 1.0, 1.0],
                [[1, 1, 2, 2, 2, 2], [2, 2, 1, 1, 1, 1], [2, 2, 1, 1, 1, 1]],
                2,
                False,
                [0.2, 0.4],
            ),
            (
                [1.6, 1.6, 1.0, 1.6, 1.6, 1.0],
                [[1, 1, 2, 1, 1, 2], [2, 2, 1, 2, 2, 1], [2, 2, 1, 2, 2, 1]],
                2,
                True,
                [0.2, 0.2, 0.4, 0.4],
            ),
        ]
    )
    def test_mean_return_by_quantile(
        self, daily_rets, factor, bins, by_group, expected_data
    ):
        """Test mean_return_by_quantile.

        Prices are built as powers of the per-asset ``daily_rets`` over four
        days, giving 1-day forward returns for the three factor dates.
        """
        tickers = ["A", "B", "C", "D", "E", "F"]
        factor_groups = {"A": 1, "B": 1, "C": 1, "D": 2, "E": 2, "F": 2}

        # one row per day, 4 days; every parameter set supplies one daily
        # return per ticker
        price_data = [[ret ** i for ret in daily_rets] for i in range(1, 5)]

        start = "2015-1-11"
        factor_end = "2015-1-13"
        price_end = "2015-1-14"  # 1D fwd returns

        price_index = date_range(start=start, end=price_end)
        price_index.name = "date"
        prices = DataFrame(index=price_index, columns=tickers, data=price_data)

        factor_index = date_range(start=start, end=factor_end)
        factor_index.name = "date"
        factor = DataFrame(
            index=factor_index, columns=tickers, data=factor
        ).stack()

        factor_data = get_clean_factor_and_forward_returns(
            factor,
            prices,
            groupby=factor_groups,
            quantiles=None,
            bins=bins,
            periods=(1,),
        )

        # only the mean frame is checked; the second return value is ignored
        mean_quant_ret, _ = mean_return_by_quantile(
            factor_data,
            by_date=False,
            by_group=by_group,
            demeaned=False,
            group_adjust=False,
        )

        expected = DataFrame(
            index=mean_quant_ret.index.copy(),
            columns=mean_quant_ret.columns.copy(),
            data=expected_data,
        )
        expected.index.name = "factor_quantile"

        assert_frame_equal(mean_quant_ret, expected)

    @parameterized.expand(
        [
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                4.0,
                1,
                [nan, 1.0, 1.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                4.0,
                1,
                [nan, 1.0, 1.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                4.0,
                2,
                [nan, nan, 0.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                4.0,
                2,
                [nan, nan, 0.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                4.0,
                3,
                [nan, nan, nan, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                4.0,
                3,
                [nan, nan, nan, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                3.0,
                1,
                [nan, 0.0, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                3.0,
                1,
                [nan, 0.0, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                3.0,
                2,
                [nan, nan, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                3.0,
                2,
                [nan, nan, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                3.0,
                3,
                [nan, nan, nan, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                3.0,
                3,
                [nan, nan, nan, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                ],
                "1B",
                2.0,
                1,
                [nan, 1.0, 1.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                ],
                "1D",
                2.0,
                1,
                [nan, 1.0, 1.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                ],
                "1B",
                3.0,
                4,
                [nan, nan, nan, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                ],
                "1D",
                3.0,
                4,
                [nan, nan, nan, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                3.0,
                10,
                [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 3.0, 2.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                3.0,
                10,
                [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0, 1.0],
            ),
        ]
    )
    def test_quantile_turnover(
        self, quantile_values, freq, test_quantile, period, expected_vals
    ):
        """Compare ``quantile_turnover`` for one quantile with expected values.

        ``quantile_values`` holds one row of per-asset quantiles per date;
        the dates are generated with frequency ``freq``.
        """
        dr = date_range(
            start="2015-1-1",
            periods=len(quantile_values),
            freq=freq,
            name="date",
        )
        tickers = ["A", "B", "C", "D"]

        quantized_test_factor = (
            DataFrame(index=dr, columns=tickers, data=quantile_values)
            .rename_axis("asset", axis=1)
            .stack()
        )

        to = quantile_turnover(quantized_test_factor, test_quantile, period)

        expected = Series(
            index=quantized_test_factor.index.levels[0], data=expected_vals
        ).rename(test_quantile)

        assert_series_equal(to, expected)

    @parameterized.expand(
        [
            (
                [[3, 4, 2, 1, nan], [3, 4, -2, -1, nan], [3, nan, nan, 1, 4]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                False,
                False,
                False,
                [
                    0.30, 0.40, 0.20, 0.10,
                    0.30, 0.40, -0.20, -0.10,
                    0.375, 0.125, 0.50,
                ],
            ),
            (
                [[3, 4, 2, 1, nan], [3, 4, -2, -1, nan], [3, nan, nan, 1, 4]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                True,
                False,
                False,
                [
                    0.125, 0.375, -0.125, -0.375,
                    0.20, 0.30, -0.30, -0.20,
                    0.10, -0.50, 0.40,
                ],
            ),
            (
                [[3, 4, 2, 1, nan], [-3, 4, -2, 1, nan], [2, 2, 2, 3, 1]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                False,
                True,
                False,
                [
                    0.30, 0.40, 0.20, 0.10,
                    -0.30, 0.40, -0.20, 0.10,
                    0.20, 0.20, 0.20, 0.30, 0.10,
                ],
            ),
            (
                [[3, 4, 2, 1, nan], [3, 4, -2, -1, nan], [3, nan, nan, 1, 4]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                True,
                True,
                False,
                [
                    0.25, 0.25, -0.25, -0.25,
                    0.25, 0.25, -0.25, -0.25,
                    -0.50, nan, 0.50,
                ],
            ),
            (
                [[3, 4, 2, 1, 5], [3, 4, -2, -1, 5], [3, nan, nan, 1, nan]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                False,
                False,
                True,
                [
                    0.20, 0.20, 0.20, 0.20, 0.20,
                    0.20, 0.20, -0.20, -0.20, 0.20,
                    0.50, 0.50,
                ],
            ),
            (
                [[1, 4, 2, 3, nan], [1, 4, -2, -3, nan], [3, nan, nan, 2, 7]],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                True,
                False,
                True,
                [
                    -0.25, 0.25, -0.25, 0.25,
                    0.25, 0.25, -0.25, -0.25,
                    0.0, -0.50, 0.50,
                ],
            ),
            (
                [
                    [3, 4, 2, 1, nan],
                    [-3, 4, -2, 1, nan],
                    [3, nan, nan, 1, 4],
                    [3, nan, nan, -1, 4],
                    [3, nan, nan, 1, -4],
                ],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                False,
                True,
                True,
                [
                    0.25, 0.25, 0.25, 0.25,
                    -0.25, 0.25, -0.25, 0.25,
                    0.25, 0.50, 0.25,
                    0.25, -0.50, 0.25,
                    0.25, 0.50, -0.25,
                ],
            ),
            (
                [
                    [1, 4, 2, 3, nan],
                    [3, 4, -2, -1, nan],
                    [3, nan, nan, 2, 7],
                    [3, nan, nan, 2, -7],
                ],
                ["A", "B", "C", "D", "E"],
                {
                    "A": "Group1",
                    "B": "Group2",
                    "C": "Group1",
                    "D": "Group2",
                    "E": "Group1",
                },
                True,
                True,
                True,
                [
                    -0.25, 0.25, 0.25, -0.25,
                    0.25, 0.25, -0.25, -0.25,
                    -0.50, nan, 0.50,
                    0.50, nan, -0.50,
                ],
            ),
        ]
    )
    def test_factor_weights(
        self,
        factor_vals,
        tickers,
        groups,
        demeaned,
        group_adjust,
        equal_weight,
        expected_vals,
    ):
        """Compare ``factor_weights`` with the expected per-asset weights.

        ``groups`` maps each ticker to a group label; it is expanded to one
        label per (date, asset) row of the stacked factor.
        """
        index = date_range("1/12/2000", periods=len(factor_vals))
        factor = DataFrame(
            index=index, columns=tickers, data=factor_vals
        ).stack()
        factor.index = factor.index.set_names(["date", "asset"])
        factor.name = "factor"

        factor_data = DataFrame()
        factor_data["factor"] = factor
        groups = Series(groups)
        factor_data["group"] = Series(
            index=factor.index,
            data=groups[factor.index.get_level_values("asset")].values,
        )

        weights = factor_weights(
            factor_data, demeaned, group_adjust, equal_weight
        )

        expected = Series(
            data=expected_vals, index=factor_data.index, name="factor"
        )

        assert_series_equal(weights, expected)

    @parameterized.expand(
        [
            (
                [1, 2, 3, 4, 4, 3, 2, 1],
                [4, 3, 2, 1, 1, 2, 3, 4],
                False,
                [-1.25000, -1.25000],
            ),
            (
                [1, 1, 1, 1, 1, 1, 1, 1],
                [4, 3, 2, 1, 1, 2, 3, 4],
                False,
                [nan, nan],
            ),
            (
                [1, 2, 3, 4, 4, 3, 2, 1],
                [4, 3, 2, 1, 1, 2, 3, 4],
                True,
                [-0.5, -0.5],
            ),
            (
                [1, 2, 3, 4, 1, 2, 3, 4],
                [1, 4, 1, 2, 1, 2, 2, 1],
                True,
                [1.0, 0.0],
            ),
            (
                [1, 1, 1, 1, 1, 1, 1, 1],
                [4, 3, 2, 1, 1, 2, 3, 4],
                True,
                [nan, nan],
            ),
        ]
    )
    def test_factor_returns(
        self, factor_vals, fwd_return_vals, group_adjust, expected_vals
    ):
        """Compare demeaned ``factor_returns`` with the expected frame."""
        factor_data = self.factor_data.copy()
        factor_data["1D"] = fwd_return_vals
        factor_data["factor"] = factor_vals

        factor_returns_s = factor_returns(
            factor_data=factor_data, demeaned=True, group_adjust=group_adjust
        )

        expected = DataFrame(
            index=self.dr,
            data=expected_vals,
            columns=get_forward_returns_columns(factor_data.columns),
        )

        assert_frame_equal(factor_returns_s, expected)

    @parameterized.expand([([1, 2, 3, 4, 1, 1, 1, 1], -1, 5.0 / 6.0)])
    def test_factor_alpha_beta(self, fwd_return_vals, alpha, beta):
        """Compare ``factor_alpha_beta`` with the expected alpha/beta frame."""
        factor_data = self.factor_data.copy()
        factor_data["1D"] = fwd_return_vals

        ab = factor_alpha_beta(factor_data=factor_data)

        expected = DataFrame(
            columns=["1D"], index=["Ann. alpha", "beta"], data=[alpha, beta]
        )

        assert_frame_equal(ab, expected)

    @parameterized.expand(
        [
            (
                [1.0, 0.5, 1.0, 0.5, 0.5],
                "1D",
                "1D",
                [2.0, 3.0, 6.0, 9.0, 13.50],
            ),
            (
                [0.1, 0.1, 0.1, 0.1, 0.1],
                "1D",
                "1D",
                [1.1, 1.21, 1.331, 1.4641, 1.61051],
            ),
            (
                [-0.1, -0.1, -0.1, -0.1, -0.1],
                "1D",
                "1D",
                [0.9, 0.81, 0.729, 0.6561, 0.59049],
            ),
            (
                [1.0, 0.5, 1.0, 0.5, 0.5],
                "1B",
                "1D",
                [2.0, 3.0, 6.0, 9.0, 13.50],
            ),
            (
                [0.1, 0.1, 0.1, 0.1, 0.1],
                "1B",
                "1D",
                [1.1, 1.21, 1.331, 1.4641, 1.61051],
            ),
            (
                [-0.1, -0.1, -0.1, -0.1, -0.1],
                "1B",
                "1D",
                [0.9, 0.81, 0.729, 0.6561, 0.59049],
            ),
            (
                [1.0, 0.5, 1.0, 0.5, 0.5],
                "1CD",
                "1D",
                [2.0, 3.0, 6.0, 9.0, 13.50],
            ),
            (
                [0.1, 0.1, 0.1, 0.1, 0.1],
                "1CD",
                "1D",
                [1.1, 1.21, 1.331, 1.4641, 1.61051],
            ),
            (
                [-0.1, -0.1, -0.1, -0.1, -0.1],
                "1CD",
                "1D",
                [0.9, 0.81, 0.729, 0.6561, 0.59049],
            ),
        ]
    )
    def test_cumulative_returns(
        self, returns, ret_freq, period_len, expected_vals
    ):
        """Compare ``cumulative_returns`` with the expected series.

        ``ret_freq`` selects the frequency of the returns index; a value
        containing ``"CD"`` is replaced by a custom business day with a
        Tue/Wed/Thu/Fri/Sun week mask.  ``period_len`` is part of the
        parameter table but is not used by the computation under test.
        """
        if "CD" in ret_freq:
            # the string alias cannot carry the week mask, so swap in the
            # offset object itself
            ret_freq = CDay(weekmask="Tue Wed Thu Fri Sun")

        index = date_range("1/1/1999", periods=len(returns), freq=ret_freq)
        returns = Series(returns, index=index)

        cum_ret = cumulative_returns(returns)

        expected = Series(expected_vals, index=cum_ret.index)

        assert_series_equal(cum_ret, expected)

    @parameterized.expand(
        [
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                1,
                [nan, 1.0, 1.0, 1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                1,
                [nan, 1.0, 1.0, 1.0],
            ),
            (
                [
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1B",
                1,
                [nan, -1.0, -1.0, -1.0],
            ),
            (
                [
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                ],
                "1D",
                1,
                [nan, -1.0, -1.0, -1.0],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                ],
                "1B",
                3,
                [
                    nan, nan, nan, 1.0, 1.0, 1.0,
                    0.6, -0.6, -1.0, 1.0, -0.6, -1.0,
                ],
            ),
            (
                [
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                    [1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [2.0, 1.0, 4.0, 3.0],
                    [4.0, 3.0, 2.0, 1.0],
                ],
                "1D",
                3,
                [
                    nan, nan, nan, 1.0, 1.0, 1.0,
                    0.6, -0.6, -1.0, 1.0, -0.6, -1.0,
                ],
            ),
        ]
    )
    def test_factor_rank_autocorrelation(
        self, factor_values, freq, period, expected_vals
    ):
        """Compare ``factor_rank_autocorrelation`` with the expected series."""
        dr = date_range(
            start="2015-1-1",
            periods=len(factor_values),
            freq=freq,
            name="date",
        )
        tickers = ["A", "B", "C", "D"]
        factor = (
            DataFrame(index=dr, columns=tickers, data=factor_values)
            .rename_axis("asset", axis=1)
            .stack()
        )

        factor_df = DataFrame(data=factor, columns=["factor"])

        fa = factor_rank_autocorrelation(factor_df, period)
        expected = Series(index=dr, data=expected_vals)
        expected.name = period

        assert_series_equal(fa, expected)

    @parameterized.expand(
        [
            (
                2,
                3,
                False,
                False,
                [
                    [4.93048307, 8.68843922],
                    [6.60404312, 12.22369139],
                    [8.92068367, 17.1794088],
                    [12.1275523, 24.12861778],
                    [16.5694159, 33.8740100],
                    [22.7273233, 47.53995233],
                ],
            ),
            (
                3,
                2,
                False,
                True,
                [
                    [0.0, 5.63219176],
                    [0.0, 7.96515233],
                    [0.0, 11.2420646],
                    [0.0, 15.8458720],
                    [0.0, 22.3134160],
                    [0.0, 31.3970961],
                ],
            ),
            (
                3,
                5,
                True,
                False,
                [
                    [3.7228318, 2.6210478],
                    [4.9304831, 3.6296796],
                    [6.6040431, 5.0193734],
                    [8.9206837, 6.9404046],
                    [12.127552, 9.6023405],
                    [16.569416, 13.297652],
                    [22.727323, 18.434747],
                    [31.272682, 25.584180],
                    [34.358565, 25.497254],
                ],
            ),
            (
                1,
                4,
                True,
                True,
                [
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                ],
            ),
            (
                6,
                6,
                False,
                False,
                [
                    [2.02679565, 2.38468223],
                    [2.38769454, 3.22602748],
                    [2.85413029, 4.36044469],
                    [3.72283181, 6.16462715],
                    [4.93048307, 8.68843922],
                    [6.60404312, 12.2236914],
                    [8.92068367, 17.1794088],
                    [12.1275523, 24.1286178],
                    [16.5694159, 33.8740100],
                    [22.7273233, 47.5399523],
                    [31.2726821, 66.7013483],
                    [34.3585654, 70.1828776],
                    [37.9964585, 74.3294620],
                ],
            ),
            (
                6,
                6,
                False,
                True,
                [
                    [0.0, 2.20770299],
                    [0.0, 2.95942924],
                    [0.0, 3.97022414],
                    [0.0, 5.63219176],
                    [0.0, 7.96515233],
                    [0.0, 11.2420646],
                    [0.0, 15.8458720],
                    [0.0, 22.3134160],
                    [0.0, 31.3970962],
                    [0.0, 44.1512888],
                    [0.0, 62.0533954],
                    [0.0, 65.8668371],
                    [0.0, 70.4306483],
                ],
            ),
            (
                6,
                6,
                True,
                False,
                [
                    [2.0267957, 0.9562173],
                    [2.3876945, 1.3511898],
                    [2.8541303, 1.8856194],
                    [3.7228318, 2.6210478],
                    [4.9304831, 3.6296796],
                    [6.6040431, 5.0193734],
                    [8.9206837, 6.9404046],
                    [12.127552, 9.6023405],
                    [16.569416, 13.297652],
                    [22.727323, 18.434747],
                    [31.272682, 25.584180],
                    [34.358565, 25.497254],
                    [37.996459, 25.198051],
                ],
            ),
            (
                6,
                6,
                True,
                True,
                [
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                    [0.0, 0.0],
                ],
            ),
        ]
    )
    def test_common_start_returns(
        self, before, after, mean_by_date, demeaned, expected_vals
    ):
        """Compare mean/std of ``common_start_returns`` with expected values.

        The result is reduced to per-offset mean and standard deviation
        across columns before being compared.
        """
        dr = date_range(start="2015-1-17", end="2015-2-2")
        dr.name = "date"
        tickers = ["A", "B", "C", "D"]
        r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
        data = [[r1 ** i, r2 ** i, r3 ** i, r4 ** i] for i in range(1, 18)]
        returns = DataFrame(data=data, index=dr, columns=tickers)

        # the same factor row on every factor date
        dr2 = date_range(start="2015-1-21", end="2015-1-29")
        factor = DataFrame(
            index=dr2, columns=tickers, data=[[3, 4, 2, 1]] * len(dr2)
        ).stack()
        factor.index = factor.index.set_names(["date", "asset"])
        factor.name = "factor"

        cmrt = common_start_returns(
            factor,
            returns,
            before,
            after,
            cumulative=True,
            mean_by_date=mean_by_date,
            demean_by=factor if demeaned else None,
        )
        cmrt = DataFrame({"mean": cmrt.mean(axis=1), "std": cmrt.std(axis=1)})
        expected = DataFrame(
            index=range(-before, after + 1),
            columns=["mean", "std"],
            data=expected_vals,
        )
        assert_frame_equal(cmrt, expected)

    @parameterized.expand(
        [
            (
                1,
                2,
                False,
                4,
                [
                    [0.00512695, 0.00256348, 0.00128174, 6.40869e-4],
                    [0.00579185, 0.00289592, 0.00144796, 7.23981e-4],
                    [1.00000000, 1.00000000, 1.00000000, 1.00000000],
                    [0.00000000, 0.00000000, 0.00000000, 0.00000000],
                    [7.15814531, 8.94768164, 11.1846020, 13.9807526],
                    [2.93784787, 3.67230984, 4.59038730, 5.73798413],
                    [39.4519043, 59.1778564, 88.7667847, 133.150177],
                    [28.3717330, 42.5575995, 63.8363992, 95.7545989],
                ],
            ),
            (
                1,
                2,
                True,
                4,
                [
                    [-11.898667, -17.279462, -25.236885, -37.032252],
                    [7.82587034, 11.5529583, 17.0996881, 25.3636472],
                    [-10.903794, -16.282025, -24.238167, -36.032893],
                    [7.82140124, 11.5507268, 17.0985737, 25.3630906],
                    [-4.7456488, -8.3343438, -14.053565, -23.052140],
                    [4.91184665, 7.91180853, 12.5481552, 19.6734224],
                    [27.5481102, 41.8958311, 63.5286176, 96.1172844],
                    [20.5510133, 31.0075980, 46.7385910, 70.3923129],
                ],
            ),
            (
                3,
                0,
                False,
                4,
                [
                    [7.0, 3.0, 1.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0],
                    [-0.488, -0.36, -0.2, 0.0],
                    [0.0, 0.0, 0.0, 0.0],
                    [-0.703704, -0.55555555, -0.333333333, 0.0],
                    [0.0, 0.0, 0.0, 0.0],
                ],
            ),
            (
                0,
                3,
                True,
                4,
                [
                    [-17.279462, -25.236885, -37.032252, -54.550061],
                    [11.5529583, 17.0996881, 25.3636472, 37.6887906],
                    [-16.282025, -24.238167, -36.032893, -53.550382],
                    [11.5507268, 17.0985737, 25.3630906, 37.6885125],
                    [-8.3343438, -14.053565, -23.052140, -37.074441],
                    [7.91180853, 12.5481552, 19.6734224, 30.5748605],
                    [41.8958311, 63.5286176, 96.1172844, 145.174884],
                    [31.0075980, 46.7385910, 70.3923129, 105.944230],
                ],
            ),
            (
                3,
                3,
                False,
                2,
                [
                    [
                        0.5102539, 0.50512695, 0.50256348, 0.50128174,
                        0.50064087, 0.50032043, 0.50016022,
                    ],
                    [
                        0.0115837, 0.00579185, 0.00289592, 1.44796e-3,
                        7.23981e-4, 3.61990e-4, 1.80995e-4,
                    ],
                    [
                        11.057696, 16.0138929, 23.3050248, 34.0627690,
                        49.9756934, 73.5654648, 108.600603,
                    ],
                    [
                        7.2389454, 10.6247239, 15.6450367, 23.1025693,
                        34.1977045, 50.7264595, 75.3771641,
                    ],
                ],
            ),
            (
                3,
                3,
                True,
                2,
                [
                    [
                        -5.273721, -7.754383, -11.40123, -16.78074,
                        -24.73753, -36.53257, -54.05022,
                    ],
                    [
                        3.6239580, 5.3146000, 7.8236356, 11.551843,
                        17.099131, 25.363369, 37.688652,
                    ],
                    [
                        5.2737212, 7.7543830, 11.401231, 16.780744,
                        24.737526, 36.532572, 54.050221,
                    ],
                    [
                        3.6239580, 5.3146000, 7.8236356, 11.551843,
                        17.099131, 25.363369, 37.688652,
                    ],
                ],
            ),
        ]
    )
    def test_average_cumulative_return_by_quantile(
        self, before, after, demeaned, quantiles, expected_vals
    ):
        """Compare ``average_cumulative_return_by_quantile`` with expectations.

        The expected frame has one ``mean`` and one ``std`` row per factor
        quantile and one column per offset in ``[-before, after]``.
        """
        dr = date_range(start="2015-1-15", end="2015-2-1")
        dr.name = "date"
        tickers = ["A", "B", "C", "D"]
        r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
        data = [[r1 ** i, r2 ** i, r3 ** i, r4 ** i] for i in range(1, 19)]
        returns = DataFrame(index=dr, columns=tickers, data=data)

        # the same factor row on every factor date
        dr2 = date_range(start="2015-1-21", end="2015-1-26")
        dr2.name = "date"
        factor = DataFrame(
            index=dr2, columns=tickers, data=[[3, 4, 2, 1]] * len(dr2)
        ).stack()

        factor_data = get_clean_factor_and_forward_returns(
            factor,
            returns,
            quantiles=quantiles,
            periods=range(0, after + 1),
            filter_zscore=False,
        )

        avgrt = average_cumulative_return_by_quantile(
            factor_data, returns, before, after, demeaned
        )

        # (quantile, "mean") followed by (quantile, "std") for each quantile
        arrays = [
            (q, stat)
            for q in range(1, quantiles + 1)
            for stat in ("mean", "std")
        ]
        index = MultiIndex.from_tuples(arrays, names=["factor_quantile", None])
        expected = DataFrame(
            index=index, columns=range(-before, after + 1), data=expected_vals
        )
        assert_frame_equal(avgrt, expected)

    @parameterized.expand(
        [
            (
                0,
                2,
                False,
                4,
                [
                    [0.0292969, 0.0146484, 7.32422e-3],
                    [0.0241851, 0.0120926, 6.04628e-3],
                    [1.0000000, 1.0000000, 1.00000000],
                    [0.0000000, 0.0000000, 0.00000000],
                    [3.5190582, 4.3988228, 5.49852848],
                    [1.0046375, 1.2557969, 1.56974616],
                    [10.283203, 15.424805, 23.1372070],
                    [5.2278892, 7.8418338, 11.7627508],
                ],
            ),
            (
                0,
                3,
                True,
                4,
                [
                    [-3.6785927, -5.1949205, -7.4034407, -10.641996],
                    [1.57386873, 2.28176590, 3.33616491, 4.90228915],
                    [-2.7078896, -4.2095690, -6.4107649, -9.6456583],
                    [1.55205002, 2.27087143, 3.33072273, 4.89956999],
                    [-0.1888313, -0.8107462, -1.9122365, -3.7724977],
                    [0.55371389, 1.02143924, 1.76795263, 2.94536298],
                    [6.57531357, 10.2152357, 15.7264421, 24.0601522],
                    [3.67596914, 5.57112656, 8.43221341, 12.7447568],
                ],
            ),
            (
                0,
                3,
                False,
                2,
                [
                    [0.51464844, 0.50732422, 0.50366211, 0.50183105],
                    [0.01209256, 0.00604628, 0.00302314, 0.00151157],
                    [6.90113068, 9.91181374, 14.3178678, 20.7894856],
                    [3.11499629, 4.54718783, 6.66416616, 9.80049950],
                ],
            ),
            (
                0,
                3,
                True,
                2,
                [
                    [-3.1932411, -4.7022448, -6.9071028, -10.143827],
                    [1.56295067, 2.27631715, 3.33344356, 4.90092953],
                    [3.19324112, 4.70224476, 6.90710282, 10.1438273],
                    [1.56295067, 2.27631715, 3.33344356, 4.90092953],
                ],
            ),
        ]
    )
    def test_average_cumulative_return_by_quantile_2(
        self, before, after, demeaned, quantiles, expected_vals
    ):
        """
        Test varying factor asset universe: at different dates there might be
        different assets
        """
        dr = date_range(start="2015-1-15", end="2015-1-25")
        dr.name = "date"
        tickers = ["A", "B", "C", "D", "E", "F"]
        r1, r2, r3, r4 = (1.25, 1.50, 1.00, 0.50)
        data = [
            [r1 ** i, r2 ** i, r3 ** i, r4 ** i, r2 ** i, r3 ** i]
            for i in range(1, 12)
        ]
        prices = DataFrame(index=dr, columns=tickers, data=data)

        # nan marks assets that are absent from the universe on a given date
        dr2 = date_range(start="2015-1-18", end="2015-1-21")
        dr2.name = "date"
        factor = DataFrame(
            index=dr2,
            columns=tickers,
            data=[
                [3, 4, 2, 1, nan, nan],
                [3, 4, 2, 1, nan, nan],
                [3, nan, nan, 1, 4, 2],
                [3, nan, nan, 1, 4, 2],
            ],
        ).stack()

        factor_data = get_clean_factor_and_forward_returns(
            factor,
            prices,
            quantiles=quantiles,
            periods=range(0, after + 1),
            filter_zscore=False,
        )

        avgrt = average_cumulative_return_by_quantile(
            factor_data, prices, before, after, demeaned
        )

        # (quantile, "mean") followed by (quantile, "std") for each quantile
        arrays = [
            (q, stat)
            for q in range(1, quantiles + 1)
            for stat in ("mean", "std")
        ]
        index = MultiIndex.from_tuples(arrays, names=["factor_quantile", None])
        expected = DataFrame(
            index=index, columns=range(-before, after + 1), data=expected_vals
        )
        assert_frame_equal(avgrt, expected)
def makeMultiIndex(k=10, names=None, **kwargs):
    """Return a fixed 2x2 MultiIndex: ("foo", "bar") crossed with (1, 2).

    ``k`` is accepted but not used by the body.  ``names`` and any extra
    keyword arguments are forwarded to ``MultiIndex.from_product``.
    """
    level_values = (("foo", "bar"), (1, 2))
    return MultiIndex.from_product(level_values, names=names, **kwargs)
def test_reindex_lvl_preserves_names_when_target_is_list_or_array():
    """Reindexing to an empty list on either level keeps the index names."""
    # GH7774
    level_names = ["foo", "bar"]
    idx = MultiIndex.from_product([[0, 1], ["a", "b"]], names=level_names)
    for level in (0, 1):
        # reindex returns a tuple; its first element is the new index
        reindexed = idx.reindex([], level=level)[0]
        assert reindexed.names == ["foo", "bar"]
def test_repr_roundtrip_raises():
    """Evaluating the repr of a MultiIndex must raise ``TypeError``."""
    levels = [list("ab"), range(3)]
    mi = MultiIndex.from_product(levels, names=["first", "second"])
    # the repr is built from a locally constructed index, not external input
    with pytest.raises(TypeError):
        eval(repr(mi))
def test_to_html_multiindex_odd_even_truncate(self):
    """Check ``to_html`` row truncation on a three-level MultiIndex frame.

    The frame has 3 * 3 * 7 = 63 rows.  It is rendered twice, with
    ``max_rows=60`` and ``max_rows=56``, and each result is compared with a
    hard-coded expected HTML table.
    """
    # NOTE(review): the expected literals below appear whitespace-collapsed
    # in this view of the file (single spaces where the rendered HTML would
    # presumably have line breaks and indentation) -- confirm against the
    # upstream source before relying on them.
    # GH 14882 - Issue on truncation with odd length DataFrame
    mi = MultiIndex.from_product([[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], names=['a', 'b', 'c'])
    df = DataFrame({'n': range(len(mi))}, index=mi)
    result = df.to_html(max_rows=60)
    # Expected output: the "..." row sits in the innermost level, inside
    # the (200, 20) group, whose rowspans shrink to 19 and 5.
    expected = """\ <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th></th> <th></th> <th>n</th> </tr> <tr> <th>a</th> <th>b</th> <th>c</th> <th></th> </tr> </thead> <tbody> <tr> <th rowspan="21" valign="top">100</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>0</td> </tr> <tr> <th>2</th> <td>1</td> </tr> <tr> <th>3</th> <td>2</td> </tr> <tr> <th>4</th> <td>3</td> </tr> <tr> <th>5</th> <td>4</td> </tr> <tr> <th>6</th> <td>5</td> </tr> <tr> <th>7</th> <td>6</td> </tr> <tr> <th rowspan="7" valign="top">20</th> <th>1</th> <td>7</td> </tr> <tr> <th>2</th> <td>8</td> </tr> <tr> <th>3</th> <td>9</td> </tr> <tr> <th>4</th> <td>10</td> </tr> <tr> <th>5</th> <td>11</td> </tr> <tr> <th>6</th> <td>12</td> </tr> <tr> <th>7</th> <td>13</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>14</td> </tr> <tr> <th>2</th> <td>15</td> </tr> <tr> <th>3</th> <td>16</td> </tr> <tr> <th>4</th> <td>17</td> </tr> <tr> <th>5</th> <td>18</td> </tr> <tr> <th>6</th> <td>19</td> </tr> <tr> <th>7</th> <td>20</td> </tr> <tr> <th rowspan="19" valign="top">200</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>21</td> </tr> <tr> <th>2</th> <td>22</td> </tr> <tr> <th>3</th> <td>23</td> </tr> <tr> <th>4</th> <td>24</td> </tr> <tr> <th>5</th> <td>25</td> </tr> <tr> <th>6</th> <td>26</td> </tr> <tr> <th>7</th> <td>27</td> </tr> <tr> <th rowspan="5" valign="top">20</th> <th>1</th> <td>28</td> </tr> <tr> <th>2</th> <td>29</td> </tr> <tr> <th>...</th> <td>...</td> </tr> <tr> <th>6</th> <td>33</td> </tr> <tr> <th>7</th> <td>34</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>35</td> </tr> <tr> <th>2</th> <td>36</td> </tr> <tr> <th>3</th> <td>37</td> </tr> <tr> <th>4</th> <td>38</td> </tr> <tr> <th>5</th> <td>39</td> </tr> <tr> <th>6</th> <td>40</td> </tr> <tr> <th>7</th> <td>41</td> </tr> <tr> <th rowspan="21" valign="top">300</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>42</td> </tr> <tr> <th>2</th> <td>43</td> </tr> <tr> <th>3</th> <td>44</td> </tr> <tr> <th>4</th> <td>45</td> </tr> <tr> <th>5</th> <td>46</td> </tr> <tr> <th>6</th> <td>47</td> </tr> <tr> <th>7</th> <td>48</td> </tr> <tr> <th rowspan="7" valign="top">20</th> <th>1</th> <td>49</td> </tr> <tr> <th>2</th> <td>50</td> </tr> <tr> <th>3</th> <td>51</td> </tr> <tr> <th>4</th> <td>52</td> </tr> <tr> <th>5</th> <td>53</td> </tr> <tr> <th>6</th> <td>54</td> </tr> <tr> <th>7</th> <td>55</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>56</td> </tr> <tr> <th>2</th> <td>57</td> </tr> <tr> <th>3</th> <td>58</td> </tr> <tr> <th>4</th> <td>59</td> </tr> <tr> <th>5</th> <td>60</td> </tr> <tr> <th>6</th> <td>61</td> </tr> <tr> <th>7</th> <td>62</td> </tr> </tbody> </table>"""
    self.assertEqual(result, expected)

    # Test that ... appears in a middle level
    result = df.to_html(max_rows=56)
    # Expected output: the whole (200, 20) group is replaced by one "..."
    # row spanning the two inner levels; the 200 rowspan shrinks to 15.
    expected = """\ <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th></th> <th></th> <th>n</th> </tr> <tr> <th>a</th> <th>b</th> <th>c</th> <th></th> </tr> </thead> <tbody> <tr> <th rowspan="21" valign="top">100</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>0</td> </tr> <tr> <th>2</th> <td>1</td> </tr> <tr> <th>3</th> <td>2</td> </tr> <tr> <th>4</th> <td>3</td> </tr> <tr> <th>5</th> <td>4</td> </tr> <tr> <th>6</th> <td>5</td> </tr> <tr> <th>7</th> <td>6</td> </tr> <tr> <th rowspan="7" valign="top">20</th> <th>1</th> <td>7</td> </tr> <tr> <th>2</th> <td>8</td> </tr> <tr> <th>3</th> <td>9</td> </tr> <tr> <th>4</th> <td>10</td> </tr> <tr> <th>5</th> <td>11</td> </tr> <tr> <th>6</th> <td>12</td> </tr> <tr> <th>7</th> <td>13</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>14</td> </tr> <tr> <th>2</th> <td>15</td> </tr> <tr> <th>3</th> <td>16</td> </tr> <tr> <th>4</th> <td>17</td> </tr> <tr> <th>5</th> <td>18</td> </tr> <tr> <th>6</th> <td>19</td> </tr> <tr> <th>7</th> <td>20</td> </tr> <tr> <th rowspan="15" valign="top">200</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>21</td> </tr> <tr> <th>2</th> <td>22</td> </tr> <tr> <th>3</th> <td>23</td> </tr> <tr> <th>4</th> <td>24</td> </tr> <tr> <th>5</th> <td>25</td> </tr> <tr> <th>6</th> <td>26</td> </tr> <tr> <th>7</th> <td>27</td> </tr> <tr> <th>...</th> <th>...</th> <td>...</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>35</td> </tr> <tr> <th>2</th> <td>36</td> </tr> <tr> <th>3</th> <td>37</td> </tr> <tr> <th>4</th> <td>38</td> </tr> <tr> <th>5</th> <td>39</td> </tr> <tr> <th>6</th> <td>40</td> </tr> <tr> <th>7</th> <td>41</td> </tr> <tr> <th rowspan="21" valign="top">300</th> <th rowspan="7" valign="top">10</th> <th>1</th> <td>42</td> </tr> <tr> <th>2</th> <td>43</td> </tr> <tr> <th>3</th> <td>44</td> </tr> <tr> <th>4</th> <td>45</td> </tr> <tr> <th>5</th> <td>46</td> </tr> <tr> <th>6</th> <td>47</td> </tr> <tr> <th>7</th> <td>48</td> </tr> <tr> <th rowspan="7" valign="top">20</th> <th>1</th> <td>49</td> </tr> <tr> <th>2</th> <td>50</td> </tr> <tr> <th>3</th> <td>51</td> </tr> <tr> <th>4</th> <td>52</td> </tr> <tr> <th>5</th> <td>53</td> </tr> <tr> <th>6</th> <td>54</td> </tr> <tr> <th>7</th> <td>55</td> </tr> <tr> <th rowspan="7" valign="top">30</th> <th>1</th> <td>56</td> </tr> <tr> <th>2</th> <td>57</td> </tr> <tr> <th>3</th> <td>58</td> </tr> <tr> <th>4</th> <td>59</td> </tr> <tr> <th>5</th> <td>60</td> </tr> <tr> <th>6</th> <td>61</td> </tr> <tr> <th>7</th> <td>62</td> </tr> </tbody> </table>"""
    self.assertEqual(result, expected)
def setup_cache(self):
    """Return a two-level MultiIndex of integers crossed with dates.

    The first level is ``range(1000)``; the second is a ``date_range`` of
    100 periods starting at "1/1/2012".
    """
    outer_ints = range(1000)
    inner_dates = date_range(start="1/1/2012", periods=100)
    return MultiIndex.from_product([outer_ints, inner_dates])
def test_from_product_empty_zero_levels():
    """``MultiIndex.from_product`` raises ValueError for an empty list."""
    # 0 levels
    expected_message = "Must pass non-zero number of levels/codes"
    with pytest.raises(ValueError, match=expected_message):
        MultiIndex.from_product([])
def test_cython_transform_frame(op, args, targop):
    """Compare groupby ``transform(op)`` and ``.<op>()`` with ``apply(targop)``.

    The comparison runs on a frame with its default index and on a copy that
    carries a two-level MultiIndex, grouping by an array of labels, by index
    level 0 and by the "string" column.

    Parameters
    ----------
    op : str
        Name of the groupby method under test.
    args : sequence
        Extra positional arguments forwarded to ``op``.
    targop : callable
        Reference implementation, run through ``groupby.apply``.
    """
    float_values = Series(np.random.randn(1000))
    float_with_gaps = float_values.copy()
    float_with_gaps.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)

    letters = list("qwertyuiopasdfghjklz")
    letters_with_gap = list(letters)
    letters_with_gap[5] = np.nan

    column_order = [
        "float",
        "float_missing",
        "int",
        "datetime",
        "timedelta",
        "string",
        "string_missing",
    ]
    flat = DataFrame(
        {
            "float": float_values,
            "float_missing": float_with_gaps,
            "int": [1, 1, 1, 1, 2] * 200,
            "datetime": date_range("1990-1-1", periods=1000),
            "timedelta": pd.timedelta_range(1, freq="s", periods=1000),
            "string": letters * 50,
            "string_missing": letters_with_gap * 50,
        },
        columns=column_order,
    )
    flat["cat"] = flat["string"].astype("category")

    nested = flat.copy()
    nested.index = MultiIndex.from_product([range(100), range(10)])

    numeric_columns = ["float", "int", "float_missing"]
    # Not exercised: {"by": "string_missing"} and {"by": ["int", "string"]}.
    groupings = [{"by": labels}, {"level": 0}, {"by": "string"}]

    # DataFrame - single index and MultiIndex;
    # group by values, index level, columns
    for frame in (flat, nested):
        for grouping in groupings:
            gb = frame.groupby(**grouping)

            # allowlisted methods set the selection before applying;
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == "shift":
                gb._set_group_selection()

            if op == "shift" or "int" in grouping:
                expected = gb.apply(targop)
            else:
                # numeric apply fastpath promotes dtype, so the int and
                # float columns are applied separately and concatenated
                int_part = gb[["int"]].apply(targop)
                float_part = gb[["float", "float_missing"]].apply(targop)
                expected = concat([float_part, int_part], axis=1)
            expected = expected.sort_index(axis=1)

            via_transform = gb.transform(op, *args).sort_index(axis=1)
            tm.assert_frame_equal(expected, via_transform)
            via_method = getattr(gb, op)(*args).sort_index(axis=1)
            tm.assert_frame_equal(expected, via_method)

            # individual columns
            for name in frame:
                if op == "shift" or name in numeric_columns:
                    expected = gb[name].apply(targop)
                    expected.name = name
                    tm.assert_series_equal(
                        expected, gb[name].transform(op, *args))
                    tm.assert_series_equal(
                        expected, getattr(gb[name], op)(*args))
                else:
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[name].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[name], op)()
def time_multiindex_from_iterables(self):
    """Build a MultiIndex from the product of ``self.iterables``.

    The resulting index is discarded; nothing is returned.
    """
    iterables = self.iterables
    MultiIndex.from_product(iterables)
def setup(self): mi = MultiIndex.from_product([range(100), range(100)]) self.s = Series(np.random.randn(10000), index=mi)
def test_reset_index_datetime(self, tz_naive_fixture): # GH#3950 tz = tz_naive_fixture idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") idx2 = Index(range(5), name="idx2", dtype="int64") idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame( { "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"] }, index=idx, ) expected = DataFrame( { "idx1": [ datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5), ], "idx2": np.arange(5, dtype="int64"), "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "a", "b"], ) expected["idx1"] = expected["idx1"].apply( lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) idx3 = date_range("1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3") idx = MultiIndex.from_arrays([idx1, idx2, idx3]) df = DataFrame( { "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"] }, index=idx, ) expected = DataFrame( { "idx1": [ datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5), ], "idx2": np.arange(5, dtype="int64"), "idx3": [ datetime(2012, 1, 1), datetime(2012, 2, 1), datetime(2012, 3, 1), datetime(2012, 4, 1), datetime(2012, 5, 1), ], "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "idx3", "a", "b"], ) expected["idx1"] = expected["idx1"].apply( lambda d: Timestamp(d, tz=tz)) expected["idx3"] = expected["idx3"].apply( lambda d: Timestamp(d, tz="Europe/Paris")) tm.assert_frame_equal(df.reset_index(), expected) # GH#7793 idx = MultiIndex.from_product([["a", "b"], date_range("20130101", periods=3, tz=tz)]) df = DataFrame(np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx) expected = DataFrame( { "level_0": "a a a b b b".split(), "level_1": [ datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), ] * 2, "a": np.arange(6, dtype="int64"), }, 
columns=["level_0", "level_1", "a"], ) expected["level_1"] = expected["level_1"].apply( lambda d: Timestamp(d, freq="D", tz=tz)) result = df.reset_index() tm.assert_frame_equal(result, expected)
def test_setitem_multiindex(self): for index_fn in ("loc", ): def assert_equal(a, b): assert a == b def check(target, indexers, value, compare_fn, expected=None): fn = getattr(target, index_fn) fn.__setitem__(indexers, value) result = fn.__getitem__(indexers) if expected is None: expected = value compare_fn(result, expected) # GH7190 index = MultiIndex.from_product( [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"]) t, n = 0, 2 df = DataFrame( np.nan, columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index, ) check(target=df, indexers=((t, n), "X"), value=0, compare_fn=assert_equal) df = DataFrame( -999, columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index) check(target=df, indexers=((t, n), "X"), value=1, compare_fn=assert_equal) df = DataFrame( columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index) check(target=df, indexers=((t, n), "X"), value=2, compare_fn=assert_equal) # gh-7218: assigning with 0-dim arrays df = DataFrame( -999, columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index) check( target=df, indexers=((t, n), "X"), value=np.array(3), compare_fn=assert_equal, expected=3, ) # GH5206 df = DataFrame(np.arange(25).reshape(5, 5), columns="A,B,C,D,E".split(","), dtype=float) df["F"] = 99 row_selection = df["A"] % 2 == 0 col_selection = ["B", "C"] df.loc[row_selection, col_selection] = df["F"] output = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"]) tm.assert_frame_equal(df.loc[row_selection, col_selection], output) check( target=df, indexers=(row_selection, col_selection), value=df["F"], compare_fn=tm.assert_frame_equal, expected=output, ) # GH11372 idx = MultiIndex.from_product([["A", "B", "C"], date_range("2015-01-01", "2015-04-01", freq="MS")]) cols = MultiIndex.from_product([["foo", "bar"], date_range("2016-01-01", "2016-02-01", freq="MS")]) df = DataFrame(np.random.random((12, 4)), index=idx, columns=cols) subidx = MultiIndex.from_tuples([("A", Timestamp("2015-01-01")), ("A", 
Timestamp("2015-02-01"))]) subcols = MultiIndex.from_tuples([("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01")) ]) vals = DataFrame(np.random.random((2, 2)), index=subidx, columns=subcols) check( target=df, indexers=(subidx, subcols), value=vals, compare_fn=tm.assert_frame_equal, ) # set all columns vals = DataFrame(np.random.random((2, 4)), index=subidx, columns=cols) check( target=df, indexers=(subidx, slice(None, None, None)), value=vals, compare_fn=tm.assert_frame_equal, ) # identity copy = df.copy() check( target=df, indexers=(df.index, df.columns), value=df, compare_fn=tm.assert_frame_equal, expected=copy, )
def test_read_excel_multiindex(self, read_ext):
    """Read each sheet of the "testmultiindex" workbook and compare frames.

    A single ``expected`` frame is re-labelled step by step (index and/or
    columns swapped between flat labels and a 2 x 2 MultiIndex) to match the
    layout of the sheet read next, so the order of the steps matters.

    Parameters
    ----------
    read_ext : str
        File extension appended to "testmultiindex" to form the file name.
    """
    # see gh-4679
    if pd.read_excel.keywords["engine"] == "pyxlsb":
        pytest.xfail("Sheets containing datetimes not supported by pyxlsb")

    mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
    workbook = "testmultiindex" + read_ext
    flat_labels = ["a", "b", "c", "d"]

    def read_sheet(sheet, **options):
        return pd.read_excel(workbook, sheet_name=sheet, **options)

    # "mi_column" sheet
    expected = DataFrame(
        [
            [1, 2.5, pd.Timestamp("2015-01-01"), True],
            [2, 3.5, pd.Timestamp("2015-01-02"), False],
            [3, 4.5, pd.Timestamp("2015-01-03"), False],
            [4, 5.5, pd.Timestamp("2015-01-04"), True],
        ],
        columns=mi,
    )
    actual = read_sheet("mi_column", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # "mi_index" sheet
    expected.index = mi
    expected.columns = flat_labels
    actual = read_sheet("mi_index", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "both" sheet
    expected.columns = mi
    actual = read_sheet("both", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "mi_index_name" sheet
    expected.columns = flat_labels
    expected.index = mi.set_names(["ilvl1", "ilvl2"])
    actual = read_sheet("mi_index_name", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "mi_column_name" sheet
    expected.index = list(range(4))
    expected.columns = mi.set_names(["c1", "c2"])
    actual = read_sheet("mi_column_name", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # see gh-11317
    # "name_with_int" sheet
    int_named = mi.set_levels([1, 2], level=1)
    expected.columns = int_named.set_names(["c1", "c2"])
    actual = read_sheet("name_with_int", index_col=0, header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_name" sheet
    expected.columns = mi.set_names(["c1", "c2"])
    expected.index = mi.set_names(["ilvl1", "ilvl2"])
    actual = read_sheet("both_name", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_name_skiprows" sheet: same expected frame, two rows skipped
    actual = read_sheet(
        "both_name_skiprows",
        index_col=[0, 1],
        header=[0, 1],
        skiprows=2,
    )
    tm.assert_frame_equal(actual, expected)
def test_is_monotonic_decreasing():
    """Check ``is_monotonic_decreasing`` and its strict variant on MultiIndex.

    Every case is asserted both on the MultiIndex itself and on
    ``Index(i.values)``, so the two code paths must agree (the mixed-level
    case with a missing code is checked on the MultiIndex only).
    """
    # both levels descending
    i = MultiIndex.from_product(
        [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"])
    assert i.is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    # Strictness is checked on the rebuilt Index as well, as in every
    # other case below.
    assert Index(i.values)._is_strictly_monotonic_decreasing is True

    # first level ascending
    i = MultiIndex.from_product(
        [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # second level ascending
    i = MultiIndex.from_product(
        [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # NaN inside the first level
    i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # string ordering
    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True

    # mixed levels, hits the TypeError
    i = MultiIndex(
        levels=[
            [4, 3, 2, 1],
            [
                "nl0000301109",
                "nl0000289965",
                "nl0000289783",
                "lu0197800237",
                "gb00b03mlx29",
            ],
        ],
        codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
        names=["household_id", "asset_id"],
    )
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False

    # empty
    i = MultiIndex.from_arrays([[], []])
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True
def test_sort_index_and_reconstruction(self):
    """Sorted frames are lexsorted however their MultiIndex was constructed.

    The same four-row frame is built with ``from_product``, with the
    ``MultiIndex`` constructor and with ``pd.concat(keys=...)``; after
    ``sort_index`` each must equal the ``from_tuples`` reference. A second
    part sorts columns whose second level was replaced by datetimes.
    """
    # GH#15622
    # lexsortedness should be identical
    # across MultiIndex construction methods
    stacked_rows = [[1, 1], [2, 2], [1, 1], [2, 2]]
    base = DataFrame([[1, 1], [2, 2]], index=list("ab"))

    reference_index = MultiIndex.from_tuples(
        [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")])
    expected = DataFrame(stacked_rows, index=reference_index)
    assert expected.index.is_lexsorted()

    product_index = MultiIndex.from_product([[0.5, 0.8], list("ab")])
    result = DataFrame(stacked_rows, index=product_index).sort_index()
    assert result.index.is_lexsorted()
    assert result.index.is_monotonic
    tm.assert_frame_equal(result, expected)

    explicit_index = MultiIndex(
        levels=[[0.5, 0.8], ["a", "b"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
    )
    result = DataFrame(stacked_rows, index=explicit_index).sort_index()
    assert result.index.is_lexsorted()
    tm.assert_frame_equal(result, expected)

    # keys are given out of order, so the concatenated frame needs sorting
    result = pd.concat([base, base], keys=[0.8, 0.5]).sort_index()
    assert result.index.is_lexsorted()
    assert result.index.is_monotonic
    tm.assert_frame_equal(result, expected)

    # GH#14015
    string_dated = MultiIndex.from_tuples(
        [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
        names=["l1", "Date"],
    )
    df = DataFrame([[1, 2], [6, 7]], columns=string_dated)
    parsed_dates = pd.to_datetime(df.columns.levels[1])
    df.columns = df.columns.set_levels(parsed_dates, level=1)
    assert not df.columns.is_lexsorted()
    assert not df.columns.is_monotonic

    result = df.sort_index(axis=1)
    assert result.columns.is_lexsorted()
    assert result.columns.is_monotonic

    result = df.sort_index(axis=1, level=1)
    assert result.columns.is_lexsorted()
    assert result.columns.is_monotonic
def test_from_product_empty_one_level():
    """``from_product`` of one empty iterable keeps the level and its name."""
    level_names = ["A"]
    mi = MultiIndex.from_product([[]], names=level_names)

    only_level = mi.levels[0]
    tm.assert_index_equal(only_level, pd.Index([], name="A"))
    assert mi.names == ["A"]
("foo", "two", "max"), ("bar", "one", "min"), ("bar", "one", "max"), ("bar", "three", "min"), ("bar", "three", "max"), ], names=["A", "B", None], ), [1, 1, 3, 3, 2, 2, 4, 4], ), ( False, MultiIndex.from_product( [ CategoricalIndex(["bar", "foo"], ordered=False), CategoricalIndex(["one", "three", "two"], ordered=False), Index(["min", "max"]), ], names=["A", "B", None], ), [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], ), ( None, MultiIndex.from_product( [ CategoricalIndex(["bar", "foo"], ordered=False), CategoricalIndex(["one", "three", "two"], ordered=False), Index(["min", "max"]), ], names=["A", "B", None], ),
def test_mi_indexing_list_nonexistent_raises():
    """``.loc`` with a list of absent labels on a MultiIndex raises KeyError."""
    # GH 15452
    two_by_two = MultiIndex.from_product([[1, 2], ["a", "b"]])
    s = Series(range(4), index=two_by_two)
    missing_labels = ["not", "found"]
    with pytest.raises(KeyError, match="\\['not' 'found'\\] not in index"):
        s.loc[missing_labels]
def test_loader_given_multiple_columns(self):
    """Run a pipeline whose terms draw columns from two recording loaders.

    Three ``RollingSumSum`` terms with window lengths 2, 3 and 3 are
    computed from constant-valued columns. The expected frame is derived
    from those constants, and the calls recorded by each loader are compared
    with the column groupings built by ``ColumnArgs.sorted_by_ds``.
    """

    class Loader1DataSet1(DataSet):
        col1 = Column(float)
        col2 = Column(float32)

    class Loader1DataSet2(DataSet):
        col1 = Column(float32)
        col2 = Column(float32)

    class Loader2DataSet(DataSet):
        col1 = Column(float32)
        col2 = Column(float32)

    constants1 = {
        Loader1DataSet1.col1: 1,
        Loader1DataSet1.col2: 2,
        Loader1DataSet2.col1: 3,
        Loader1DataSet2.col2: 4,
    }
    loader1 = RecordingPrecomputedLoader(
        constants=constants1, dates=self.dates, sids=self.assets)

    constants2 = {Loader2DataSet.col1: 5, Loader2DataSet.col2: 6}
    loader2 = RecordingPrecomputedLoader(
        constants=constants2, dates=self.dates, sids=self.assets)

    def pick_loader(column):
        # Columns of Loader2DataSet go to loader2; everything else to loader1.
        if column.dataset == Loader2DataSet:
            return loader2
        return loader1

    engine = SimplePipelineEngine(
        pick_loader, self.dates, self.asset_finder)

    pipe_col1 = RollingSumSum(
        inputs=[
            Loader1DataSet1.col1,
            Loader1DataSet2.col1,
            Loader2DataSet.col1,
        ],
        window_length=2,
    )
    pipe_col2 = RollingSumSum(
        inputs=[
            Loader1DataSet1.col2,
            Loader1DataSet2.col2,
            Loader2DataSet.col2,
        ],
        window_length=3,
    )
    pipe_col3 = RollingSumSum(inputs=[Loader2DataSet.col1], window_length=3)

    columns = OrderedDict([
        ('pipe_col1', pipe_col1),
        ('pipe_col2', pipe_col2),
        ('pipe_col3', pipe_col3),
    ])

    # start index is >= the largest window length - 1
    start_date = self.dates[2]
    end_date = self.dates[-1]
    result = engine.run_pipeline(
        Pipeline(columns=columns), start_date, end_date)

    min_window = min(
        pip_col.window_length for pip_col in itervalues(columns))
    col_to_val = ChainMap(constants1, constants2)

    # Per output column: sum of its inputs' constants times its window length.
    vals = {}
    for name, pipe_col in iteritems(columns):
        inputs_total = sum(col_to_val[col] for col in pipe_col.inputs)
        vals[name] = inputs_total * pipe_col.window_length

    index = MultiIndex.from_product([self.dates[2:], self.assets])

    def expected_for_col(col):
        n_dates, n_assets = index.levshape[0], index.levshape[1]
        # Terms with a window longer than the shortest one start with
        # `offset` dates of NaN.
        offset = columns[col].window_length - min_window
        leading_nans = full(offset * n_assets, nan)
        filled = full((n_dates - offset) * n_assets, vals[col], float)
        return concatenate([leading_nans, filled])

    expected = DataFrame(
        data={col: expected_for_col(col) for col in vals},
        index=index,
        columns=columns,
    )
    assert_frame_equal(result, expected)

    expected_loader1_calls = {
        ColumnArgs.sorted_by_ds(Loader1DataSet1.col1, Loader1DataSet2.col1),
        ColumnArgs.sorted_by_ds(Loader1DataSet1.col2, Loader1DataSet2.col2),
    }
    self.assertEqual(set(loader1.load_calls), expected_loader1_calls)

    expected_loader2_calls = {
        ColumnArgs.sorted_by_ds(Loader2DataSet.col1, Loader2DataSet.col2),
    }
    self.assertEqual(set(loader2.load_calls), expected_loader2_calls)
def test_from_product_empty_two_levels(first, second): names = ["A", "B"] result = MultiIndex.from_product([first, second], names=names) expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) tm.assert_index_equal(result, expected)
def test_per_axis_per_level_getitem(self):
    """Exercise ``.loc`` with per-axis, per-level tuples on MultiIndexes.

    Covers slices and lists per level on a four-level index, combined
    row/column selection on a small two-level frame, a multi-level Series,
    boolean indexers, an ambiguous key, and the error raised once the index
    is no longer lexsorted.
    """

    def rows_where(obj, first_labels, third_labels):
        # Select by explicit key list: keep every 4-tuple whose first and
        # third entries are among the given labels.
        keys = [
            (a, b, c, d)
            for a, b, c, d in obj.index.values
            if a in first_labels and c in third_labels
        ]
        return obj.loc[keys]

    a1_to_a3 = ('A1', 'A2', 'A3')

    # GH6134
    # example test case
    ix = MultiIndex.from_product(
        [_mklbl('A', 5), _mklbl('B', 7), _mklbl('C', 4), _mklbl('D', 2)])
    df = DataFrame(np.arange(len(ix.get_values())), index=ix)

    result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
    expected = rows_where(df, a1_to_a3, ('C1', 'C3'))
    tm.assert_frame_equal(result, expected)

    expected = rows_where(df, a1_to_a3, ('C1', 'C2', 'C3'))
    result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples(
        [('A', 1), ('A', 2), ('A', 3), ('B', 1)], names=['one', 'two'])
    columns = MultiIndex.from_tuples(
        [('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')],
        names=['lvl0', 'lvl1'])

    df = DataFrame(
        np.arange(16, dtype='int64').reshape(4, 4),
        index=index,
        columns=columns)
    df = df.sort_index(axis=0).sort_index(axis=1)

    everything = slice(None)

    # identity
    result = df.loc[(everything, everything), :]
    tm.assert_frame_equal(result, df)
    result = df.loc[(everything, everything), (everything, everything)]
    tm.assert_frame_equal(result, df)
    result = df.loc[:, (everything, everything)]
    tm.assert_frame_equal(result, df)

    # index
    result = df.loc[(everything, [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc[(everything, 1), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # columns
    result = df.loc[:, (everything, ['foo'])]
    expected = df.iloc[:, [1, 3]]
    tm.assert_frame_equal(result, expected)

    # both
    result = df.loc[(everything, 1), (everything, ['foo'])]
    expected = df.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc['A', 'a']
    expected = DataFrame(
        dict(bar=[1, 5, 9], foo=[0, 4, 8]),
        index=Index([1, 2, 3], name='two'),
        columns=Index(['bar', 'foo'], name='lvl1'))
    tm.assert_frame_equal(result, expected)

    result = df.loc[(everything, [1, 2]), :]
    expected = df.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    # multi-level series
    s = Series(np.arange(len(ix.get_values())), index=ix)
    result = s.loc['A1':'A3', :, ['C1', 'C3']]
    expected = rows_where(s, a1_to_a3, ('C1', 'C3'))
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = df.loc[(everything, df.loc[:, ('a', 'bar')] > 5), :]
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(result, expected)

    # a two-element boolean mask against four rows must be rejected
    self.assertRaises(
        ValueError,
        lambda: df.loc[(everything, np.array([True, False])), :])

    # ambiguous cases
    # these can be multiply interpreted (e.g. in this case
    # as df.loc[slice(None),[1]] as well
    self.assertRaises(KeyError, lambda: df.loc[everything, [1]])

    result = df.loc[(everything, [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # not lexsorted
    self.assertEqual(df.index.lexsort_depth, 2)
    df = df.sort_index(level=1, axis=0)
    self.assertEqual(df.index.lexsort_depth, 0)
    unsorted_message = (
        'MultiIndex Slicing requires the index to be fully '
        r'lexsorted tuple len \(2\), lexsort depth \(0\)')
    with tm.assertRaisesRegexp(UnsortedIndexError, unsorted_message):
        df.loc[(everything, df.loc[:, ('a', 'bar')] > 5), :]
def test_is_monotonic_increasing():
    """Check ``is_monotonic_increasing`` and its strict variant on MultiIndex.

    Every case is asserted both on the MultiIndex itself and on
    ``Index(i.values)``, so the two code paths must agree (the mixed-level
    case with a missing code is checked on the MultiIndex only).
    """
    # both levels ascending
    i = MultiIndex.from_product([np.arange(10), np.arange(10)],
                                names=['one', 'two'])
    assert i.is_monotonic_increasing is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values).is_monotonic_increasing is True
    # Strictness is checked on the rebuilt Index as well, as in every
    # other case below.
    assert Index(i.values)._is_strictly_monotonic_increasing is True

    # first level descending
    i = MultiIndex.from_product([np.arange(10, 0, -1), np.arange(10)],
                                names=['one', 'two'])
    assert i.is_monotonic_increasing is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic_increasing is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    # second level descending
    i = MultiIndex.from_product([np.arange(10), np.arange(10, 0, -1)],
                                names=['one', 'two'])
    assert i.is_monotonic_increasing is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic_increasing is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    # NaN inside the first level
    i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']])
    assert i.is_monotonic_increasing is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic_increasing is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    # string ordering
    # (``codes`` is the keyword that replaced ``labels`` in the MultiIndex
    # constructor.)
    i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                           ['one', 'two', 'three']],
                   codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                          [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                   names=['first', 'second'])
    assert i.is_monotonic_increasing is False
    assert Index(i.values).is_monotonic_increasing is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'],
                           ['mom', 'next', 'zenith']],
                   codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                          [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                   names=['first', 'second'])
    assert i.is_monotonic_increasing is True
    assert Index(i.values).is_monotonic_increasing is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values)._is_strictly_monotonic_increasing is True

    # mixed levels, hits the TypeError
    i = MultiIndex(
        levels=[[1, 2, 3, 4],
                ['gb00b03mlx29', 'lu0197800237', 'nl0000289783',
                 'nl0000289965', 'nl0000301109']],
        codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
        names=['household_id', 'asset_id'])
    assert i.is_monotonic_increasing is False
    assert i._is_strictly_monotonic_increasing is False

    # empty
    i = MultiIndex.from_arrays([[], []])
    assert i.is_monotonic_increasing is True
    assert Index(i.values).is_monotonic_increasing is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values)._is_strictly_monotonic_increasing is True
def test_binary_ops_align(self): # test aligning binary ops # GH 6681 index = MultiIndex.from_product( [list('abc'), ['one', 'two', 'three'], [1, 2, 3]], names=['first', 'second', 'third']) df = DataFrame(np.arange(27 * 3).reshape(27, 3), index=index, columns=['value1', 'value2', 'value3']).sort_index() idx = pd.IndexSlice for op in ['add', 'sub', 'mul', 'div', 'truediv']: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) result = getattr(df, op)(x, level='third', axis=0) expected = pd.concat([ opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems() ]).sort_index() assert_frame_equal(result, expected) x = Series([1.0, 10.0], ['two', 'three']) result = getattr(df, op)(x, level='second', axis=0) expected = (pd.concat([ opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems() ]).reindex_like(df).sort_index()) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']]) df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx) s = pd.Series({'a': 1, 'b': 2}) df2 = df.copy() df2.columns.names = ['lvl0', 'lvl1'] s2 = s.copy() s2.index.name = 'lvl1' # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) res5 = df2.mul(s, axis=1, level='lvl1') res6 = df2.mul(s2, axis=1, level='lvl1') exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'), columns=midx) for res in [res1, res2]: assert_frame_equal(res, exp) exp.columns.names = ['lvl0', 'lvl1'] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp)
def test_missing_key_raises_keyerror2(self):
    """``.loc`` with a missing second-level key raises KeyError for the pair."""
    # GH#21168 KeyError, not "IndexingError: Too many indexers"
    binary_grid = MultiIndex.from_product([[0, 1], [0, 1]])
    ser = Series(-1, index=binary_grid)

    with pytest.raises(KeyError, match=r"\(0, 3\)"):
        ser.loc[0, 3]