def test_compress_group_combinations(self): # ~ 40000000 possible unique groups key1 = tm.rands_array(10, 10000) key1 = np.tile(key1, 2) key2 = key1[::-1] df = DataFrame({'key1': key1, 'key2': key2, 'value1': np.random.randn(20000)}) df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], 'value2': np.random.randn(10000)}) # just to hit the label compression code path merge(df, df2, how='outer')
def test_left_merge_na_buglet(self): left = DataFrame({'id': list('abcde'), 'v1': randn(5), 'v2': randn(5), 'dummy': list('abcde'), 'v3': randn(5)}, columns=['id', 'v1', 'v2', 'dummy', 'v3']) right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) result = merge(left, right, on='id', how='left') rdf = right.drop(['id'], axis=1) expected = left.join(rdf) tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match(self): left = DataFrame([ ['c', 0], ['b', 1], ['a', 2], ['b', 3]], columns=['tag', 'val'], index=[2, 0, 1, 3]) right = (DataFrame([ ['a', 'v'], ['c', 'w'], ['c', 'x'], ['d', 'y'], ['a', 'z'], ['c', 'r'], ['e', 'q'], ['c', 's']], columns=['tag', 'char']) .set_index('tag')) result = left.join(right, on='tag', how='left') expected = DataFrame([ ['c', 0, 'w'], ['c', 0, 'x'], ['c', 0, 'r'], ['c', 0, 's'], ['b', 1, nan], ['a', 2, 'v'], ['a', 2, 'z'], ['b', 3, nan]], columns=['tag', 'val', 'char'], index=[2, 2, 2, 2, 0, 1, 1, 3]) tm.assert_frame_equal(result, expected) result = left.join(right, on='tag', how='left', sort=True) expected2 = expected.sort_values('tag', kind='mergesort') tm.assert_frame_equal(result, expected2) # GH7331 - maintain left frame order in left merge result = merge(left, right.reset_index(), how='left', on='tag') expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected)
def run_asserts(left, right, sort):
    # Check a left join of ``left`` with ``right`` on ``icols`` and its
    # ``merge`` equivalent.  ``icols`` and ``bind_cols`` are not defined
    # here; they are resolved from the surrounding scope.
    joined = left.join(right, on=icols, how='left', sort=sort)

    # the join must not produce fewer rows than ``left`` has
    assert len(left) <= len(joined)
    for column in ('4th', '5th'):
        assert not joined[column].isna().any()

    fourth = joined['4th']
    tm.assert_series_equal(fourth, -joined['5th'], check_names=False)

    # '4th' must be reproducible from every column except the last two
    rebuilt = bind_cols(joined.iloc[:, :-2])
    tm.assert_series_equal(fourth, rebuilt, check_names=False)
    assert rebuilt.name is None

    if sort:
        stable_sorted = joined.sort_values(icols, kind='mergesort')
        tm.assert_frame_equal(joined, stable_sorted)

    merged = merge(left, right.reset_index(),
                   on=icols, sort=sort, how='left')

    # replace the join's index with 0..n-1 before comparing with merge output
    joined.index = np.arange(len(joined))
    tm.assert_frame_equal(merged, joined)
def run_asserts(left, right, sort):
    # Shared checks for ``left.join(right, on=icols, how="left")`` and the
    # matching ``merge`` call.  ``icols`` and ``bind_cols`` come from outside
    # this function.
    join_result = left.join(right, on=icols, how="left", sort=sort)
    col_4th = join_result["4th"]
    col_5th = join_result["5th"]

    # row count may grow but must not shrink relative to ``left``
    assert not len(join_result) < len(left)
    assert not col_4th.isna().any()
    assert not col_5th.isna().any()
    tm.assert_series_equal(col_4th, -col_5th, check_names=False)

    # recombining all but the trailing two columns must give back "4th"
    combined = bind_cols(join_result.iloc[:, :-2])
    tm.assert_series_equal(col_4th, combined, check_names=False)
    assert combined.name is None

    if sort:
        tm.assert_frame_equal(
            join_result, join_result.sort_values(icols, kind="mergesort")
        )

    merge_result = merge(
        left, right.reset_index(), on=icols, sort=sort, how="left"
    )

    # give the join result a positional index before the final comparison
    join_result.index = np.arange(len(join_result))
    tm.assert_frame_equal(merge_result, join_result)
def test_join_multi_levels2(self):
    """Merge household holdings with per-asset log returns on 'asset_id'."""
    # some more advanced merges
    # GH6360
    result_index = ["household_id", "asset_id", "t"]
    value_columns = ["share", "log_return"]

    household = DataFrame(
        {
            "household_id": [1, 2, 2, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000301109",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
                np.nan,
            ],
            "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
        },
        columns=["household_id", "asset_id", "share"],
    )
    household = household.set_index(["household_id", "asset_id"])

    log_return = DataFrame(
        {
            "asset_id": [
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
            ],
            "t": [233, 234, 235, 180, 181],
            "log_return": [
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
            ],
        }
    )
    log_return = log_return.set_index(["asset_id", "t"])

    def merge_on_asset(how):
        # flatten both frames, merge on the shared column, then re-index
        flat = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how=how,
        )
        return flat.set_index(result_index)

    def as_expected(data):
        # shape literal data like the merge output: same index and columns
        frame = DataFrame(data).set_index(result_index)
        return frame.reindex(columns=value_columns)

    # this is the equivalency
    inner_expected = as_expected(
        {
            "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
            "asset_id": [
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
            ],
            "t": [233, 234, 235, 233, 234, 235, 180, 181],
            "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
            "log_return": [
                0.09604978,
                -0.06524096,
                0.03532373,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
            ],
        }
    )
    tm.assert_frame_equal(merge_on_asset("inner"), inner_expected)

    outer_expected = as_expected(
        {
            "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000301109",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
                "nl0000289965",
                None,
            ],
            "t": [
                None,
                None,
                233,
                234,
                235,
                233,
                234,
                235,
                180,
                181,
                None,
                None,
            ],
            "share": [
                1.0,
                0.4,
                0.6,
                0.6,
                0.6,
                0.15,
                0.15,
                0.15,
                0.6,
                0.6,
                0.25,
                1.0,
            ],
            "log_return": [
                None,
                None,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
                None,
                None,
            ],
        }
    )
    tm.assert_frame_equal(merge_on_asset("outer"), outer_expected)
def test_join_multi_levels(self):
    """Join a single-level frame to a two-level frame via the shared name."""
    # GH 3662
    # merge multi-levels
    portfolio_index = ["household_id", "asset_id"]

    household = DataFrame(
        {
            "household_id": [1, 2, 3],
            "male": [0, 1, 0],
            "wealth": [196087.3, 316478.7, 294750],
        },
        columns=["household_id", "male", "wealth"],
    )
    household = household.set_index("household_id")

    portfolio = DataFrame(
        {
            "household_id": [1, 2, 2, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
                np.nan,
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
                np.nan,
            ],
            "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
        },
        columns=["household_id", "asset_id", "name", "share"],
    )
    portfolio = portfolio.set_index(portfolio_index)

    # inner join: household 4 has no household row and must drop out
    inner_expected = DataFrame(
        {
            "male": [0, 1, 1, 0, 0, 0],
            "wealth": [
                196087.3,
                316478.7,
                316478.7,
                294750.0,
                294750.0,
                294750.0,
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
            ],
            "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
            "household_id": [1, 2, 2, 3, 3, 3],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
            ],
        }
    )
    inner_expected = inner_expected.set_index(portfolio_index)
    inner_expected = inner_expected.reindex(
        columns=["male", "wealth", "name", "share"]
    )

    inner_joined = household.join(portfolio, how="inner")
    tm.assert_frame_equal(inner_joined, inner_expected)

    # equivalency
    inner_merged = merge(
        household.reset_index(),
        portfolio.reset_index(),
        on=["household_id"],
        how="inner",
    )
    tm.assert_frame_equal(
        inner_merged.set_index(portfolio_index), inner_expected
    )

    # outer join: the unmatched (4, NaN) portfolio row is appended and only
    # carries a value for "share"
    unmatched_row = DataFrame(
        {"share": [1.00]},
        index=MultiIndex.from_tuples([(4, np.nan)], names=portfolio_index),
    )
    outer_expected = concat(
        [inner_expected, unmatched_row], axis=0, sort=True
    )
    outer_expected = outer_expected.reindex(columns=inner_expected.columns)

    outer_joined = household.join(portfolio, how="outer")
    tm.assert_frame_equal(outer_joined, outer_expected)

    # invalid cases
    household.index.name = "foo"
    no_overlap_msg = "cannot join with no overlapping index names"
    with pytest.raises(ValueError, match=no_overlap_msg):
        household.join(portfolio, how="inner")

    portfolio2 = portfolio.copy()
    # NOTE(review): the value returned by set_names is discarded here, so
    # this statement presumably leaves portfolio2's index names untouched --
    # confirm whether an in-place rename was intended.
    portfolio2.index.set_names(["household_id", "foo"])

    overlap_msg = "columns overlap but no suffix specified"
    with pytest.raises(ValueError, match=overlap_msg):
        portfolio2.join(portfolio, how="inner")
def test_join_multi_levels2(self):
    """Inner and outer merge of holdings with log returns on 'asset_id'."""
    # some more advanced merges
    # GH6360
    household_data = {
        'household_id': [1, 2, 2, 3, 3, 3, 4],
        'asset_id': ["nl0000301109", "nl0000301109", "gb00b03mlx29",
                     "gb00b03mlx29", "lu0197800237", "nl0000289965",
                     np.nan],
        'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
    }
    household = DataFrame(household_data,
                          columns=['household_id', 'asset_id', 'share'])
    household = household.set_index(['household_id', 'asset_id'])

    log_return_data = {
        'asset_id': ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                     "lu0197800237", "lu0197800237"],
        't': [233, 234, 235, 180, 181],
        'log_return': [.09604978, -.06524096, .03532373,
                       .03025441, .036997],
    }
    log_return = DataFrame(log_return_data).set_index(["asset_id", "t"])

    # inner merge: only assets present in both frames survive
    inner_data = {
        'household_id': [2, 2, 2, 3, 3, 3, 3, 3],
        'asset_id': ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                     "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                     "lu0197800237", "lu0197800237"],
        't': [233, 234, 235, 233, 234, 235, 180, 181],
        'share': [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
        'log_return': [.09604978, -.06524096, .03532373,
                       .09604978, -.06524096, .03532373,
                       .03025441, .036997],
    }
    inner_expected = DataFrame(inner_data)
    inner_expected = inner_expected.set_index(
        ["household_id", "asset_id", "t"])
    inner_expected = inner_expected.reindex(columns=['share', 'log_return'])

    # this is the equivalency
    inner_merged = merge(household.reset_index(),
                         log_return.reset_index(),
                         on=['asset_id'], how='inner')
    inner_result = inner_merged.set_index(['household_id', 'asset_id', 't'])
    tm.assert_frame_equal(inner_result, inner_expected)

    # outer merge: unmatched holdings stay, with missing 't' / 'log_return'
    outer_data = {
        'household_id': [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
        'asset_id': ["nl0000301109", "nl0000301109", "gb00b03mlx29",
                     "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                     "gb00b03mlx29", "gb00b03mlx29", "lu0197800237",
                     "lu0197800237", "nl0000289965", None],
        't': [None, None, 233, 234, 235, 233, 234, 235, 180, 181,
              None, None],
        'share': [1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6,
                  0.25, 1.0],
        'log_return': [None, None, .09604978, -.06524096, .03532373,
                       .09604978, -.06524096, .03532373,
                       .03025441, .036997, None, None],
    }
    outer_expected = DataFrame(outer_data)
    outer_expected = outer_expected.set_index(
        ["household_id", "asset_id", "t"])
    outer_expected = outer_expected.reindex(columns=['share', 'log_return'])

    outer_merged = merge(household.reset_index(),
                         log_return.reset_index(),
                         on=['asset_id'], how='outer')
    outer_result = outer_merged.set_index(['household_id', 'asset_id', 't'])
    tm.assert_frame_equal(outer_result, outer_expected)
def test_join_multi_levels(self):
    """Join a single-level frame to a two-level frame via the shared name.

    Covers the inner join, its ``merge`` equivalent, the outer join, and two
    joins that must raise ``ValueError``.
    """
    # GH 3662
    # merge multi-levels
    household = (
        DataFrame(
            dict(household_id=[1, 2, 3],
                 male=[0, 1, 0],
                 wealth=[196087.3, 316478.7, 294750]),
            columns=['household_id', 'male', 'wealth'])
        .set_index('household_id'))
    portfolio = (
        DataFrame(
            dict(household_id=[1, 2, 2, 3, 3, 3, 4],
                 asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                           "gb00b03mlx29", "lu0197800237", "nl0000289965",
                           np.nan],
                 name=["ABN Amro", "Robeco", "Royal Dutch Shell",
                       "Royal Dutch Shell",
                       "AAB Eastern Europe Equity Fund",
                       "Postbank BioTech Fonds", np.nan],
                 share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
            columns=['household_id', 'asset_id', 'name', 'share'])
        .set_index(['household_id', 'asset_id']))

    result = household.join(portfolio, how='inner')
    expected = (
        DataFrame(
            dict(male=[0, 1, 1, 0, 0, 0],
                 wealth=[196087.3, 316478.7, 316478.7,
                         294750.0, 294750.0, 294750.0],
                 name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                       'Royal Dutch Shell',
                       'AAB Eastern Europe Equity Fund',
                       'Postbank BioTech Fonds'],
                 share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                 household_id=[1, 2, 2, 3, 3, 3],
                 asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
                           'gb00b03mlx29', 'lu0197800237',
                           'nl0000289965']))
        .set_index(['household_id', 'asset_id'])
        .reindex(columns=['male', 'wealth', 'name', 'share']))
    tm.assert_frame_equal(result, expected)

    # equivalency
    result = (merge(household.reset_index(), portfolio.reset_index(),
                    on=['household_id'], how='inner')
              .set_index(['household_id', 'asset_id']))
    tm.assert_frame_equal(result, expected)

    # outer join: the unmatched (4, NaN) portfolio row is appended and only
    # carries a value for 'share'
    result = household.join(portfolio, how='outer')
    expected = (concat([
        expected,
        (DataFrame(
            dict(share=[1.00]),
            index=MultiIndex.from_tuples(
                [(4, np.nan)],
                names=['household_id', 'asset_id'])))
    ], axis=0, sort=True).reindex(columns=expected.columns))
    tm.assert_frame_equal(result, expected)

    # invalid cases
    # Pin the expected messages so an unrelated ValueError cannot satisfy
    # the checks.
    household.index.name = 'foo'
    with pytest.raises(
            ValueError,
            match="cannot join with no overlapping index names"):
        household.join(portfolio, how='inner')

    portfolio2 = portfolio.copy()
    # NOTE(review): the value returned by set_names is discarded here, so
    # this statement presumably leaves portfolio2's index names untouched --
    # confirm whether an in-place rename was intended.
    portfolio2.index.set_names(['household_id', 'foo'])

    with pytest.raises(
            ValueError,
            match="columns overlap but no suffix specified"):
        portfolio2.join(portfolio, how='inner')
def test_join_multi_levels(self):
    """Join a single-level frame to a two-level frame via the shared name.

    Covers the inner join, its ``merge`` equivalent, the outer join, and two
    joins that must raise ``ValueError``.
    """
    # GH 3662
    # merge multi-levels
    index_names = ['household_id', 'asset_id']

    household = (
        DataFrame(
            dict(household_id=[1, 2, 3],
                 male=[0, 1, 0],
                 wealth=[196087.3, 316478.7, 294750]),
            columns=['household_id', 'male', 'wealth'])
        .set_index('household_id'))
    portfolio = (
        DataFrame(
            dict(household_id=[1, 2, 2, 3, 3, 3, 4],
                 asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                           "gb00b03mlx29", "lu0197800237", "nl0000289965",
                           np.nan],
                 name=["ABN Amro", "Robeco", "Royal Dutch Shell",
                       "Royal Dutch Shell",
                       "AAB Eastern Europe Equity Fund",
                       "Postbank BioTech Fonds", np.nan],
                 share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
            columns=['household_id', 'asset_id', 'name', 'share'])
        .set_index(index_names))

    result = household.join(portfolio, how='inner')
    expected = (
        DataFrame(
            dict(male=[0, 1, 1, 0, 0, 0],
                 wealth=[196087.3, 316478.7, 316478.7,
                         294750.0, 294750.0, 294750.0],
                 name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                       'Royal Dutch Shell',
                       'AAB Eastern Europe Equity Fund',
                       'Postbank BioTech Fonds'],
                 share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                 household_id=[1, 2, 2, 3, 3, 3],
                 asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
                           'gb00b03mlx29', 'lu0197800237',
                           'nl0000289965']))
        .set_index(index_names)
        .reindex(columns=['male', 'wealth', 'name', 'share']))
    tm.assert_frame_equal(result, expected)

    # equivalency
    result = (merge(household.reset_index(), portfolio.reset_index(),
                    on=['household_id'], how='inner')
              .set_index(index_names))
    tm.assert_frame_equal(result, expected)

    # outer join: the unmatched (4, NaN) portfolio row is appended and only
    # carries a value for 'share'
    result = household.join(portfolio, how='outer')
    unmatched = DataFrame(
        dict(share=[1.00]),
        index=MultiIndex.from_tuples([(4, np.nan)], names=index_names))
    expected = (concat([expected, unmatched], axis=0, sort=True)
                .reindex(columns=expected.columns))
    tm.assert_frame_equal(result, expected)

    # invalid cases
    # Context managers replace the re-defined ``f`` helper, and the pinned
    # messages stop an unrelated ValueError from satisfying the checks.
    household.index.name = 'foo'
    with pytest.raises(
            ValueError,
            match="cannot join with no overlapping index names"):
        household.join(portfolio, how='inner')

    portfolio2 = portfolio.copy()
    # NOTE(review): the value returned by set_names is discarded here, so
    # this statement presumably leaves portfolio2's index names untouched --
    # confirm whether an in-place rename was intended.
    portfolio2.index.set_names(['household_id', 'foo'])

    with pytest.raises(
            ValueError,
            match="columns overlap but no suffix specified"):
        portfolio2.join(portfolio, how='inner')