예제 #1
0
    def test_compress_group_combinations(self):
        """Run an outer merge over two frames built from 10000 tiled keys.

        Nothing is asserted about the merged frame; the call only has to
        complete.
        """
        # ~ 40000000 possible unique groups
        base_keys = tm.rands_array(10, 10000)
        keys_fwd = np.tile(base_keys, 2)
        keys_rev = keys_fwd[::-1]

        full = DataFrame({
            'key1': keys_fwd,
            'key2': keys_rev,
            'value1': np.random.randn(20000),
        })

        # every second row of the key arrays, with its own value column
        every_other = DataFrame({
            'key1': keys_fwd[::2],
            'key2': keys_rev[::2],
            'value2': np.random.randn(10000),
        })

        # just to hit the label compression code path
        merge(full, every_other, how='outer')
예제 #2
0
    def test_left_merge_na_buglet(self):
        left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                          'v2': randn(5), 'dummy': list('abcde'),
                          'v3': randn(5)},
                         columns=['id', 'v1', 'v2', 'dummy', 'v3'])
        right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
                           'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})

        result = merge(left, right, on='id', how='left')

        rdf = right.drop(['id'], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(result, expected)
예제 #3
0
    def test_left_join_index_multi_match(self):
        left = DataFrame([
            ['c', 0],
            ['b', 1],
            ['a', 2],
            ['b', 3]],
            columns=['tag', 'val'],
            index=[2, 0, 1, 3])

        right = (DataFrame([
            ['a', 'v'],
            ['c', 'w'],
            ['c', 'x'],
            ['d', 'y'],
            ['a', 'z'],
            ['c', 'r'],
            ['e', 'q'],
            ['c', 's']],
            columns=['tag', 'char'])
            .set_index('tag'))

        result = left.join(right, on='tag', how='left')

        expected = DataFrame([
            ['c', 0, 'w'],
            ['c', 0, 'x'],
            ['c', 0, 'r'],
            ['c', 0, 's'],
            ['b', 1, nan],
            ['a', 2, 'v'],
            ['a', 2, 'z'],
            ['b', 3, nan]],
            columns=['tag', 'val', 'char'],
            index=[2, 2, 2, 2, 0, 1, 1, 3])

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on='tag', how='left', sort=True)
        expected2 = expected.sort_values('tag', kind='mergesort')

        tm.assert_frame_equal(result, expected2)

        # GH7331 - maintain left frame order in left merge
        result = merge(left, right.reset_index(), how='left', on='tag')
        expected.index = np.arange(len(expected))
        tm.assert_frame_equal(result, expected)
예제 #4
0
        def run_asserts(left, right, sort):
            """Left-join ``right`` onto ``left`` over ``icols`` and check it.

            ``icols`` and ``bind_cols`` are not defined here; they are
            resolved from an outer scope.  ``sort`` is forwarded to both the
            join and the equivalent ``merge`` call.
            """
            joined = left.join(right, on=icols, how='left', sort=sort)

            # the join must not drop rows, and both value columns are filled
            assert len(left) < len(joined) + 1
            for column in ('4th', '5th'):
                assert not joined[column].isna().any()

            negated_fifth = - joined['5th']
            tm.assert_series_equal(
                joined['4th'], negated_fifth, check_names=False)
            bound = bind_cols(joined.iloc[:, :-2])
            tm.assert_series_equal(joined['4th'], bound, check_names=False)
            assert bound.name is None

            if sort:
                # a sorted join must already be in stable-sorted key order
                resorted = joined.sort_values(icols, kind='mergesort')
                tm.assert_frame_equal(joined, resorted)

            merged = merge(left, right.reset_index(), on=icols,
                           sort=sort, how='left')

            # compare against the merge result on a 0..n-1 integer index
            joined.index = np.arange(len(joined))
            tm.assert_frame_equal(merged, joined)
예제 #5
0
        def run_asserts(left, right, sort):
            """Check a left join over ``icols`` against the matching merge.

            ``icols`` and ``bind_cols`` come from an outer scope that is not
            part of this function.  ``sort`` is passed through to both
            ``DataFrame.join`` and ``merge``.
            """
            joined = left.join(right, on=icols, how="left", sort=sort)

            # no rows lost, and no missing values in either value column
            assert len(left) < len(joined) + 1
            assert not joined["4th"].isna().any()
            assert not joined["5th"].isna().any()

            negated_fifth = -joined["5th"]
            tm.assert_series_equal(
                joined["4th"], negated_fifth, check_names=False
            )
            bound = bind_cols(joined.iloc[:, :-2])
            tm.assert_series_equal(joined["4th"], bound, check_names=False)
            assert bound.name is None

            if sort:
                # a sorted join must equal its own stable sort by the keys
                resorted = joined.sort_values(icols, kind="mergesort")
                tm.assert_frame_equal(joined, resorted)

            flat_right = right.reset_index()
            merged = merge(left, flat_right, on=icols, sort=sort, how="left")

            # the merge result is compared on a 0..n-1 integer index
            joined.index = np.arange(len(joined))
            tm.assert_frame_equal(merged, joined)
예제 #6
0
    def test_join_multi_levels2(self):
        """Merge two flattened MultiIndex frames on 'asset_id' (GH6360).

        ``household`` and ``log_return`` are built with a MultiIndex,
        flattened with ``reset_index`` and merged on 'asset_id', first with
        ``how="inner"`` and then with ``how="outer"``.  Each result is
        indexed by household, asset and ``t`` and compared with a
        hand-written frame.
        """
        # some more advanced merges
        # GH6360
        gb_asset = "gb00b03mlx29"
        lu_asset = "lu0197800237"
        gb_t, lu_t = [233, 234, 235], [180, 181]
        gb_returns = [0.09604978, -0.06524096, 0.03532373]
        lu_returns = [0.03025441, 0.036997]
        full_index = ["household_id", "asset_id", "t"]
        value_columns = ["share", "log_return"]

        household = DataFrame(
            {
                "household_id": [1, 2, 2, 3, 3, 3, 4],
                "asset_id": ["nl0000301109", "nl0000301109", gb_asset,
                             gb_asset, lu_asset, "nl0000289965", np.nan],
                "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            },
            columns=["household_id", "asset_id", "share"],
        )
        household = household.set_index(["household_id", "asset_id"])

        log_return = DataFrame({
            "asset_id": [gb_asset] * 3 + [lu_asset] * 2,
            "t": gb_t + lu_t,
            "log_return": gb_returns + lu_returns,
        })
        log_return = log_return.set_index(["asset_id", "t"])

        # both merges below operate on the flattened frames
        household_flat = household.reset_index()
        log_return_flat = log_return.reset_index()

        expected_inner = DataFrame({
            "household_id": [2] * 3 + [3] * 5,
            "asset_id": [gb_asset] * 6 + [lu_asset] * 2,
            "t": gb_t * 2 + lu_t,
            "share": [0.6] * 3 + [0.15] * 3 + [0.6] * 2,
            "log_return": gb_returns * 2 + lu_returns,
        })
        expected_inner = expected_inner.set_index(full_index)
        expected_inner = expected_inner.reindex(columns=value_columns)

        # this is the equivalency
        inner = merge(household_flat, log_return_flat,
                      on=["asset_id"], how="inner")
        tm.assert_frame_equal(inner.set_index(full_index), expected_inner)

        expected_outer = DataFrame({
            "household_id": [1] + [2] * 4 + [3] * 6 + [4],
            "asset_id": (["nl0000301109"] * 2 + [gb_asset] * 6
                         + [lu_asset] * 2 + ["nl0000289965", None]),
            "t": [None] * 2 + gb_t * 2 + lu_t + [None] * 2,
            "share": ([1.0, 0.4] + [0.6] * 3 + [0.15] * 3
                      + [0.6] * 2 + [0.25, 1.0]),
            "log_return": ([None] * 2 + gb_returns * 2 + lu_returns
                           + [None] * 2),
        })
        expected_outer = expected_outer.set_index(full_index)
        expected_outer = expected_outer.reindex(columns=value_columns)

        outer = merge(household_flat, log_return_flat,
                      on=["asset_id"], how="outer")

        tm.assert_frame_equal(outer.set_index(full_index), expected_outer)
예제 #7
0
    def test_join_multi_levels(self):
        """Join a single-level frame with a two-level MultiIndex frame.

        Covers the inner join, the equivalent ``merge`` on 'household_id',
        the outer join, and two joins expected to raise ``ValueError``
        (GH 3662).
        """
        # GH 3662
        # merge multi-levels
        portfolio_index = ["household_id", "asset_id"]
        value_columns = ["male", "wealth", "name", "share"]

        # rows that have a matching household; portfolio adds one more row
        matched_ids = [1, 2, 2, 3, 3, 3]
        matched_assets = [
            "nl0000301109",
            "nl0000289783",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "lu0197800237",
            "nl0000289965",
        ]
        matched_names = [
            "ABN Amro",
            "Robeco",
            "Royal Dutch Shell",
            "Royal Dutch Shell",
            "AAB Eastern Europe Equity Fund",
            "Postbank BioTech Fonds",
        ]
        matched_shares = [1.0, 0.4, 0.6, 0.15, 0.6, 0.25]

        household = DataFrame(
            {
                "household_id": [1, 2, 3],
                "male": [0, 1, 0],
                "wealth": [196087.3, 316478.7, 294750],
            },
            columns=["household_id", "male", "wealth"],
        )
        household = household.set_index("household_id")

        portfolio = DataFrame(
            {
                "household_id": matched_ids + [4],
                "asset_id": matched_assets + [np.nan],
                "name": matched_names + [np.nan],
                "share": matched_shares + [1.0],
            },
            columns=["household_id", "asset_id", "name", "share"],
        )
        portfolio = portfolio.set_index(portfolio_index)

        joined_inner = household.join(portfolio, how="inner")
        expected_inner = DataFrame({
            "male": [0, 1, 1, 0, 0, 0],
            "wealth": [196087.3] + [316478.7] * 2 + [294750.0] * 3,
            "name": matched_names,
            "share": matched_shares,
            "household_id": matched_ids,
            "asset_id": matched_assets,
        })
        expected_inner = expected_inner.set_index(portfolio_index)
        expected_inner = expected_inner.reindex(columns=value_columns)
        tm.assert_frame_equal(joined_inner, expected_inner)

        # equivalency
        merged_inner = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        )
        merged_inner = merged_inner.set_index(portfolio_index)
        tm.assert_frame_equal(merged_inner, expected_inner)

        joined_outer = household.join(portfolio, how="outer")
        # the outer join additionally keeps the household-4 portfolio row
        unmatched_row = DataFrame(
            {"share": [1.00]},
            index=MultiIndex.from_tuples(
                [(4, np.nan)], names=["household_id", "asset_id"]),
        )
        expected_outer = concat(
            [expected_inner, unmatched_row], axis=0, sort=True)
        expected_outer = expected_outer.reindex(
            columns=expected_inner.columns)
        tm.assert_frame_equal(joined_outer, expected_outer)

        # invalid cases
        household.index.name = "foo"

        no_overlap_msg = "cannot join with no overlapping index names"
        with pytest.raises(ValueError, match=no_overlap_msg):
            household.join(portfolio, how="inner")

        portfolio2 = portfolio.copy()
        # NOTE(review): the value returned by set_names is discarded, so
        # portfolio2 appears to keep portfolio's index names - confirm
        # whether a rename was intended here.
        portfolio2.index.set_names(["household_id", "foo"])

        overlap_msg = "columns overlap but no suffix specified"
        with pytest.raises(ValueError, match=overlap_msg):
            portfolio2.join(portfolio, how="inner")
예제 #8
0
    def test_join_multi_levels2(self):
        """Merge two flattened MultiIndex frames on 'asset_id' (GH6360).

        The inner and the outer merge of the flattened ``household`` and
        ``log_return`` frames are each re-indexed by household, asset and
        ``t`` and compared with a hand-written frame.
        """
        # some more advanced merges
        # GH6360
        gb_asset, lu_asset = 'gb00b03mlx29', 'lu0197800237'
        gb_t, lu_t = [233, 234, 235], [180, 181]
        gb_returns = [.09604978, -.06524096, .03532373]
        lu_returns = [.03025441, .036997]
        full_index = ['household_id', 'asset_id', 't']
        value_columns = ['share', 'log_return']

        household = DataFrame(
            {'household_id': [1, 2, 2, 3, 3, 3, 4],
             'asset_id': ['nl0000301109', 'nl0000301109', gb_asset,
                          gb_asset, lu_asset, 'nl0000289965', np.nan],
             'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]},
            columns=['household_id', 'asset_id', 'share'])
        household = household.set_index(['household_id', 'asset_id'])

        log_return = DataFrame(
            {'asset_id': [gb_asset] * 3 + [lu_asset] * 2,
             't': gb_t + lu_t,
             'log_return': gb_returns + lu_returns})
        log_return = log_return.set_index(['asset_id', 't'])

        # both merges below operate on the flattened frames
        household_flat = household.reset_index()
        log_return_flat = log_return.reset_index()

        expected_inner = DataFrame(
            {'household_id': [2] * 3 + [3] * 5,
             'asset_id': [gb_asset] * 6 + [lu_asset] * 2,
             't': gb_t * 2 + lu_t,
             'share': [0.6] * 3 + [0.15] * 3 + [0.6] * 2,
             'log_return': gb_returns * 2 + lu_returns})
        expected_inner = expected_inner.set_index(full_index)
        expected_inner = expected_inner.reindex(columns=value_columns)

        # this is the equivalency
        inner = merge(household_flat, log_return_flat,
                      on=['asset_id'], how='inner')
        tm.assert_frame_equal(inner.set_index(full_index), expected_inner)

        expected_outer = DataFrame(
            {'household_id': [1] + [2] * 4 + [3] * 6 + [4],
             'asset_id': (['nl0000301109'] * 2 + [gb_asset] * 6
                          + [lu_asset] * 2 + ['nl0000289965', None]),
             't': [None] * 2 + gb_t * 2 + lu_t + [None] * 2,
             'share': ([1.0, 0.4] + [0.6] * 3 + [0.15] * 3
                       + [0.6] * 2 + [0.25, 1.0]),
             'log_return': ([None] * 2 + gb_returns * 2 + lu_returns
                            + [None] * 2)})
        expected_outer = expected_outer.set_index(full_index)
        expected_outer = expected_outer.reindex(columns=value_columns)

        outer = merge(household_flat, log_return_flat,
                      on=['asset_id'], how='outer')

        tm.assert_frame_equal(outer.set_index(full_index), expected_outer)
예제 #9
0
    def test_join_multi_levels(self):
        """Join a single-level frame with a two-level MultiIndex frame.

        Covers the inner join, the equivalent ``merge`` on 'household_id',
        the outer join, and two joins expected to raise ``ValueError``
        (GH 3662).  The expected error messages are pinned with ``match`` so
        that an unrelated ``ValueError`` cannot satisfy the test.
        """
        # GH 3662
        # merge multi-levels
        household = (
            DataFrame(
                {'household_id': [1, 2, 3],
                 'male': [0, 1, 0],
                 'wealth': [196087.3, 316478.7, 294750]},
                columns=['household_id', 'male', 'wealth'])
            .set_index('household_id'))
        portfolio = (
            DataFrame(
                {'household_id': [1, 2, 2, 3, 3, 3, 4],
                 'asset_id': ["nl0000301109", "nl0000289783",
                              "gb00b03mlx29", "gb00b03mlx29",
                              "lu0197800237", "nl0000289965",
                              np.nan],
                 'name': ["ABN Amro", "Robeco", "Royal Dutch Shell",
                          "Royal Dutch Shell",
                          "AAB Eastern Europe Equity Fund",
                          "Postbank BioTech Fonds", np.nan],
                 'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]},
                columns=['household_id', 'asset_id', 'name', 'share'])
            .set_index(['household_id', 'asset_id']))
        result = household.join(portfolio, how='inner')
        expected = (
            DataFrame(
                {'male': [0, 1, 1, 0, 0, 0],
                 'wealth': [196087.3, 316478.7, 316478.7,
                            294750.0, 294750.0, 294750.0],
                 'name': ['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                          'Royal Dutch Shell',
                          'AAB Eastern Europe Equity Fund',
                          'Postbank BioTech Fonds'],
                 'share': [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                 'household_id': [1, 2, 2, 3, 3, 3],
                 'asset_id': ['nl0000301109', 'nl0000289783',
                              'gb00b03mlx29', 'gb00b03mlx29',
                              'lu0197800237', 'nl0000289965']})
            .set_index(['household_id', 'asset_id'])
            .reindex(columns=['male', 'wealth', 'name', 'share']))
        tm.assert_frame_equal(result, expected)

        # equivalency
        result = (merge(household.reset_index(), portfolio.reset_index(),
                        on=['household_id'], how='inner')
                  .set_index(['household_id', 'asset_id']))
        tm.assert_frame_equal(result, expected)

        result = household.join(portfolio, how='outer')
        # the outer join additionally keeps the household-4 portfolio row
        unmatched_row = DataFrame(
            {'share': [1.00]},
            index=MultiIndex.from_tuples(
                [(4, np.nan)],
                names=['household_id', 'asset_id']))
        expected = (concat([expected, unmatched_row], axis=0, sort=True)
                    .reindex(columns=expected.columns))
        tm.assert_frame_equal(result, expected)

        # invalid cases
        household.index.name = 'foo'

        with pytest.raises(
                ValueError,
                match="cannot join with no overlapping index names"):
            household.join(portfolio, how='inner')

        portfolio2 = portfolio.copy()
        # NOTE(review): the value returned by set_names is discarded, so
        # portfolio2 appears to keep portfolio's index names - confirm
        # whether a rename was intended here.
        portfolio2.index.set_names(['household_id', 'foo'])

        with pytest.raises(ValueError,
                           match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how='inner')
예제 #10
0
    def test_join_multi_levels2(self):
        """Merge two flattened MultiIndex frames on 'asset_id' (GH6360).

        The inner and the outer merge of the flattened ``household`` and
        ``log_return`` frames are each re-indexed by household, asset and
        ``t`` and compared with a hand-written frame.
        """
        # some more advanced merges
        # GH6360
        gb_asset, lu_asset = 'gb00b03mlx29', 'lu0197800237'
        gb_t, lu_t = [233, 234, 235], [180, 181]
        gb_returns = [.09604978, -.06524096, .03532373]
        lu_returns = [.03025441, .036997]
        full_index = ['household_id', 'asset_id', 't']
        value_columns = ['share', 'log_return']

        household = DataFrame(
            {'household_id': [1, 2, 2, 3, 3, 3, 4],
             'asset_id': ['nl0000301109', 'nl0000301109', gb_asset,
                          gb_asset, lu_asset, 'nl0000289965', np.nan],
             'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]},
            columns=['household_id', 'asset_id', 'share'])
        household = household.set_index(['household_id', 'asset_id'])

        log_return = DataFrame(
            {'asset_id': [gb_asset] * 3 + [lu_asset] * 2,
             't': gb_t + lu_t,
             'log_return': gb_returns + lu_returns})
        log_return = log_return.set_index(['asset_id', 't'])

        # both merges below operate on the flattened frames
        household_flat = household.reset_index()
        log_return_flat = log_return.reset_index()

        expected_inner = DataFrame(
            {'household_id': [2] * 3 + [3] * 5,
             'asset_id': [gb_asset] * 6 + [lu_asset] * 2,
             't': gb_t * 2 + lu_t,
             'share': [0.6] * 3 + [0.15] * 3 + [0.6] * 2,
             'log_return': gb_returns * 2 + lu_returns})
        expected_inner = expected_inner.set_index(full_index)
        expected_inner = expected_inner.reindex(columns=value_columns)

        # this is the equivalency
        inner = merge(household_flat, log_return_flat,
                      on=['asset_id'], how='inner')
        tm.assert_frame_equal(inner.set_index(full_index), expected_inner)

        expected_outer = DataFrame(
            {'household_id': [1] + [2] * 4 + [3] * 6 + [4],
             'asset_id': (['nl0000301109'] * 2 + [gb_asset] * 6
                          + [lu_asset] * 2 + ['nl0000289965', None]),
             't': [None] * 2 + gb_t * 2 + lu_t + [None] * 2,
             'share': ([1.0, 0.4] + [0.6] * 3 + [0.15] * 3
                       + [0.6] * 2 + [0.25, 1.0]),
             'log_return': ([None] * 2 + gb_returns * 2 + lu_returns
                            + [None] * 2)})
        expected_outer = expected_outer.set_index(full_index)
        expected_outer = expected_outer.reindex(columns=value_columns)

        outer = merge(household_flat, log_return_flat,
                      on=['asset_id'], how='outer')

        tm.assert_frame_equal(outer.set_index(full_index), expected_outer)
예제 #11
0
    def test_join_multi_levels(self):
        """Join a single-level frame with a two-level MultiIndex frame.

        Covers the inner join, the equivalent ``merge`` on 'household_id',
        the outer join, and two joins expected to raise ``ValueError``
        (GH 3662).  The failing joins use ``pytest.raises`` as a context
        manager with ``match`` so the expected message is checked as well.
        """
        # GH 3662
        # merge multi-levels
        household = (
            DataFrame(
                {'household_id': [1, 2, 3],
                 'male': [0, 1, 0],
                 'wealth': [196087.3, 316478.7, 294750]},
                columns=['household_id', 'male', 'wealth'])
            .set_index('household_id'))
        portfolio = (
            DataFrame(
                {'household_id': [1, 2, 2, 3, 3, 3, 4],
                 'asset_id': ["nl0000301109", "nl0000289783",
                              "gb00b03mlx29", "gb00b03mlx29",
                              "lu0197800237", "nl0000289965",
                              np.nan],
                 'name': ["ABN Amro", "Robeco", "Royal Dutch Shell",
                          "Royal Dutch Shell",
                          "AAB Eastern Europe Equity Fund",
                          "Postbank BioTech Fonds", np.nan],
                 'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]},
                columns=['household_id', 'asset_id', 'name', 'share'])
            .set_index(['household_id', 'asset_id']))
        result = household.join(portfolio, how='inner')
        expected = (
            DataFrame(
                {'male': [0, 1, 1, 0, 0, 0],
                 'wealth': [196087.3, 316478.7, 316478.7,
                            294750.0, 294750.0, 294750.0],
                 'name': ['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                          'Royal Dutch Shell',
                          'AAB Eastern Europe Equity Fund',
                          'Postbank BioTech Fonds'],
                 'share': [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                 'household_id': [1, 2, 2, 3, 3, 3],
                 'asset_id': ['nl0000301109', 'nl0000289783',
                              'gb00b03mlx29', 'gb00b03mlx29',
                              'lu0197800237', 'nl0000289965']})
            .set_index(['household_id', 'asset_id'])
            .reindex(columns=['male', 'wealth', 'name', 'share']))
        tm.assert_frame_equal(result, expected)

        # equivalency
        result = (merge(household.reset_index(), portfolio.reset_index(),
                        on=['household_id'], how='inner')
                  .set_index(['household_id', 'asset_id']))
        tm.assert_frame_equal(result, expected)

        result = household.join(portfolio, how='outer')
        # the outer join additionally keeps the household-4 portfolio row
        unmatched_row = DataFrame(
            {'share': [1.00]},
            index=MultiIndex.from_tuples(
                [(4, np.nan)],
                names=['household_id', 'asset_id']))
        expected = (concat([expected, unmatched_row], axis=0, sort=True)
                    .reindex(columns=expected.columns))
        tm.assert_frame_equal(result, expected)

        # invalid cases
        household.index.name = 'foo'

        with pytest.raises(
                ValueError,
                match="cannot join with no overlapping index names"):
            household.join(portfolio, how='inner')

        portfolio2 = portfolio.copy()
        # NOTE(review): the value returned by set_names is discarded, so
        # portfolio2 appears to keep portfolio's index names - confirm
        # whether a rename was intended here.
        portfolio2.index.set_names(['household_id', 'foo'])

        with pytest.raises(ValueError,
                           match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how='inner')