示例#1
0
    def testConcat(self):
        """Check metadata and chunk layout of ``concat`` results after tiling.

        Three scenarios are covered: concatenation along the index,
        concatenation along the columns with differently chunked inputs,
        and ``join='inner'`` on frames whose column sets differ.
        """
        raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        raw_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        # --- concat along the index, both inputs chunked by 4 ---
        left = from_pandas(raw_a, chunk_size=4)
        right = from_pandas(raw_b, chunk_size=4)
        stacked = concat([left, right], axis='index')

        self.assertEqual(stacked.shape, (20, 4))
        pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)

        stacked_tiled = stacked.tiles()
        self.assertEqual(stacked_tiled.nsplits, ((4, 4, 2, 4, 4, 2), (4, )))
        # every chunk is expected in column 0, numbered consecutively by row
        for row, chunk in enumerate(stacked_tiled.chunks):
            self.assertEqual(chunk.index, (row, 0))

        # --- concat along the columns, inputs chunked by 3 and by 4 ---
        left = from_pandas(raw_a, chunk_size=3)
        right = from_pandas(raw_b, chunk_size=4)
        widened = concat([left, right], axis='columns')

        self.assertEqual(widened.shape, (10, 8))
        pd.testing.assert_series_equal(
            widened.dtypes,
            pd.concat([raw_a, raw_b], axis='columns').dtypes)

        widened_tiled = widened.tiles()
        self.assertEqual(widened_tiled.nsplits, ((3, 3, 3, 1), (3, 1, 4)))
        # chunks are expected in row-major order over a grid 3 chunks wide
        for flat_pos, chunk in enumerate(widened_tiled.chunks):
            self.assertEqual(chunk.index, divmod(flat_pos, 3))

        # --- inner join: only the shared columns A, B, C are kept ---
        raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        raw_b = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
        inner = concat(
            [from_pandas(raw_a, chunk_size=3), from_pandas(raw_b, chunk_size=3)],
            join='inner')

        self.assertEqual(inner.shape, (20, 3))
        inner_tiled = inner.tiles()
        self.assertEqual(inner_tiled.nsplits,
                         ((3, 3, 3, 1, 3, 3, 3, 1), (3, )))
示例#2
0
def test_concat(setup):
    """Execute ``concat`` on chunked inputs and compare with ``pd.concat``.

    Exercises frame/frame, series/series and mixed frame/series inputs,
    along both axes, with ``ignore_index`` and with ``join='inner'``.
    """

    def _run(tileable, **extra_config):
        # Forward ``extra_config`` only when a check is overridden, so the
        # default path is a plain ``execute()`` call. A fresh dict is built
        # on every call.
        if extra_config:
            return tileable.execute(extra_config=extra_config).fetch()
        return tileable.execute().fetch()

    frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # identical chunk sizes, default arguments
    chunked_a = from_pandas(frame_a, chunk_size=3)
    chunked_b = from_pandas(frame_b, chunk_size=3)

    combined = concat([chunked_a, chunked_b])
    want = pd.concat([frame_a, frame_b])
    got = _run(combined)
    pd.testing.assert_frame_equal(want, got)

    # different chunk sizes together with ignore_index=True
    chunked_a = from_pandas(frame_a, chunk_size=2)
    chunked_b = from_pandas(frame_b, chunk_size=3)

    combined = concat([chunked_a, chunked_b], ignore_index=True)
    want = pd.concat([frame_a, frame_b], ignore_index=True)
    got = _run(combined, check_index_value=False)
    pd.testing.assert_frame_equal(want, got)

    # concatenation along axis=1
    chunked_a = from_pandas(frame_a, chunk_size=2)
    chunked_b = from_pandas(frame_b, chunk_size=3)

    combined = concat([chunked_a, chunked_b], axis=1)
    want = pd.concat([frame_a, frame_b], axis=1)
    got = _run(combined)
    pd.testing.assert_frame_equal(want, got)

    # more than two (multiple) dataframes
    combined = concat([chunked_a, chunked_b, chunked_a])
    want = pd.concat([frame_a, frame_b, frame_a])
    got = _run(combined)
    pd.testing.assert_frame_equal(want, got)

    frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    frame_b = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))

    chunked_a = from_pandas(frame_a, chunk_size=3)
    chunked_b = from_pandas(frame_b, chunk_size=3)

    # join='inner' on frames with different column sets
    combined = concat([chunked_a, chunked_b], join='inner')
    want = pd.concat([frame_a, frame_b], join='inner')
    got = _run(combined)
    pd.testing.assert_frame_equal(want, got)

    # series inputs
    ser_a = pd.Series(np.random.rand(10, ))
    ser_b = pd.Series(np.random.rand(10, ))

    chunked_ser_a = series_from_pandas(ser_a, chunk_size=3)
    chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

    combined = concat([chunked_ser_a, chunked_ser_b])
    want = pd.concat([ser_a, ser_b])
    got = _run(combined)
    pd.testing.assert_series_equal(got, want)

    # series with different chunk sizes and ignore_index=True
    chunked_ser_a = series_from_pandas(ser_a, chunk_size=4)
    chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

    combined = concat([chunked_ser_a, chunked_ser_b], ignore_index=True)
    want = pd.concat([ser_a, ser_b], ignore_index=True)
    got = _run(combined, check_index_value=False)
    pd.testing.assert_series_equal(got, want)

    # series along axis=1 (produces a frame)
    chunked_ser_a = series_from_pandas(ser_a, chunk_size=3)
    chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

    combined = concat([chunked_ser_a, chunked_ser_b], axis=1)
    want = pd.concat([ser_a, ser_b], axis=1)
    got = _run(combined, check_shape=False)
    pd.testing.assert_frame_equal(got, want)

    # dataframe followed by series
    combined = concat([chunked_a, chunked_ser_b], ignore_index=True)
    want = pd.concat([frame_a, ser_b], ignore_index=True)
    got = _run(combined, check_index_value=False)
    pd.testing.assert_frame_equal(got, want)

    # series followed by dataframe
    combined = concat([chunked_ser_a, chunked_b], ignore_index=True)
    want = pd.concat([ser_a, frame_b], ignore_index=True)
    got = _run(combined, check_index_value=False)
    pd.testing.assert_frame_equal(got, want)

    # dataframe followed by series, axis=1
    combined = concat([chunked_a, chunked_ser_b], axis=1)
    want = pd.concat([frame_a, ser_b], axis=1)
    got = _run(combined)
    pd.testing.assert_frame_equal(got, want)

    # series followed by dataframe, axis=1
    combined = concat([chunked_ser_a, chunked_b], axis=1)
    want = pd.concat([ser_a, frame_b], axis=1)
    got = _run(combined)
    pd.testing.assert_frame_equal(got, want)
示例#3
0
def test_concat():
    """Check metadata and chunk layout of ``concat`` results after tiling.

    Besides shape, dtypes and ``nsplits``, this verifies the ``index_value``
    reported for the result when the inputs carry a contiguous range index
    and when one of them carries a shuffled index.
    """
    raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # --- concat along the index, both inputs chunked by 4 ---
    chunked_a = from_pandas(raw_a, chunk_size=4)
    chunked_b = from_pandas(raw_b, chunk_size=4)
    stacked = concat([chunked_a, chunked_b], axis='index')

    assert stacked.shape == (20, 4)
    pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)

    stacked_tiled = tile(stacked)
    assert stacked_tiled.nsplits == ((4, 4, 2, 4, 4, 2), (4,))
    # every chunk is expected in column 0, numbered consecutively by row
    for row, chunk in enumerate(stacked_tiled.chunks):
        assert chunk.index == (row, 0)

    # --- second frame continues the first frame's range index (10..19) ---
    raw_cont = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                            index=pd.RangeIndex(10, 20))

    chunked_cont = from_pandas(raw_cont, chunk_size=4)
    stacked = concat([chunked_a, chunked_cont], axis='index')

    assert stacked.shape == (20, 4)
    pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)
    pd.testing.assert_index_equal(stacked.index_value.to_pandas(),
                                  pd.RangeIndex(20))

    # --- a shuffled index on either side (or both) ---
    raw_shuffled = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'),
                                index=np.random.permutation(np.arange(10)))

    chunked_shuffled = from_pandas(raw_shuffled, chunk_size=4)
    shuffled_cases = (
        [chunked_a, chunked_shuffled],
        [chunked_shuffled, chunked_a],
        [chunked_shuffled, chunked_shuffled],
    )
    for inputs in shuffled_cases:
        stacked = concat(inputs, axis='index')

        assert stacked.shape == (20, 4)
        pd.testing.assert_series_equal(stacked.dtypes, raw_a.dtypes)
        # the reported index value is expected to be an empty int64 index
        pd.testing.assert_index_equal(stacked.index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))

    # --- concat along the columns, inputs chunked by 3 and by 4 ---
    chunked_a = from_pandas(raw_a, chunk_size=3)
    chunked_b = from_pandas(raw_b, chunk_size=4)
    widened = concat([chunked_a, chunked_b], axis='columns')

    assert widened.shape == (10, 8)
    pd.testing.assert_series_equal(
        widened.dtypes,
        pd.concat([raw_a, raw_b], axis='columns').dtypes)

    widened_tiled = tile(widened)
    assert widened_tiled.nsplits == ((3, 3, 3, 1), (3, 1, 4))
    # chunks are expected in row-major order over a grid 3 chunks wide
    for flat_pos, chunk in enumerate(widened_tiled.chunks):
        assert chunk.index == divmod(flat_pos, 3)

    # --- inner join: only the shared columns A, B, C are kept ---
    raw_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw_b = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))
    inner = concat(
        [from_pandas(raw_a, chunk_size=3), from_pandas(raw_b, chunk_size=3)],
        join='inner')

    assert inner.shape == (20, 3)
    inner_tiled = tile(inner)
    assert inner_tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, ))
示例#4
0
    def testConcat(self):
        """Execute ``concat`` on chunked inputs and compare with ``pd.concat``.

        Two executors are used: ``self.executor`` and a local one built on a
        fresh session context. Which executor runs which case is kept
        exactly as in the original test.
        NOTE(review): presumably the ``ignore_index=True`` and mixed
        frame/series cases need the session-backed executor -- confirm.
        """
        session_executor = ExecutorForTest(storage=new_session().context)

        def _materialize(runner, tileable):
            # run the graph and return the single concatenated result
            return runner.execute_dataframe(tileable, concat=True)[0]

        frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        frame_b = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        # identical chunk sizes, default arguments
        chunked_a = from_pandas(frame_a, chunk_size=3)
        chunked_b = from_pandas(frame_b, chunk_size=3)

        combined = concat([chunked_a, chunked_b])
        want = pd.concat([frame_a, frame_b])
        got = _materialize(self.executor, combined)
        pd.testing.assert_frame_equal(want, got)

        # different chunk sizes together with ignore_index=True
        chunked_a = from_pandas(frame_a, chunk_size=2)
        chunked_b = from_pandas(frame_b, chunk_size=3)

        combined = concat([chunked_a, chunked_b], ignore_index=True)
        want = pd.concat([frame_a, frame_b], ignore_index=True)
        got = _materialize(session_executor, combined)
        pd.testing.assert_frame_equal(want, got)

        # concatenation along axis=1
        chunked_a = from_pandas(frame_a, chunk_size=2)
        chunked_b = from_pandas(frame_b, chunk_size=3)

        combined = concat([chunked_a, chunked_b], axis=1)
        want = pd.concat([frame_a, frame_b], axis=1)
        got = _materialize(self.executor, combined)
        pd.testing.assert_frame_equal(want, got)

        # more than two (multiple) dataframes
        combined = concat([chunked_a, chunked_b, chunked_a])
        want = pd.concat([frame_a, frame_b, frame_a])
        got = _materialize(self.executor, combined)
        pd.testing.assert_frame_equal(want, got)

        frame_a = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        frame_b = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))

        chunked_a = from_pandas(frame_a, chunk_size=3)
        chunked_b = from_pandas(frame_b, chunk_size=3)

        # join='inner' on frames with different column sets
        combined = concat([chunked_a, chunked_b], join='inner')
        want = pd.concat([frame_a, frame_b], join='inner')
        got = _materialize(self.executor, combined)
        pd.testing.assert_frame_equal(want, got)

        # series inputs
        ser_a = pd.Series(np.random.rand(10, ))
        ser_b = pd.Series(np.random.rand(10, ))

        chunked_ser_a = series_from_pandas(ser_a, chunk_size=3)
        chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

        combined = concat([chunked_ser_a, chunked_ser_b])
        want = pd.concat([ser_a, ser_b])
        got = _materialize(self.executor, combined)
        pd.testing.assert_series_equal(got, want)

        # series with different chunk sizes and ignore_index=True
        chunked_ser_a = series_from_pandas(ser_a, chunk_size=4)
        chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

        combined = concat([chunked_ser_a, chunked_ser_b], ignore_index=True)
        want = pd.concat([ser_a, ser_b], ignore_index=True)
        got = _materialize(session_executor, combined)
        pd.testing.assert_series_equal(got, want)

        # series along axis=1 (produces a frame)
        chunked_ser_a = series_from_pandas(ser_a, chunk_size=3)
        chunked_ser_b = series_from_pandas(ser_b, chunk_size=3)

        combined = concat([chunked_ser_a, chunked_ser_b], axis=1)
        want = pd.concat([ser_a, ser_b], axis=1)
        got = _materialize(self.executor, combined)
        pd.testing.assert_frame_equal(got, want)

        # dataframe followed by series
        combined = concat([chunked_a, chunked_ser_b], ignore_index=True)
        want = pd.concat([frame_a, ser_b], ignore_index=True)
        got = _materialize(session_executor, combined)
        pd.testing.assert_frame_equal(got, want)

        # series followed by dataframe
        combined = concat([chunked_ser_a, chunked_b], ignore_index=True)
        want = pd.concat([ser_a, frame_b], ignore_index=True)
        got = _materialize(session_executor, combined)
        pd.testing.assert_frame_equal(got, want)

        # dataframe followed by series, axis=1
        combined = concat([chunked_a, chunked_ser_b], axis=1)
        want = pd.concat([frame_a, ser_b], axis=1)
        got = _materialize(session_executor, combined)
        pd.testing.assert_frame_equal(got, want)

        # series followed by dataframe, axis=1
        combined = concat([chunked_ser_a, chunked_b], axis=1)
        want = pd.concat([ser_a, frame_b], axis=1)
        got = _materialize(session_executor, combined)
        pd.testing.assert_frame_equal(got, want)