예제 #1
0
    def testGroupBy(self):
        """Check that executed Mars groupby objects match pandas groupby.

        Exercises DataFrames built without an explicit index, with string
        index labels and with a MultiIndex (grouped by level 0), followed by
        Series grouped through a callable applied to the index labels.
        """

        def run(mars_groupby):
            # Execute on the test executor and keep the first item of the
            # returned result.
            return self.executor.execute_dataframe(mars_groupby,
                                                   concat=True)[0]

        columns = {
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
        }
        values = [3, 4, 5, 3, 5, 4, 1, 2, 3]

        # DataFrame without an explicit index, grouped by a column.
        plain = pd.DataFrame(columns)
        executed = run(md.DataFrame(plain, chunk_size=3).groupby('b'))
        assert_groupby_equal(executed, plain.groupby('b'))

        # DataFrame with string index labels, grouped by a column.
        labelled = pd.DataFrame(columns,
                                index=['i' + str(i) for i in range(9)])
        executed = run(md.DataFrame(labelled, chunk_size=3).groupby('b'))
        assert_groupby_equal(executed, labelled.groupby('b'))

        # DataFrame with a two-level index, grouped by the first level.
        nested = pd.DataFrame(
            columns,
            index=pd.MultiIndex.from_tuples(
                [(i % 3, 'i' + str(i)) for i in range(9)]))
        executed = run(md.DataFrame(nested, chunk_size=3).groupby(level=0))
        assert_groupby_equal(executed, nested.groupby(level=0))

        # Series without an explicit index, grouped by a function of the
        # label.
        plain_series = pd.Series(values)
        executed = run(
            md.Series(plain_series, chunk_size=3).groupby(lambda x: x % 3))
        assert_groupby_equal(executed,
                             plain_series.groupby(lambda x: x % 3))

        # Series with string labels; the function parses the numeric suffix.
        labelled_series = pd.Series(values,
                                    index=['i' + str(i) for i in range(9)])
        executed = run(
            md.Series(labelled_series,
                      chunk_size=3).groupby(lambda x: int(x[1:]) % 3))
        assert_groupby_equal(
            executed, labelled_series.groupby(lambda x: int(x[1:]) % 3))
예제 #2
0
    def testDataSerialize(self):
        """Round-trip a variety of objects through ``dataserializer``.

        Dense arrays, a NumPy scalar, a structured array, file-based
        dump/load, sparse containers (only when ``sps`` is truthy), wrapped
        groupby objects, a pandas Categorical and an IntervalIndex are each
        serialized and restored, then compared with the original.
        """
        compress_types = dataserializer.CompressType
        # Keyword sets passed to dumps()/dump(): no explicit compression
        # first, then LZ4 and GZIP.
        dump_options = (
            {},
            {'compress': compress_types.LZ4},
            {'compress': compress_types.GZIP},
        )

        def roundtrip(obj, **options):
            # In-memory serialize followed by deserialize.
            return dataserializer.loads(dataserializer.dumps(obj, **options))

        array = np.random.rand(1000, 100)
        for options in dump_options:
            assert_array_equal(array, roundtrip(array, **options))

        # Same data shape, but restored through a file-like object.
        array = np.random.rand(1000, 100)
        for options in dump_options:
            assert_array_equal(
                array,
                dataserializer.load(
                    BytesIO(dataserializer.dumps(array, **options))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        for options in dump_options:
            assert_array_equal(array, roundtrip(array, **options))

        array = np.float64(0.2345)
        for options in dump_options:
            assert_array_equal(array, roundtrip(array, **options))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100, ), dtype=rec_dtype)
        array_loaded = roundtrip(array)
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        # test dump/load against a real file; the file is removed afterwards
        # even if an assertion fails.
        fn = os.path.join(tempfile.gettempdir(),
                          'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            for options in dump_options:
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file, **options)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            for options in dump_options:
                des_mat = roundtrip(mat, **options)
                # No element may differ between original and restored data.
                self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            for options in dump_options:
                des_vector = roundtrip(vector, **options)
                self.assertTrue(
                    (vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        def check_groupby(grouped, touch_indices=False):
            if touch_indices:
                # Access ``indices`` before serializing (presumably so a
                # cached value is present on the object -- TODO confirm).
                _ = grouped.indices
            restored = roundtrip(grouped)
            assert_groupby_equal(grouped, restored.groupby_obj)

        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
        })
        check_groupby(wrapped_groupby(df1, 'b'))
        check_groupby(wrapped_groupby(df1, 'b').c)
        check_groupby(wrapped_groupby(df1, 'b'), touch_indices=True)
        check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))
        check_groupby(wrapped_groupby(df1.b, lambda x: x % 2),
                      touch_indices=True)

        # test categorical
        samples = np.random.RandomState(0).random(10)
        cat = pd.cut(samples, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = roundtrip(cat)
        self.assertEqual(len(cat), len(des_cat))
        for original_item, restored_item in zip(cat, des_cat):
            np.testing.assert_equal(original_item, restored_item)

        # test IntervalIndex
        intervals = pd.interval_range(10, 100, 3)
        restored_intervals = roundtrip(intervals)
        pd.testing.assert_index_equal(intervals, restored_intervals)
예제 #3
0
파일: test_lib.py 프로젝트: tomzhang/mars-1
    def testGroupByWrapper(self):
        """Check ``GroupByWrapper`` objects rebuilt via ``from_tuple``.

        Each case wraps a groupby, converts it with ``to_tuple`` (optionally
        with ``truncate`` or ``pickle_function``), rebuilds it and compares
        the result, its ``shape`` and its ``is_frame`` flag against the plain
        pandas groupby.
        """
        df = pd.DataFrame(
            {
                'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                'B':
                ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                'C': np.random.randn(8),
                'D': np.random.randn(8),
            },
            index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

        def rebuild(wrapped, **tuple_options):
            # Convert to a tuple and construct a fresh wrapper from it.
            return GroupByWrapper.from_tuple(
                wrapped.to_tuple(**tuple_options))

        def verify(grouped, expected, shape, is_frame, **compare_options):
            # Shared assertions applied to every rebuilt wrapper.
            assert_groupby_equal(grouped, expected, **compare_options)
            self.assertEqual(grouped.shape, shape)
            if is_frame:
                self.assertTrue(grouped.is_frame)
            else:
                self.assertFalse(grouped.is_frame)

        # Grouping by index level; also compare reported sizes of the
        # wrapper against the size of the wrapped groupby object.
        grouped = rebuild(wrapped_groupby(df, level=0))
        verify(grouped, df.groupby(level=0), (8, 4), True)
        self.assertGreater(sys.getsizeof(grouped),
                           sys.getsizeof(grouped.groupby_obj))
        self.assertGreater(calc_data_size(grouped),
                           sys.getsizeof(grouped.groupby_obj))

        verify(rebuild(wrapped_groupby(df, level=0).C),
               df.groupby(level=0).C, (8, ), False)

        # Grouping by a single column.
        verify(rebuild(wrapped_groupby(df, 'B')),
               df.groupby('B'), (8, 4), True)

        verify(rebuild(wrapped_groupby(df, 'B').C, truncate=True),
               df.groupby('B').C, (8, ), False,
               with_selection=True)

        verify(rebuild(wrapped_groupby(df, 'B')[['C', 'D']], truncate=True),
               df.groupby('B')[['C', 'D']], (8, 2), True,
               with_selection=True)

        # Grouping by several columns.
        verify(rebuild(wrapped_groupby(df, ['B', 'C']), truncate=True),
               df.groupby(['B', 'C']), (8, 4), True)

        verify(rebuild(wrapped_groupby(df, ['B', 'C']).C, truncate=True),
               df.groupby(['B', 'C']).C, (8, ), False,
               with_selection=True)

        verify(rebuild(wrapped_groupby(df, ['B', 'C'])[['A', 'D']],
                       truncate=True),
               df.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
               with_selection=True)

        verify(rebuild(wrapped_groupby(df, ['B', 'C'])[['C', 'D']],
                       truncate=True),
               df.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
               with_selection=True)

        # Grouping by a function of the index tuple.
        verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2),
                       pickle_function=True),
               df.groupby(lambda x: x[-1] % 2), (8, 4), True,
               with_selection=True)

        verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2).C,
                       pickle_function=True),
               df.groupby(lambda x: x[-1] % 2).C, (8, ), False,
               with_selection=True)

        verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2)[['C', 'D']],
                       pickle_function=True),
               df.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2), True,
               with_selection=True)

        # Series grouped by a function of the index tuple.
        verify(rebuild(wrapped_groupby(df.B, lambda x: x[-1] % 2)),
               df.B.groupby(lambda x: x[-1] % 2), (8, ), False,
               with_selection=True)
예제 #4
0
def test_groupby(setup):
    """Compare executed Mars groupby objects with pandas on seeded data.

    ``setup`` is not referenced in the body; presumably it is a fixture that
    prepares the execution session (defined outside this view).
    """
    rng = np.random.RandomState(0)
    n_rows = 100
    columns = {
        'a': rng.randint(0, 10, size=(n_rows,)),
        'b': rng.randint(0, 10, size=(n_rows,)),
        'c': rng.choice(list('abcd'), size=(n_rows,)),
    }

    def fetch(mars_groupby):
        # Run the Mars computation and pull the result back locally.
        return mars_groupby.execute().fetch()

    # test groupby with DataFrames and RangeIndex
    range_df = pd.DataFrame(columns)
    mdf = md.DataFrame(range_df, chunk_size=13)
    assert_groupby_equal(fetch(mdf.groupby('b')), range_df.groupby('b'))

    # test groupby with string index with duplications
    dup_df = pd.DataFrame(columns,
                          index=['i' + str(i % 3) for i in range(n_rows)])
    mdf = md.DataFrame(dup_df, chunk_size=13)
    assert_groupby_equal(fetch(mdf.groupby('b')), dup_df.groupby('b'))

    # test groupby with DataFrames by series
    assert_groupby_equal(fetch(mdf.groupby(mdf['b'])),
                         dup_df.groupby(dup_df['b']))

    # test groupby with DataFrames by multiple series
    assert_groupby_equal(fetch(mdf.groupby(by=[mdf['b'], mdf['c']])),
                         dup_df.groupby(by=[dup_df['b'], dup_df['c']]))

    # test groupby with DataFrames with MultiIndex
    multi_df = pd.DataFrame(
        columns,
        index=pd.MultiIndex.from_tuples(
            [(i % 3, 'i' + str(i)) for i in range(n_rows)]))
    mdf = md.DataFrame(multi_df, chunk_size=13)
    assert_groupby_equal(fetch(mdf.groupby(level=0)),
                         multi_df.groupby(level=0))

    # test groupby with DataFrames by integer columns
    int_col_df = pd.DataFrame(list(columns.values())).T
    mdf = md.DataFrame(int_col_df, chunk_size=13)
    assert_groupby_equal(fetch(mdf.groupby(0)), int_col_df.groupby(0))

    # test groupby of a Series by a function of the index label
    plain_series = pd.Series(columns['a'])
    mars_series = md.Series(plain_series, chunk_size=13)
    assert_groupby_equal(fetch(mars_series.groupby(lambda x: x % 3)),
                         plain_series.groupby(lambda x: x % 3))

    # test groupby series
    assert_groupby_equal(fetch(mars_series.groupby(mars_series)),
                         plain_series.groupby(plain_series))

    # test groupby of a string-labelled Series by a function of the label
    labelled_series = pd.Series(columns['a'],
                                index=['i' + str(i) for i in range(n_rows)])
    mars_labelled = md.Series(labelled_series, chunk_size=13)
    assert_groupby_equal(
        fetch(mars_labelled.groupby(lambda x: int(x[1:]) % 3)),
        labelled_series.groupby(lambda x: int(x[1:]) % 3))
예제 #5
0
def test_groupby_getitem(setup):
    """Check column selection on Mars groupby objects against pandas.

    Covers list and attribute selections, followed by aggregation (with both
    ``'tree'`` and ``'shuffle'`` methods), ``apply``, ``transform`` and
    ``cumsum``, plus a case where the selected key is the integer ``0``.
    ``setup`` is not referenced in the body; presumably it is a fixture
    defined outside this view.
    """
    rng = np.random.RandomState(0)
    n_rows = 100
    raw = pd.DataFrame(
        {'a': rng.randint(0, 10, size=(n_rows,)),
         'b': rng.randint(0, 10, size=(n_rows,)),
         'c': rng.choice(list('abcd'), size=(n_rows,))},
        index=pd.MultiIndex.from_tuples(
            [(i % 3, 'i' + str(i)) for i in range(n_rows)]))
    mdf = md.DataFrame(raw, chunk_size=13)

    frame_equal = pd.testing.assert_frame_equal
    series_equal = pd.testing.assert_series_equal

    def fetch(tileable):
        # Run the Mars computation and pull the result back locally.
        return tileable.execute().fetch()

    def fetch_sorted(tileable):
        # Results are compared after sorting by index on both sides.
        return fetch(tileable).sort_index()

    def fetch_by_b(tileable):
        # as_index=False results are compared after sorting rows by 'b'.
        return fetch(tileable).sort_values('b', ignore_index=True)

    # --- list selection on a level-based groupby ---
    assert_groupby_equal(fetch(mdf.groupby(level=0)[['a', 'b']]),
                         raw.groupby(level=0)[['a', 'b']],
                         with_selection=True)

    for method in ('tree', 'shuffle'):
        frame_equal(
            fetch_sorted(mdf.groupby(level=0)[['a', 'b']].sum(method=method)),
            raw.groupby(level=0)[['a', 'b']].sum().sort_index())

    frame_equal(
        fetch_sorted(mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)),
        raw.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1).sort_index())

    # --- list selection on a column-based groupby ---
    assert_groupby_equal(fetch(mdf.groupby('b')[['a', 'b']]),
                         raw.groupby('b')[['a', 'b']],
                         with_selection=True)

    assert_groupby_equal(fetch(mdf.groupby('b')[['a', 'c']]),
                         raw.groupby('b')[['a', 'c']],
                         with_selection=True)

    for method in ('tree', 'shuffle'):
        frame_equal(
            fetch_sorted(mdf.groupby('b')[['a', 'b']].sum(method=method)),
            raw.groupby('b')[['a', 'b']].sum().sort_index())

        frame_equal(
            fetch_sorted(mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'],
                                                          method=method)),
            raw.groupby('b')[['a', 'b']].agg(['sum', 'count']).sort_index())

        frame_equal(
            fetch_sorted(mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'],
                                                          method=method)),
            raw.groupby('b')[['a', 'c']].agg(['sum', 'count']).sort_index())

    frame_equal(
        fetch_sorted(mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)),
        raw.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

    frame_equal(
        fetch_sorted(mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)),
        raw.groupby('b')[['a', 'b']].transform(lambda x: x + 1).sort_index())

    frame_equal(
        fetch_sorted(mdf.groupby('b')[['a', 'b']].cumsum()),
        raw.groupby('b')[['a', 'b']].cumsum().sort_index())

    # --- attribute selection of a single column ---
    assert_groupby_equal(fetch(mdf.groupby('b').a),
                         raw.groupby('b').a,
                         with_selection=True)

    for method in ('shuffle', 'tree'):
        series_equal(
            fetch_sorted(mdf.groupby('b').a.sum(method=method)),
            raw.groupby('b').a.sum().sort_index())

        frame_equal(
            fetch_sorted(mdf.groupby('b').a.agg(['sum', 'mean', 'var'],
                                                method=method)),
            raw.groupby('b').a.agg(['sum', 'mean', 'var']).sort_index())

        frame_equal(
            fetch_by_b(mdf.groupby('b', as_index=False).a.sum(method=method)),
            raw.groupby('b', as_index=False).a.sum()
            .sort_values('b', ignore_index=True))

        frame_equal(
            fetch_by_b(
                mdf.groupby('b', as_index=False).b.count(method=method)),
            raw.groupby('b', as_index=False).b.count()
            .sort_values('b', ignore_index=True))

        frame_equal(
            fetch_by_b(
                mdf.groupby('b', as_index=False).b.agg({'cnt': 'count'},
                                                       method=method)),
            raw.groupby('b', as_index=False).b.agg({'cnt': 'count'})
            .sort_values('b', ignore_index=True))

    series_equal(
        fetch_sorted(mdf.groupby('b').a.apply(lambda x: x + 1)),
        raw.groupby('b').a.apply(lambda x: x + 1).sort_index())

    series_equal(
        fetch_sorted(mdf.groupby('b').a.transform(lambda x: x + 1)),
        raw.groupby('b').a.transform(lambda x: x + 1).sort_index())

    series_equal(
        fetch_sorted(mdf.groupby('b').a.cumsum()),
        raw.groupby('b').a.cumsum().sort_index())

    # special test for selection key == 0
    raw = pd.DataFrame(rng.rand(n_rows, 10))
    raw[0] = 0
    mdf = md.DataFrame(raw, chunk_size=13)
    counted = mdf.groupby(0, as_index=False)[0].agg({'cnt': 'count'},
                                                    method='tree')
    frame_equal(fetch_sorted(counted),
                raw.groupby(0, as_index=False)[0].agg({'cnt': 'count'}))
예제 #6
0
    def testGroupByGetItem(self):
        """Check column selection on Mars groupby objects against pandas.

        Covers list and attribute selections on level-based and column-based
        groupbys, followed by ``sum``/``agg`` with ``method='tree'`` and by
        ``apply``, ``transform`` and ``cumsum``.
        """
        raw = pd.DataFrame(
            {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
            },
            index=pd.MultiIndex.from_tuples(
                [(i % 3, 'i' + str(i)) for i in range(9)]))
        mdf = md.DataFrame(raw, chunk_size=3)

        frame_equal = pd.testing.assert_frame_equal
        series_equal = pd.testing.assert_series_equal

        def run(tileable):
            # Execute on the test executor and keep the first item of the
            # returned result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def run_sorted(tileable):
            # Used where both sides are compared after sorting by index.
            return run(tileable).sort_index()

        # --- list selection on a level-based groupby ---
        assert_groupby_equal(run(mdf.groupby(level=0)[['a', 'b']]),
                             raw.groupby(level=0)[['a', 'b']],
                             with_selection=True)

        frame_equal(run(mdf.groupby(level=0)[['a', 'b']].sum(method='tree')),
                    raw.groupby(level=0)[['a', 'b']].sum())

        frame_equal(
            run_sorted(
                mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)),
            raw.groupby(level=0)[['a', 'b']].apply(
                lambda x: x + 1).sort_index())

        # --- list selection on a column-based groupby ---
        assert_groupby_equal(run(mdf.groupby('b')[['a', 'b']]),
                             raw.groupby('b')[['a', 'b']],
                             with_selection=True)

        assert_groupby_equal(run(mdf.groupby('b')[['a', 'c']]),
                             raw.groupby('b')[['a', 'c']],
                             with_selection=True)

        frame_equal(run(mdf.groupby('b')[['a', 'b']].sum(method='tree')),
                    raw.groupby('b')[['a', 'b']].sum())

        frame_equal(
            run(mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'],
                                                 method='tree')),
            raw.groupby('b')[['a', 'b']].agg(['sum', 'count']))

        frame_equal(
            run(mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'],
                                                 method='tree')),
            raw.groupby('b')[['a', 'c']].agg(['sum', 'count']))

        frame_equal(
            run_sorted(mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)),
            raw.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

        frame_equal(
            run_sorted(
                mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)),
            raw.groupby('b')[['a', 'b']].transform(
                lambda x: x + 1).sort_index())

        frame_equal(run_sorted(mdf.groupby('b')[['a', 'b']].cumsum()),
                    raw.groupby('b')[['a', 'b']].cumsum().sort_index())

        # --- attribute selection of a single column ---
        assert_groupby_equal(run(mdf.groupby('b').a),
                             raw.groupby('b').a,
                             with_selection=True)

        series_equal(run(mdf.groupby('b').a.sum(method='tree')),
                     raw.groupby('b').a.sum())

        frame_equal(
            run(mdf.groupby('b').a.agg(['sum', 'mean', 'var'],
                                       method='tree')),
            raw.groupby('b').a.agg(['sum', 'mean', 'var']))

        series_equal(
            run_sorted(mdf.groupby('b').a.apply(lambda x: x + 1)),
            raw.groupby('b').a.apply(lambda x: x + 1).sort_index())

        series_equal(
            run_sorted(mdf.groupby('b').a.transform(lambda x: x + 1)),
            raw.groupby('b').a.transform(lambda x: x + 1).sort_index())

        series_equal(run_sorted(mdf.groupby('b').a.cumsum()),
                     raw.groupby('b').a.cumsum().sort_index())
예제 #7
0
    def testDataSerialize(self):
        for type_, compress in itertools.product(
                (None,) + tuple(dataserializer.SerialType.__members__.values()),
                (None,) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

        # test non-serializable object
        if pyarrow:
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex
        s = complex(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        s = np.complex64(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        # test pickle
        d = ClassToPickle(dict(a=1, b='uvw'))
        dest_d = dataserializer.loads((dataserializer.dumps(d)))
        self.assertIs(type(d), type(dest_d))
        self.assertEqual(d.a, dest_d.a)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                           'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads((dataserializer.dumps(df)))
        pd.testing.assert_frame_equal(df, dest_df)
예제 #8
0
    def testGroupBy(self):
        """Compare executed groupby results with pandas on seeded random data.

        DataFrame cases: group by column name (default index and string
        index), by a series, by several series, by index level of a
        MultiIndex, and by an integer column label.  Series cases: group by a
        callable on the index and by the series itself.
        """
        n_rows = 100
        chunk = 13
        rng = np.random.RandomState(0)
        # The draw order a -> b -> c fixes the values generated from seed 0.
        col_a = rng.randint(0, 10, size=(n_rows, ))
        col_b = rng.randint(0, 10, size=(n_rows, ))
        col_c = rng.choice(list('abcd'), size=(n_rows, ))
        columns = {'a': col_a, 'b': col_b, 'c': col_c}
        labels = ['i' + str(k) for k in range(n_rows)]

        def run(mars_groupby):
            # Execute the groupby and take the first (concatenated) result.
            return self.executor.execute_dataframe(
                mars_groupby, concat=True)[0]

        # DataFrame with the default index, grouped by column name
        raw_plain = pd.DataFrame(columns)
        mars_plain = md.DataFrame(raw_plain, chunk_size=chunk)
        assert_groupby_equal(run(mars_plain.groupby('b')),
                             raw_plain.groupby('b'))

        # DataFrame with a string index, grouped by column name
        raw_labeled = pd.DataFrame(columns, index=labels)
        mars_labeled = md.DataFrame(raw_labeled, chunk_size=chunk)
        assert_groupby_equal(run(mars_labeled.groupby('b')),
                             raw_labeled.groupby('b'))

        # grouped by a single series
        assert_groupby_equal(run(mars_labeled.groupby(mars_labeled['b'])),
                             raw_labeled.groupby(raw_labeled['b']))

        # grouped by several series
        assert_groupby_equal(
            run(mars_labeled.groupby(by=[mars_labeled['b'],
                                         mars_labeled['c']])),
            raw_labeled.groupby(by=[raw_labeled['b'], raw_labeled['c']]))

        # DataFrame with a MultiIndex, grouped by the first index level
        multi_index = pd.MultiIndex.from_tuples(
            [(k % 3, label) for k, label in enumerate(labels)])
        raw_multi = pd.DataFrame(columns, index=multi_index)
        mars_multi = md.DataFrame(raw_multi, chunk_size=chunk)
        assert_groupby_equal(run(mars_multi.groupby(level=0)),
                             raw_multi.groupby(level=0))

        # DataFrame whose column labels are integers
        raw_int_cols = pd.DataFrame(list(columns.values())).T
        mars_int_cols = md.DataFrame(raw_int_cols, chunk_size=chunk)
        assert_groupby_equal(run(mars_int_cols.groupby(0)),
                             raw_int_cols.groupby(0))

        # Series with the default index, grouped by a callable
        raw_series = pd.Series(columns['a'])
        mars_series = md.Series(raw_series, chunk_size=chunk)
        assert_groupby_equal(run(mars_series.groupby(lambda x: x % 3)),
                             raw_series.groupby(lambda x: x % 3))

        # Series grouped by itself
        assert_groupby_equal(run(mars_series.groupby(mars_series)),
                             raw_series.groupby(raw_series))

        # Series with a string index, grouped by a callable on the label
        raw_series_labeled = pd.Series(columns['a'], index=labels)
        mars_series_labeled = md.Series(raw_series_labeled, chunk_size=chunk)
        assert_groupby_equal(
            run(mars_series_labeled.groupby(lambda x: int(x[1:]) % 3)),
            raw_series_labeled.groupby(lambda x: int(x[1:]) % 3))
예제 #9
0
파일: test_lib.py 프로젝트: qinxuye/mars
def test_groupby_wrapper():
    """Check that wrapped groupby objects survive a pickle round trip.

    Each case builds a ``wrapped_groupby`` (optionally with a column
    selection), passes it through ``pickle.dumps`` / ``pickle.loads``, and
    then compares the result with the equivalent plain pandas groupby via
    ``assert_groupby_equal``.  The ``shape`` and ``is_frame`` attributes of
    the restored object are asserted as well.
    """
    # Fixture: 8 rows, columns A-D, two-level MultiIndex of (i // 4, i).
    df = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        },
        index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

    # Round-trip helper: serialize with pickle and load the result back.
    conv_func = lambda x: pickle.loads(pickle.dumps(x))

    # Group by index level 0, no column selection.
    grouped = conv_func(wrapped_groupby(df, level=0))
    assert_groupby_equal(grouped, df.groupby(level=0))
    assert grouped.shape == (8, 4)
    assert grouped.is_frame is True
    # The wrapper must report a larger size than the object exposed as
    # `groupby_obj`, both through sys.getsizeof and through calc_data_size.
    assert sys.getsizeof(grouped) > sys.getsizeof(grouped.groupby_obj)
    assert calc_data_size(grouped) > sys.getsizeof(grouped.groupby_obj)

    # Group by index level 0, single column selected via attribute access.
    grouped = conv_func(wrapped_groupby(df, level=0).C)
    assert_groupby_equal(grouped, df.groupby(level=0).C)
    assert grouped.shape == (8, )
    assert grouped.is_frame is False

    # Group by one column name, no selection.
    grouped = conv_func(wrapped_groupby(df, 'B'))
    assert_groupby_equal(grouped, df.groupby('B'))
    assert grouped.shape == (8, 4)
    assert grouped.is_frame is True

    # Group by one column name, single column selected.
    grouped = conv_func(wrapped_groupby(df, 'B').C)
    assert_groupby_equal(grouped, df.groupby('B').C, with_selection=True)
    assert grouped.shape == (8, )
    assert grouped.is_frame is False

    # Group by one column name, two columns selected.
    grouped = conv_func(wrapped_groupby(df, 'B')[['C', 'D']])
    assert_groupby_equal(grouped,
                         df.groupby('B')[['C', 'D']],
                         with_selection=True)
    assert grouped.shape == (8, 2)
    assert grouped.is_frame is True

    # Group by a list of two column names, no selection.
    grouped = conv_func(wrapped_groupby(df, ['B', 'C']))
    assert_groupby_equal(grouped, df.groupby(['B', 'C']))
    assert grouped.shape == (8, 4)
    assert grouped.is_frame is True

    # Group by two column names, selecting a column that is also a key.
    grouped = conv_func(wrapped_groupby(df, ['B', 'C']).C)
    assert_groupby_equal(grouped,
                         df.groupby(['B', 'C']).C,
                         with_selection=True)
    assert grouped.shape == (8, )
    assert grouped.is_frame is False

    # Group by two column names, selecting two non-key columns.
    grouped = conv_func(wrapped_groupby(df, ['B', 'C'])[['A', 'D']])
    assert_groupby_equal(grouped,
                         df.groupby(['B', 'C'])[['A', 'D']],
                         with_selection=True)
    assert grouped.shape == (8, 2)
    assert grouped.is_frame is True

    # Group by two column names, selection overlapping the keys ('C').
    grouped = conv_func(wrapped_groupby(df, ['B', 'C'])[['C', 'D']])
    assert_groupby_equal(grouped,
                         df.groupby(['B', 'C'])[['C', 'D']],
                         with_selection=True)
    assert grouped.shape == (8, 2)
    assert grouped.is_frame is True

    # Group by a callable: parity of the last element of each index label.
    grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2))
    assert_groupby_equal(grouped,
                         df.groupby(lambda x: x[-1] % 2),
                         with_selection=True)
    assert grouped.shape == (8, 4)
    assert grouped.is_frame is True

    # Callable key, single column selected.
    grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2).C)
    assert_groupby_equal(grouped,
                         df.groupby(lambda x: x[-1] % 2).C,
                         with_selection=True)
    assert grouped.shape == (8, )
    assert grouped.is_frame is False

    # Callable key, two columns selected.
    grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2)[['C', 'D']])
    assert_groupby_equal(grouped,
                         df.groupby(lambda x: x[-1] % 2)[['C', 'D']],
                         with_selection=True)
    assert grouped.shape == (8, 2)
    assert grouped.is_frame is True

    # Callable key applied to a Series (column B) instead of the frame.
    grouped = conv_func(wrapped_groupby(df.B, lambda x: x[-1] % 2))
    assert_groupby_equal(grouped,
                         df.B.groupby(lambda x: x[-1] % 2),
                         with_selection=True)
    assert grouped.shape == (8, )
    assert grouped.is_frame is False