Exemplo n.º 1
0
        def testWithGivenSession(session):
            """Round-trip a chunked DataFrame through vineyard with ``session``.

            A 3x4 frame is written with ``to_vineyard()``, read back with
            ``md.from_vineyard`` and compared with the original after both
            indexes have been reset.
            """
            vineyard_opts = {'vineyard.socket': '/tmp/vineyard/vineyard.sock'}
            with option_context(vineyard_opts):
                source = pd.DataFrame(np.arange(12).reshape(3, 4),
                                      columns=['a', 'b', 'c', 'd'])
                df1 = DataFrame(source, chunk_size=2)

                # Store the frame and obtain the id it was stored under.
                stored = df1.to_vineyard().execute(session=session)
                object_id = stored.fetch()
                df2 = md.from_vineyard(object_id)

                original_value = df1.execute(session=session).fetch()
                restored_value = df2.execute(session=session).fetch()
                pd.testing.assert_frame_equal(
                    original_value.reset_index(drop=True),
                    restored_value.reset_index(drop=True))
Exemplo n.º 2
0
    def testArrowStringSortValues(self):
        """Sorting by an ``ArrowStringDtype`` column must match pandas."""
        rng = np.random.RandomState(0)
        # Draw the floats first and the labels second so that the seeded
        # generator produces the same data as before.
        floats = rng.rand(10)
        labels = [f's{rng.randint(1000)}' for _ in range(10)]
        raw = pd.DataFrame({'a': floats, 'b': labels})
        raw['b'] = raw['b'].astype(ArrowStringDtype())

        sorted_mdf = DataFrame(raw, chunk_size=3).sort_values(by='b')
        result = self.executor.execute_dataframe(sorted_mdf, concat=True)[0]

        pd.testing.assert_frame_equal(result, raw.sort_values(by='b'))
Exemplo n.º 3
0
    def testToParquetFastParquetExecution(self):
        """Write a chunked DataFrame to parquet with the fastparquet engine.

        Only the write itself is executed; the files are not read back here.
        """
        columns = {
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100,)),
        }
        raw = pd.DataFrame(columns)
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # Wildcard target inside the temporary directory.
            target = os.path.join(base_path, 'out-fastparquet-*.parquet')
            write_op = df.to_parquet(target,
                                     engine='fastparquet',
                                     compression='gzip')
            self.executor.execute_dataframe(write_op)
Exemplo n.º 4
0
        def run_with_given_session(session, **kw):
            """Round-trip a chunked DataFrame through vineyard with ``session``.

            The socket path comes from ``VINEYARD_IPC_SOCKET`` and falls back
            to ``/tmp/vineyard/vineyard.sock``.  Extra keyword arguments are
            forwarded to every ``execute`` call.
            """
            default_socket = '/tmp/vineyard/vineyard.sock'
            ipc_socket = os.environ.get('VINEYARD_IPC_SOCKET', default_socket)

            def materialize(tileable):
                # Execute and fetch through the same explicit session.
                executed = tileable.execute(session=session, **kw)
                return executed.fetch(session=session)

            with option_context({'vineyard.socket': ipc_socket}):
                source = pd.DataFrame(np.arange(12).reshape(3, 4),
                                      columns=['a', 'b', 'c', 'd'])
                df1 = DataFrame(source, chunk_size=2)

                object_id = materialize(df1.to_vineyard())
                df2 = md.from_vineyard(object_id)

                original_value = materialize(df1)
                restored_value = materialize(df2)
                pd.testing.assert_frame_equal(
                    original_value.reset_index(drop=True),
                    restored_value.reset_index(drop=True))
Exemplo n.º 5
0
def test_to_csv():
    """Check the chunk graph produced by tiling ``DataFrame.to_csv``.

    Two targets are covered: a wildcard path and a single output file.
    """
    raw = pd.DataFrame(np.random.rand(10, 5))
    df = DataFrame(raw, chunk_size=4)

    # Wildcard path: every output chunk is a DataFrameToCSV op fed directly
    # by the input chunk at the same position.
    tiled = tile(df.to_csv('*.csv'))

    assert tiled.chunk_shape[1] == 1
    for pos, chunk in enumerate(tiled.chunks):
        assert type(chunk.op).__name__ == 'DataFrameToCSV'
        assert chunk.inputs[0] is tiled.inputs[0].chunks[pos].data

    # Single file: every output chunk has two inputs, the second of which is
    # produced by a DataFrameToCSVStat op.
    tiled = tile(df.to_csv('out.csv'))

    assert tiled.chunk_shape[1] == 1
    for pos, chunk in enumerate(tiled.chunks):
        assert len(chunk.inputs) == 2
        first_input, second_input = chunk.inputs[0], chunk.inputs[1]
        assert first_input.inputs[0] is tiled.inputs[0].chunks[pos].data
        assert type(second_input.op).__name__ == 'DataFrameToCSVStat'
Exemplo n.º 6
0
    def testToParquetArrowExecution(self):
        """Write parquet with the default engine and read it back.

        Covers a wildcard output path, rewriting files that were just read,
        and output partitioned by ``col3``.
        """
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        df = DataFrame(raw, chunk_size=33)

        def read_back(location):
            # Load parquet output and concatenate it into one pandas frame.
            lazy = md.read_parquet(location)
            return self.executor.execute_dataframe(lazy, concat=True)[0]

        with tempfile.TemporaryDirectory() as base_path:
            # wildcard output
            path = os.path.join(base_path, 'out-*.parquet')
            self.executor.execute_dataframe(df.to_parquet(path))

            # The read-back check runs twice against the same files.
            for _ in range(2):
                restored = read_back(path).sort_index()
                pd.testing.assert_frame_equal(restored, raw)

            # read_parquet followed by to_parquet onto the same path
            rewrite = md.read_parquet(path).to_parquet(path)
            self.executor.execute_dataframes([rewrite])

            # partition_cols
            path = os.path.join(base_path, 'out-partitioned')
            partitioned = df.to_parquet(path, partition_cols=['col3'])
            self.executor.execute_dataframe(partitioned)

            restored = read_back(path)
            # Cast col3 back to object before comparing with the input.
            restored['col3'] = restored['col3'].astype('object')
            actual = restored.sort_values('col1').reset_index(drop=True)
            wanted = raw.sort_values('col1').reset_index(drop=True)
            pd.testing.assert_frame_equal(actual, wanted)
Exemplo n.º 7
0
    def testToSQL(self):
        """Write a DataFrame and a Series to SQLite through ``to_sql``.

        Three cases are covered: writing a chunked DataFrame, writing to a
        table that already exists (expected to raise ``ValueError``), and
        writing a chunked Series through an open connection.  Written tables
        are read back with ``pd.read_sql`` and compared with the input.
        """
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100).astype('int64'),
            },
            index=index)

        with tempfile.TemporaryDirectory() as d:
            table_name1 = 'test_table'
            table_name2 = 'test_table2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            engine = sqlalchemy.create_engine(uri)
            try:
                # test write dataframe
                df = DataFrame(raw, chunk_size=33)
                r = df.to_sql(table_name1, con=engine)
                self.executor.execute_dataframe(r)

                # The index runs from 100 down to 1, so sort descending to
                # line the rows up with ``raw``.
                written = pd.read_sql(
                    table_name1, con=engine, index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw, written)

                # test write with existing table
                with self.assertRaises(ValueError):
                    df.to_sql(table_name1, con=uri).execute()

                # test write series
                series = md.Series(raw.col1, chunk_size=33)
                with engine.connect() as conn:
                    r = series.to_sql(table_name2, con=conn)
                    self.executor.execute_dataframe(r)

                written = pd.read_sql(
                    table_name2, con=engine, index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw.col1.to_frame(), written)
            finally:
                # Close the engine's pooled connections before the temporary
                # directory (and test.db inside it) is deleted, also when an
                # assertion above fails.
                engine.dispose()
Exemplo n.º 8
0
    def testToCSVExecution(self):
        """Write a chunked DataFrame to CSV and compare with the input.

        Covers writing to one file and writing one file per chunk through a
        wildcard path.
        """
        index = pd.RangeIndex(100, 0, -1, name='index')
        columns = {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100)
        }
        raw = pd.DataFrame(columns, index=index)
        df = DataFrame(raw, chunk_size=33)

        def load_csv(csv_path):
            # Read with the dtypes of the input so the frames compare equal.
            return pd.read_csv(csv_path, dtype=raw.dtypes.to_dict())

        with tempfile.TemporaryDirectory() as base_path:
            # one output file
            path = os.path.join(base_path, 'out.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            single = load_csv(path)
            single.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(single, raw)

            # one output file per chunk
            path = os.path.join(base_path, 'out-*.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            # 100 rows in chunks of 33 rows give four files, out-0 .. out-3.
            dfs = []
            for i in range(4):
                part_path = os.path.join(base_path, 'out-{}.csv'.format(i))
                dfs.append(load_csv(part_path))

            combined = pd.concat(dfs, axis=0)
            combined.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(combined, raw)

            # The second file must hold exactly the second chunk of rows.
            second_part = dfs[1].set_index('index')
            pd.testing.assert_frame_equal(second_part, raw.iloc[33:66])
Exemplo n.º 9
0
    def testGPUExecution(self):
        """Check sort_values and sort_index on GPU-backed DataFrame/Series.

        Each object is moved with ``to_gpu()``, executed through
        ``self.executor`` and converted back with ``to_pandas()`` before it
        is compared with the same sort performed by pandas.

        ``PSRS_DISTINCT_COL`` is set to every value in ``distinct_opts`` for
        the ``sort_values`` checks.  Whatever the variable held before the
        test (including being unset) is restored on exit, even when an
        assertion fails, so the setting does not leak into later tests.
        """
        # Only '0' is used on Windows; other platforms use '0' and '1'.
        if sys.platform.lower().startswith('win'):
            distinct_opts = ['0']
        else:
            distinct_opts = ['0', '1']

        previous_distinct = os.environ.get('PSRS_DISTINCT_COL')
        try:
            # test sort_values
            for add_distinct in distinct_opts:
                os.environ['PSRS_DISTINCT_COL'] = add_distinct

                # test dataframe
                raw = pd.DataFrame(
                    np.random.rand(100, 10),
                    columns=['a' + str(i) for i in range(10)])
                mdf = DataFrame(raw, chunk_size=30).to_gpu()

                result = self.executor.execute_dataframe(
                    mdf.sort_values(by='a0'), concat=True)[0]
                expected = raw.sort_values(by='a0')
                pd.testing.assert_frame_equal(result.to_pandas(), expected)

                # test series
                raw = pd.Series(np.random.rand(10))
                series = Series(raw).to_gpu()

                result = self.executor.execute_dataframe(
                    series.sort_values(), concat=True)[0]
                expected = raw.sort_values()
                pd.testing.assert_series_equal(result.to_pandas(), expected)

            # test DataFrame.sort_index
            raw = pd.DataFrame(np.random.rand(10, 10),
                               columns=np.random.rand(10))
            mdf = DataFrame(raw).to_gpu()

            result = self.executor.execute_dataframe(mdf.sort_index(),
                                                     concat=True)[0]
            expected = raw.sort_index()
            pd.testing.assert_frame_equal(result.to_pandas(), expected)

            # test Series.sort_index
            raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
            series = Series(raw).to_gpu()

            result = self.executor.execute_dataframe(series.sort_index(),
                                                     concat=True)[0]
            expected = raw.sort_index()
            pd.testing.assert_series_equal(result.to_pandas(), expected)
        finally:
            # Put the environment back exactly as it was found.
            if previous_distinct is None:
                os.environ.pop('PSRS_DISTINCT_COL', None)
            else:
                os.environ['PSRS_DISTINCT_COL'] = previous_distinct
Exemplo n.º 10
0
    def testToDatetimeExecution(self):
        """Compare ``to_datetime`` with ``pd.to_datetime`` on several inputs.

        Inputs covered: a scalar, a tensor of strings, a Series, a DataFrame
        with year/month/day columns, an Index, and plain lists combined with
        the ``errors``, ``unit`` and ``origin`` arguments.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # scalar
        epoch_seconds = 1490195805
        scalar_result = run(to_datetime(epoch_seconds, unit='s'))
        scalar_expected = pd.to_datetime(epoch_seconds, unit='s')
        self.assertEqual(pd.to_datetime(scalar_result.item()),
                         scalar_expected)

        # list-like input wrapped in a chunked tensor
        date_strings = ['3/11/2000', '3/12/2000', '3/13/2000']
        chunked = tensor(date_strings, chunk_size=2)
        pd.testing.assert_index_equal(
            run(to_datetime(chunked, infer_datetime_format=True)),
            pd.to_datetime(date_strings, infer_datetime_format=True))

        # Series
        raw_series = pd.Series(date_strings)
        mars_series = Series(raw_series, chunk_size=2)
        pd.testing.assert_series_equal(run(to_datetime(mars_series)),
                                       pd.to_datetime(raw_series))

        # DataFrame with year / month / day columns
        raw_df = pd.DataFrame({
            'year': [2015, 2016],
            'month': [2, 3],
            'day': [4, 5]
        })
        mars_df = DataFrame(raw_df, chunk_size=(1, 2))
        pd.testing.assert_series_equal(run(to_datetime(mars_df)),
                                       pd.to_datetime(raw_df))

        # Index
        raw_index = pd.Index([1, 2, 3])
        mars_index = Index(raw_index, chunk_size=2)
        pd.testing.assert_index_equal(run(to_datetime(mars_index)),
                                      pd.to_datetime(raw_index))

        # errors == 'ignore'
        early_dates = ['13000101']
        pd.testing.assert_index_equal(
            run(to_datetime(early_dates, format='%Y%m%d', errors='ignore')),
            pd.to_datetime(early_dates, format='%Y%m%d', errors='ignore'))

        # unit
        pd.testing.assert_index_equal(
            run(to_datetime([1490195805], unit='s')),
            pd.to_datetime([1490195805], unit='s'))

        # origin
        day_offsets = to_datetime([1, 2, 3],
                                  unit='D',
                                  origin=pd.Timestamp('1960-01-01'))
        pd.testing.assert_index_equal(
            run(day_offsets),
            pd.to_datetime([1, 2, 3],
                           unit='D',
                           origin=pd.Timestamp('1960-01-01')))
Exemplo n.º 11
0
    def testDotExecution(self):
        """Compare ``dot`` / ``@`` on DataFrame and Series with pandas.

        The right-hand operand is in turn a DataFrame, a Series, a 2-d
        ndarray and a 1-d ndarray.
        """
        df1_raw = pd.DataFrame(np.random.rand(4, 7))
        df2_raw = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
        s1_raw = pd.Series(np.random.rand(7))
        s2_raw = pd.Series(np.random.rand(7))

        df1 = DataFrame(df1_raw, chunk_size=(3, 2))
        df2 = DataFrame(df2_raw, chunk_size=(3, 4))

        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame . DataFrame
        pd.testing.assert_frame_equal(run(df1.dot(df2)),
                                      df1_raw.dot(df2_raw))

        # DataFrame @ DataFrame
        pd.testing.assert_frame_equal(run(df1 @ df2), df1_raw @ df2_raw)

        series1 = Series(s1_raw, chunk_size=5)

        # DataFrame . Series
        pd.testing.assert_series_equal(run(df1.dot(series1)),
                                       df1_raw.dot(s1_raw))

        # DataFrame . 2-d ndarray
        pd.testing.assert_frame_equal(run(df1.dot(df2_raw.to_numpy())),
                                      df1_raw.dot(df2_raw.to_numpy()))

        # DataFrame . 1-d ndarray
        pd.testing.assert_series_equal(run(df1.dot(s1_raw.to_numpy())),
                                       df1_raw.dot(s1_raw.to_numpy()))

        series2 = Series(s2_raw, chunk_size=4)

        # Series . Series
        self.assertAlmostEqual(run(series1.dot(series2)),
                               s1_raw.dot(s2_raw))

        # Series . DataFrame
        pd.testing.assert_series_equal(run(series1.dot(df2)),
                                       s1_raw.dot(df2_raw))

        # Series . 2-d ndarray
        np.testing.assert_almost_equal(run(series1.dot(df2_raw.to_numpy())),
                                       s1_raw.dot(df2_raw.to_numpy()))

        # Series . 1-d ndarray
        self.assertAlmostEqual(run(series1.dot(s2_raw.to_numpy())),
                               s1_raw.dot(s2_raw.to_numpy()))
Exemplo n.º 12
0
def _check_sort_values():
    """Run every ``sort_values`` comparison once against pandas.

    Called by ``test_sort_values_execution`` for each ``PSRS_DISTINCT_COL``
    setting.  Each section builds pandas data, sorts the chunked counterpart
    and asserts that the fetched result equals the pandas result.
    """
    df = pd.DataFrame(np.random.rand(100, 10),
                      columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(df)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a6', 'a7'],
                             ascending=False).execute().fetch()
    expected = df.sort_values(['a6', 'a7'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test psrs
    mdf = DataFrame(df, chunk_size=10)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])

    pd.testing.assert_frame_equal(result, expected)

    # test ascending=False
    result = mdf.sort_values(['a0', 'a1'],
                             ascending=False).execute().fetch()
    expected = df.sort_values(['a0', 'a1'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a7'], ascending=False).execute().fetch()
    expected = df.sort_values(['a7'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test multiindex
    df2 = df.copy(deep=True)
    df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    mdf = DataFrame(df2, chunk_size=10)

    result = mdf.sort_values([('A', 'C')]).execute().fetch()
    expected = df2.sort_values([('A', 'C')])

    pd.testing.assert_frame_equal(result, expected)

    # test rechunk
    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])

    pd.testing.assert_frame_equal(result, expected)

    # test other types
    raw = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        }, )
    mdf = DataFrame(raw, chunk_size=3)

    for label in raw.columns:
        result = mdf.sort_values(label).execute().fetch()
        expected = raw.sort_values(label)
        pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a', 'b', 'e'],
                             ascending=False).execute().fetch()
    expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test nan
    df = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    mdf = DataFrame(df)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])

    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])

    pd.testing.assert_frame_equal(result, expected)

    # test None (issue #1885)
    df = pd.DataFrame(np.random.rand(1000, 10))

    # NOTE(review): these two lines use chained assignment; with pandas
    # Copy-on-Write enabled they would presumably leave ``df`` unchanged.
    # Confirm against the pandas versions this suite supports.
    df[0][df[0] < 0.5] = 'A'
    df[0][df[0] != 'A'] = None

    mdf = DataFrame(df)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])

    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=100)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])

    pd.testing.assert_frame_equal(result, expected)

    # test ignore_index
    df = pd.DataFrame(np.random.rand(10, 3),
                      columns=['a' + str(i) for i in range(3)])

    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['a0', 'a1'],
                             ignore_index=True).execute().fetch()
    try:
        expected = df.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # Fallback when sort_values rejects the ignore_index keyword
        # (presumably an older pandas release): renumber the index by hand.
        expected = df.sort_values(['a0', 'a1'])
        expected.index = pd.RangeIndex(len(expected))

    pd.testing.assert_frame_equal(result, expected)

    # test inplace
    mdf = DataFrame(df)
    mdf.sort_values('a0', inplace=True)
    result = mdf.execute().fetch()
    df.sort_values('a0', inplace=True)

    pd.testing.assert_frame_equal(result, df)

    # test unknown shape
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['a'] > 2]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['a'] > 2].sort_values(by='b'))

    # test empty dataframe
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] > 100]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['b'] > 100].sort_values(by='b'))

    # test chunks with zero length
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    # Zero out rows 4..7, i.e. the whole second chunk of four rows.
    df.iloc[4:8, 1] = 0

    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] != 0]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['b'] != 0].sort_values(by='b'))

    # test Series.sort_values
    raw = pd.Series(np.random.rand(10))
    series = Series(raw)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()

    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()

    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = series.sort_values(ascending=False).execute().fetch()
    expected = raw.sort_values(ascending=False)

    pd.testing.assert_series_equal(result, expected)

    # test empty series
    series = pd.Series(list(range(10)), name='a')
    mseries = Series(series, chunk_size=4)
    filtered = mseries[mseries > 100]
    result = filtered.sort_values().execute().fetch()

    pd.testing.assert_series_equal(result,
                                   series[series > 100].sort_values())

    # test series with None
    series = pd.Series(np.arange(1000, ))

    series[series < 500] = 'A'
    series[series != 'A'] = None

    mseries = Series(series, chunk_size=100)
    result = mseries.sort_values().execute().fetch()
    expected = series.sort_values()
    # Only the values are compared here; both indexes are discarded.
    pd.testing.assert_series_equal(result.reset_index(drop=True),
                                   expected.reset_index(drop=True))


def test_sort_values_execution(setup):
    """Compare ``sort_values`` on DataFrame and Series with pandas.

    The full set of checks in ``_check_sort_values`` runs once for each
    ``PSRS_DISTINCT_COL`` value in ``distinct_opts``.  Whatever the variable
    held before the test (including being unset) is restored on exit, even
    when an assertion fails, so the setting does not leak into later tests.
    """
    # Only '0' is used on Windows; other platforms use '0' and '1'.
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    previous_distinct = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            _check_sort_values()
    finally:
        # Put the environment back exactly as it was found.
        if previous_distinct is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous_distinct
Exemplo n.º 13
0
def test_sort_index_execution(setup):
    """Compare ``sort_index`` on DataFrame and Series with pandas.

    Covers both axes of a DataFrame, several chunk sizes, descending order,
    ``ignore_index`` and in-place sorting, followed by Series.
    """
    def fetch(tileable):
        # Execute in the default session and pull the result back.
        return tileable.execute().fetch()

    raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    # single chunk
    pd.testing.assert_frame_equal(fetch(DataFrame(raw).sort_index()),
                                  raw.sort_index())

    # in-place on a single chunk
    inplace_mdf = DataFrame(raw)
    inplace_mdf.sort_index(inplace=True)
    pd.testing.assert_frame_equal(fetch(inplace_mdf), raw.sort_index())

    # several chunks
    chunked = DataFrame(raw, chunk_size=30)
    pd.testing.assert_frame_equal(fetch(chunked.sort_index()),
                                  raw.sort_index())

    # descending
    chunked = DataFrame(raw, chunk_size=20)
    pd.testing.assert_frame_equal(
        fetch(chunked.sort_index(ascending=False)),
        raw.sort_index(ascending=False))

    # ignore_index
    chunked = DataFrame(raw, chunk_size=10)
    result = fetch(chunked.sort_index(ignore_index=True))
    try:
        expected = raw.sort_index(ignore_index=True)
    except TypeError:
        # Fallback when sort_index rejects the ignore_index keyword:
        # renumber the index by hand.
        expected = raw.sort_index()
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # axis=1: sort by column labels
    raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    pd.testing.assert_frame_equal(fetch(DataFrame(raw).sort_index(axis=1)),
                                  raw.sort_index(axis=1))

    chunked = DataFrame(raw, chunk_size=3)
    pd.testing.assert_frame_equal(fetch(chunked.sort_index(axis=1)),
                                  raw.sort_index(axis=1))

    chunked = DataFrame(raw, chunk_size=4)
    pd.testing.assert_frame_equal(
        fetch(chunked.sort_index(axis=1, ascending=False)),
        raw.sort_index(axis=1, ascending=False))

    chunked = DataFrame(raw, chunk_size=4)
    result = fetch(chunked.sort_index(axis=1, ignore_index=True))
    try:
        expected = raw.sort_index(axis=1, ignore_index=True)
    except TypeError:
        # Same fallback as above; note that it renumbers the row index.
        expected = raw.sort_index(axis=1)
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # Series
    raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    pd.testing.assert_series_equal(fetch(Series(raw).sort_index()),
                                   raw.sort_index())

    chunked_series = Series(raw, chunk_size=2)
    pd.testing.assert_series_equal(fetch(chunked_series.sort_index()),
                                   raw.sort_index())

    chunked_series = Series(raw, chunk_size=3)
    pd.testing.assert_series_equal(
        fetch(chunked_series.sort_index(ascending=False)),
        raw.sort_index(ascending=False))
Exemplo n.º 14
0
    def testDataFrameQuantileExecution(self):
        """Compare ``DataFrame.quantile`` with pandas.

        Covers scalar and list ``q`` on both axes, a non-default
        interpolation, ``q`` given as a tensor, and ``numeric_only=False``.
        """
        frame_index = pd.RangeIndex(1, 11)
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            },
            index=frame_index)
        df = DataFrame(raw, chunk_size=3)

        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        quantiles = [0.3, 0.7]

        # default q, axis=0 -> Series
        pd.testing.assert_series_equal(run(df.quantile()), raw.quantile())

        # default q, axis=1 -> Series
        pd.testing.assert_series_equal(run(df.quantile(axis=1)),
                                       raw.quantile(axis=1))

        # list q, axis=0 -> DataFrame
        pd.testing.assert_frame_equal(run(df.quantile([0.3, 0.7])),
                                      raw.quantile(quantiles))

        # list q, axis=1 -> DataFrame
        pd.testing.assert_frame_equal(run(df.quantile([0.3, 0.7], axis=1)),
                                      raw.quantile(quantiles, axis=1))

        # interpolation
        pd.testing.assert_frame_equal(
            run(df.quantile([0.3, 0.7], interpolation='midpoint')),
            raw.quantile(quantiles, interpolation='midpoint'))

        outer = self

        class MockSession:
            # Minimal stand-in that only exposes the test's executor.
            def __init__(self):
                self.executor = outer.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            # q given as a tensor, executed inside the local context
            q = tensor([0.3, 0.7])
            tensor_result = executor.execute_dataframes([df.quantile(q)])[0]
            pd.testing.assert_frame_equal(tensor_result,
                                          raw.quantile(quantiles))

        # numeric_only=False on a frame that has a timestamp column
        raw2 = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            },
            index=pd.RangeIndex(1, 11))
        df2 = DataFrame(raw2, chunk_size=3)

        pd.testing.assert_frame_equal(
            run(df2.quantile([0.3, 0.7], numeric_only=False)),
            raw2.quantile(quantiles, numeric_only=False))

        pd.testing.assert_series_equal(
            run(df2.quantile(numeric_only=False)),
            raw2.quantile(numeric_only=False))
Exemplo n.º 15
0
def test_dataframe_corr_with(setup):
    """Compare ``corrwith`` on the wrapped DataFrame with pandas.

    Exercises frames built without and with explicit chunk sizes, both
    axes, a Series as the other operand, and checks that
    ``method='kendall'`` raises for the frames built with chunk sizes.

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment -- confirm in conftest.
    """
    rs = np.random.RandomState(0)

    def with_gaps(values):
        # Entries that are not greater than 0.4 are replaced by NaN.
        return np.where(values > 0.4, values, np.nan)

    # The draw order (frame, frame, series, series) fixes the seeded data.
    raw_df = pd.DataFrame(with_gaps(rs.rand(20, 10)),
                          columns=list('ABCDEFGHIJ'))
    raw_df2 = pd.DataFrame(with_gaps(rs.rand(20, 10)),
                           columns=list('ACDEGHIJKL'))
    raw_s = pd.Series(with_gaps(rs.rand(20)))
    raw_s2 = pd.Series(with_gaps(rs.rand(10)), index=raw_df2.columns)

    # Frames created without an explicit chunk size.
    left = DataFrame(raw_df)
    right = DataFrame(raw_df2)
    for kwargs in ({}, {'axis': 1}, {'method': 'kendall'}):
        got = left.corrwith(right, **kwargs).execute().fetch()
        pd.testing.assert_series_equal(
            got, raw_df.corrwith(raw_df2, **kwargs))

    # Frames and series created with explicit chunk sizes.
    left = DataFrame(raw_df, chunk_size=4)
    right = DataFrame(raw_df2, chunk_size=6)
    row_series = Series(raw_s, chunk_size=5)
    col_series = Series(raw_s2, chunk_size=5)

    with pytest.raises(Exception):
        left.corrwith(right, method='kendall').execute()

    chunked_cases = (
        (right, raw_df2, {}),
        (right, raw_df2, {'axis': 1}),
        (row_series, raw_s, {}),
        (col_series, raw_s2, {'axis': 1}),
    )
    for other, raw_other, kwargs in chunked_cases:
        # Both sides are sorted by label before comparing.
        got = left.corrwith(other, **kwargs).execute().fetch().sort_index()
        want = raw_df.corrwith(raw_other, **kwargs).sort_index()
        pd.testing.assert_series_equal(got, want)
Exemplo n.º 16
0
    def testDataFrameCorrWith(self):
        """Compare ``corrwith`` on the wrapped DataFrame with pandas.

        Results are produced through ``self.executor.execute_dataframe``.
        Covers frames built without and with explicit chunk sizes, both
        axes, a Series as the other operand, and checks that
        ``method='kendall'`` raises for the frames built with chunk sizes.
        """
        rs = np.random.RandomState(0)

        def with_gaps(values):
            # Entries that are not greater than 0.4 are replaced by NaN.
            return np.where(values > 0.4, values, np.nan)

        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # The draw order (frame, frame, series, series) fixes the seeded data.
        raw_df = pd.DataFrame(with_gaps(rs.rand(20, 10)),
                              columns=list('ABCDEFGHIJ'))
        raw_df2 = pd.DataFrame(with_gaps(rs.rand(20, 10)),
                               columns=list('ACDEGHIJKL'))
        raw_s = pd.Series(with_gaps(rs.rand(20)))
        raw_s2 = pd.Series(with_gaps(rs.rand(10)), index=raw_df2.columns)

        # Frames created without an explicit chunk size.
        left = DataFrame(raw_df)
        right = DataFrame(raw_df2)
        for kwargs in ({}, {'axis': 1}, {'method': 'kendall'}):
            got = run(left.corrwith(right, **kwargs))
            pd.testing.assert_series_equal(
                got, raw_df.corrwith(raw_df2, **kwargs))

        # Frames and series created with explicit chunk sizes.
        left = DataFrame(raw_df, chunk_size=4)
        right = DataFrame(raw_df2, chunk_size=6)
        row_series = Series(raw_s, chunk_size=5)
        col_series = Series(raw_s2, chunk_size=5)

        with self.assertRaises(Exception):
            self.executor.execute_dataframe(
                left.corrwith(right, method='kendall'), concat=True)

        chunked_cases = (
            (right, raw_df2, {}),
            (right, raw_df2, {'axis': 1}),
            (row_series, raw_s, {}),
            (col_series, raw_s2, {'axis': 1}),
        )
        for other, raw_other, kwargs in chunked_cases:
            # Both sides are sorted by label before comparing.
            got = run(left.corrwith(other, **kwargs)).sort_index()
            want = raw_df.corrwith(raw_other, **kwargs).sort_index()
            pd.testing.assert_series_equal(got, want)
Exemplo n.º 17
0
    def testSortValuesExecution(self):
        """Compare ``sort_values`` on wrapped frames/series with pandas.

        Every case executes the wrapped object first and then computes the
        pandas reference with the same call. Covered: several chunk sizes,
        descending order, mixed column types, NaN values, ``ignore_index``,
        ``inplace=True``, a filtered frame, and Series.
        """
        def execute(tileable, executor=None):
            runner = self.executor if executor is None else executor
            return runner.execute_dataframe(tileable, concat=True)[0]

        def check_frame(wrapped, plain, op):
            got = execute(op(wrapped))
            pd.testing.assert_frame_equal(got, op(plain))

        def check_series(wrapped, plain, op):
            got = execute(op(wrapped))
            pd.testing.assert_series_equal(got, op(plain))

        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        whole = DataFrame(df)
        check_frame(whole, df, lambda d: d.sort_values('a0'))
        check_frame(whole, df,
                    lambda d: d.sort_values(['a6', 'a7'], ascending=False))

        # test psrs (chunk_size=10), including ascending=False
        by_ten = DataFrame(df, chunk_size=10)
        for op in (
                lambda d: d.sort_values('a0'),
                lambda d: d.sort_values(['a3', 'a4']),
                lambda d: d.sort_values(['a0', 'a1'], ascending=False),
                lambda d: d.sort_values(['a7'], ascending=False)):
            check_frame(by_ten, df, op)

        # test rechunk (chunk_size=3)
        by_three = DataFrame(df, chunk_size=3)
        for op in (
                lambda d: d.sort_values('a0'),
                lambda d: d.sort_values(['a3', 'a4'])):
            check_frame(by_three, df, op)

        # test other types
        typed_raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            }, )
        typed = DataFrame(typed_raw, chunk_size=3)
        for label in typed_raw.columns:
            check_frame(typed, typed_raw,
                        lambda d, label=label: d.sort_values(label))
        check_frame(
            typed, typed_raw,
            lambda d: d.sort_values(['a', 'b', 'e'], ascending=False))

        # test nan
        nan_raw = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        check_frame(DataFrame(nan_raw), nan_raw,
                    lambda d: d.sort_values(['col2']))
        check_frame(DataFrame(nan_raw, chunk_size=3), nan_raw,
                    lambda d: d.sort_values(['col2']))

        # test ignore_index (uses an executor backed by a new session)
        session_executor = ExecutorForTest(storage=new_session().context)

        small = pd.DataFrame(np.random.rand(10, 3),
                             columns=['a' + str(i) for i in range(3)])
        small_chunked = DataFrame(small, chunk_size=3)
        got = execute(
            small_chunked.sort_values(['a0', 'a1'], ignore_index=True),
            session_executor)
        try:  # for python3.5
            want = small.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            # pandas rejected ``ignore_index``; renumber the rows by hand.
            want = small.sort_values(['a0', 'a1'])
            want.index = pd.RangeIndex(len(want))
        pd.testing.assert_frame_equal(got, want)

        # test inplace
        inplace_df = DataFrame(small)
        inplace_df.sort_values('a0', inplace=True)
        got = execute(inplace_df)
        small.sort_values('a0', inplace=True)
        pd.testing.assert_frame_equal(got, small)

        # test unknown shape
        source = pd.DataFrame({'a': list(range(10)),
                               'b': np.random.random(10)})
        wrapped = DataFrame(source, chunk_size=4)
        filtered = wrapped[wrapped['a'] > 2]
        got = execute(filtered.sort_values(by='b'))
        pd.testing.assert_frame_equal(
            got, source[source['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw_series = pd.Series(np.random.rand(10))
        series_cases = (
            ({}, lambda s: s.sort_values()),
            ({'chunk_size': 3}, lambda s: s.sort_values()),
            ({'chunk_size': 2}, lambda s: s.sort_values(ascending=False)),
        )
        for chunk_kwargs, op in series_cases:
            check_series(Series(raw_series, **chunk_kwargs), raw_series, op)
Exemplo n.º 18
0
def test_to_datetime_execution(setup):
    """Compare ``to_datetime`` with ``pd.to_datetime`` for several inputs.

    Inputs covered: a scalar, a chunked tensor of strings, a Series, a
    DataFrame with year/month/day columns, an Index, and plain lists
    combined with the ``errors``, ``unit`` and ``origin`` keywords.

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment -- confirm in conftest.
    """
    def fetch(tileable, **fetch_kwargs):
        return tileable.execute().fetch(**fetch_kwargs)

    # scalar
    seconds = 1490195805
    got = fetch(to_datetime(seconds, unit='s'),
                extra_config={
                    'check_dtypes': False,
                    'check_shape': False
                })
    want = pd.to_datetime(seconds, unit='s')
    assert pd.to_datetime(got) == want

    # test list like
    dates = ['3/11/2000', '3/12/2000', '3/13/2000']
    chunked_dates = tensor(dates, chunk_size=2)
    got = fetch(to_datetime(chunked_dates, infer_datetime_format=True))
    pd.testing.assert_index_equal(
        got, pd.to_datetime(dates, infer_datetime_format=True))

    # test series
    raw_series = pd.Series(dates)
    got = fetch(to_datetime(Series(raw_series, chunk_size=2)))
    pd.testing.assert_series_equal(got, pd.to_datetime(raw_series))

    # test DataFrame
    raw_df = pd.DataFrame({
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5]
    })
    got = fetch(to_datetime(DataFrame(raw_df, chunk_size=(1, 2))))
    pd.testing.assert_series_equal(got, pd.to_datetime(raw_df))

    # test Index
    raw_index = pd.Index([1, 2, 3])
    got = fetch(to_datetime(Index(raw_index, chunk_size=2)))
    pd.testing.assert_index_equal(got, pd.to_datetime(raw_index))

    # Plain-list inputs: each builder is applied to the function under
    # test first and then to pandas.
    list_cases = (
        # test errors == 'ignore'
        lambda convert: convert(['13000101'], format='%Y%m%d',
                                errors='ignore'),
        # test unit
        lambda convert: convert([1490195805], unit='s'),
        # test origin
        lambda convert: convert([1, 2, 3], unit='D',
                                origin=pd.Timestamp('1960-01-01')),
    )
    for build in list_cases:
        got = fetch(build(to_datetime))
        pd.testing.assert_index_equal(got, build(pd.to_datetime))
Exemplo n.º 19
0
def test_dot_execution(setup):
    """Compare ``dot`` (and ``@``) on wrapped frames/series with pandas.

    Operand combinations: frame x frame, frame x series, frame x ndarray
    (2-d and 1-d), series x series, series x frame, series x ndarray
    (2-d and 1-d).

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment -- confirm in conftest.
    """
    left_raw = pd.DataFrame(np.random.rand(4, 7))
    right_raw = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
    vec1_raw = pd.Series(np.random.rand(7))
    vec2_raw = pd.Series(np.random.rand(7))

    left = DataFrame(left_raw, chunk_size=(3, 2))
    right = DataFrame(right_raw, chunk_size=(3, 4))

    def fetch(tileable):
        return tileable.execute().fetch()

    frame_eq = pd.testing.assert_frame_equal
    series_eq = pd.testing.assert_series_equal

    # df.dot(df)
    got = fetch(left.dot(right))
    frame_eq(got, left_raw.dot(right_raw))

    # test @
    got = fetch(left @ right)
    frame_eq(got, left_raw @ right_raw)

    vec1 = Series(vec1_raw, chunk_size=5)

    # df.dot(series)
    got = fetch(left.dot(vec1))
    series_eq(got, left_raw.dot(vec1_raw))

    # df.dot(2d_array)
    got = fetch(left.dot(right_raw.to_numpy()))
    frame_eq(got, left_raw.dot(right_raw.to_numpy()))

    # df.dot(1d_array)
    got = fetch(left.dot(vec1_raw.to_numpy()))
    series_eq(got, left_raw.dot(vec1_raw.to_numpy()))

    vec2 = Series(vec2_raw, chunk_size=4)

    # series.dot(series) -- compared approximately
    got = fetch(vec1.dot(vec2))
    want = vec1_raw.dot(vec2_raw)
    assert pytest.approx(got) == want

    # series.dot(df)
    got = fetch(vec1.dot(right))
    series_eq(got, vec1_raw.dot(right_raw))

    # series.dot(2d_array) -- result is compared as an array
    got = fetch(vec1.dot(right_raw.to_numpy()))
    np.testing.assert_almost_equal(got, vec1_raw.dot(right_raw.to_numpy()))

    # series.dot(1d_array) -- compared approximately
    got = fetch(vec1.dot(vec2_raw.to_numpy()))
    want = vec1_raw.dot(vec2_raw.to_numpy())
    assert pytest.approx(got) == want
Exemplo n.º 20
0
def test_dataframe_quantile_execution(setup):
    """Compare ``DataFrame.quantile`` on the wrapped frame with pandas.

    Covers the default ``q``, a list ``q``, both axes, an explicit
    ``interpolation``, ``q`` given as a tensor, and ``numeric_only=False``
    on a frame that includes a timestamp column.

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment -- confirm in conftest.
    """
    series_eq = pd.testing.assert_series_equal
    frame_eq = pd.testing.assert_frame_equal

    def check(wrapped, plain, op, compare):
        # Execute the wrapped call first, then build the pandas reference.
        got = op(wrapped).execute().fetch()
        compare(got, op(plain))

    raw = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        },
        index=pd.RangeIndex(1, 11))
    df = DataFrame(raw, chunk_size=3)

    cases = (
        # q = 0.5, axis = 0, series
        (lambda d: d.quantile(), series_eq),
        # q = 0.5, axis = 1, series
        (lambda d: d.quantile(axis=1), series_eq),
        # q is a list, axis = 0, dataframe
        (lambda d: d.quantile([0.3, 0.7]), frame_eq),
        # q is a list, axis = 1, dataframe
        (lambda d: d.quantile([0.3, 0.7], axis=1), frame_eq),
        # test interpolation
        (lambda d: d.quantile([0.3, 0.7], interpolation='midpoint'),
         frame_eq),
    )
    for op, compare in cases:
        check(df, raw, op, compare)

    # q is a tensor; pandas gets the equivalent list
    q = tensor([0.3, 0.7])
    got = df.quantile(q).execute().fetch()
    frame_eq(got, raw.quantile([0.3, 0.7]))

    # test numeric_only
    raw2 = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [pd.Timestamp(f'201{i}') for i in range(10)],
        },
        index=pd.RangeIndex(1, 11))
    df2 = DataFrame(raw2, chunk_size=3)

    check(df2, raw2,
          lambda d: d.quantile([0.3, 0.7], numeric_only=False), frame_eq)
    check(df2, raw2, lambda d: d.quantile(numeric_only=False), series_eq)
Exemplo n.º 21
0
    def testSortIndexExecution(self):
        """Compare ``sort_index`` on wrapped frames/series with pandas.

        Covers row and column sorting, several chunk sizes, descending
        order, ``inplace=True``, ``ignore_index=True`` (run through an
        executor backed by a new session) and Series.
        """
        frame_eq = pd.testing.assert_frame_equal
        series_eq = pd.testing.assert_series_equal

        def execute(tileable, executor=None):
            runner = self.executor if executor is None else executor
            return runner.execute_dataframe(tileable, concat=True)[0]

        def check(wrapped, plain, op, compare=frame_eq):
            # Execute the wrapped call first, then build the reference.
            got = execute(op(wrapped))
            compare(got, op(plain))

        def ascending(obj):
            return obj.sort_index()

        def descending(obj):
            return obj.sort_index(ascending=False)

        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        check(DataFrame(raw), raw, ascending)

        # inplace sort on the wrapped frame
        inplace_df = DataFrame(raw)
        inplace_df.sort_index(inplace=True)
        got = execute(inplace_df)
        frame_eq(got, raw.sort_index())

        check(DataFrame(raw, chunk_size=30), raw, ascending)
        check(DataFrame(raw, chunk_size=20), raw, descending)

        session_executor = ExecutorForTest(storage=new_session().context)

        wrapped = DataFrame(raw, chunk_size=10)
        got = execute(wrapped.sort_index(ignore_index=True), session_executor)
        try:  # for python3.5
            want = raw.sort_index(ignore_index=True)
        except TypeError:
            # pandas rejected ``ignore_index``; renumber the rows by hand.
            want = raw.sort_index()
            want.index = pd.RangeIndex(len(want))
        frame_eq(got, want)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

        check(DataFrame(raw), raw, lambda d: d.sort_index(axis=1))
        check(DataFrame(raw, chunk_size=3), raw,
              lambda d: d.sort_index(axis=1))
        check(DataFrame(raw, chunk_size=4), raw,
              lambda d: d.sort_index(axis=1, ascending=False))

        wrapped = DataFrame(raw, chunk_size=4)
        session_executor = ExecutorForTest(storage=new_session().context)

        got = execute(wrapped.sort_index(axis=1, ignore_index=True),
                      session_executor)
        try:  # for python3.5
            want = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            # NOTE(review): this fallback relabels the row index although
            # the sort is along axis=1 -- confirm that is the intended
            # reference behaviour.
            want = raw.sort_index(axis=1)
            want.index = pd.RangeIndex(len(want))
        frame_eq(got, want)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series_cases = (
            ({}, ascending),
            ({'chunk_size': 2}, ascending),
            ({'chunk_size': 3}, descending),
        )
        for chunk_kwargs, op in series_cases:
            check(Series(raw, **chunk_kwargs), raw, op, compare=series_eq)
Exemplo n.º 22
0
def test_to_csv_execution(setup):
    """Write wrapped frames/series with ``to_csv`` and read them back.

    For both a DataFrame and a Series the data is written once to a single
    file and once to an ``out-*.csv`` pattern; the files are then loaded
    with pandas and compared with the source data. A filtered DataFrame is
    also written to a single file.

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment -- confirm in conftest.
    """
    labels = pd.RangeIndex(100, 0, -1, name='index')
    raw = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100)
        },
        index=labels)
    df = DataFrame(raw, chunk_size=33)

    def load(csv_path):
        return pd.read_csv(csv_path, dtype=raw.dtypes.to_dict())

    def check_single_and_multi(tileable, wanted, base_path):
        # one output file
        single_path = os.path.join(base_path, 'out.csv')
        tileable.to_csv(single_path).execute()
        loaded = load(single_path).set_index('index')
        pd.testing.assert_frame_equal(loaded, wanted)

        # one output file per chunk; four files are read back
        pattern = os.path.join(base_path, 'out-*.csv')
        tileable.to_csv(pattern).execute()
        parts = [
            load(os.path.join(base_path, f'out-{i}.csv')) for i in range(4)
        ]
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, wanted)
        # the second file must hold exactly rows 33..65 of the source
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      wanted.iloc[33:66])

    with tempfile.TemporaryDirectory() as base_path:
        # DATAFRAME TESTS
        check_single_and_multi(df, raw, base_path)

        # test df with unknown shape (the filter keeps every row, since
        # col1 comes from np.random.rand and is always below 1)
        filtered = DataFrame(raw, chunk_size=(50, 2))
        filtered = filtered[filtered['col1'] < 1]
        filtered_path = os.path.join(base_path, 'out2.csv')
        filtered.to_csv(filtered_path).execute()

        loaded = load(filtered_path).set_index('index')
        pd.testing.assert_frame_equal(loaded, raw)

        # SERIES TESTS
        series = md.Series(raw.col1, chunk_size=33)
        check_single_and_multi(series, raw.col1.to_frame(), base_path)
Exemplo n.º 23
0
    def testSortValuesExecution(self):
        """Compare ``sort_values`` with pandas for each PSRS_DISTINCT_COL value.

        The environment variable ``PSRS_DISTINCT_COL`` is set to ``'0'`` and,
        except on Windows, also to ``'1'``; the full set of checks runs once
        per value. The variable's previous state is restored afterwards so
        the setting does not leak into other tests in the same process.

        Covered per run: several chunk sizes, descending order, MultiIndex
        columns, mixed column types, NaN values, ``ignore_index``,
        ``inplace=True``, a filtered frame, and Series.
        """
        def execute(tileable, executor=None):
            runner = self.executor if executor is None else executor
            return runner.execute_dataframe(tileable, concat=True)[0]

        def check_frame(wrapped, plain, op):
            # Execute the wrapped call first, then build the reference.
            got = execute(op(wrapped))
            pd.testing.assert_frame_equal(got, op(plain))

        def check_series(wrapped, plain, op):
            got = execute(op(wrapped))
            pd.testing.assert_series_equal(got, op(plain))

        def run_all_checks():
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            whole = DataFrame(df)
            check_frame(whole, df, lambda d: d.sort_values('a0'))
            check_frame(
                whole, df,
                lambda d: d.sort_values(['a6', 'a7'], ascending=False))

            # test psrs (chunk_size=10), including ascending=False
            by_ten = DataFrame(df, chunk_size=10)
            for op in (
                    lambda d: d.sort_values('a0'),
                    lambda d: d.sort_values(['a3', 'a4']),
                    lambda d: d.sort_values(['a0', 'a1'], ascending=False),
                    lambda d: d.sort_values(['a7'], ascending=False)):
                check_frame(by_ten, df, op)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            check_frame(DataFrame(df2, chunk_size=10), df2,
                        lambda d: d.sort_values([('A', 'C')]))

            # test rechunk (chunk_size=3)
            by_three = DataFrame(df, chunk_size=3)
            for op in (
                    lambda d: d.sort_values('a0'),
                    lambda d: d.sort_values(['a3', 'a4'])):
                check_frame(by_three, df, op)

            # test other types
            typed_raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                }, )
            typed = DataFrame(typed_raw, chunk_size=3)
            for label in typed_raw.columns:
                check_frame(typed, typed_raw,
                            lambda d, label=label: d.sort_values(label))
            check_frame(
                typed, typed_raw,
                lambda d: d.sort_values(['a', 'b', 'e'], ascending=False))

            # test nan
            nan_raw = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            check_frame(DataFrame(nan_raw), nan_raw,
                        lambda d: d.sort_values(['col2']))
            check_frame(DataFrame(nan_raw, chunk_size=3), nan_raw,
                        lambda d: d.sort_values(['col2']))

            # test ignore_index (uses an executor backed by a new session)
            session_executor = ExecutorForTest(storage=new_session().context)

            small = pd.DataFrame(np.random.rand(10, 3),
                                 columns=['a' + str(i) for i in range(3)])
            small_chunked = DataFrame(small, chunk_size=3)
            got = execute(
                small_chunked.sort_values(['a0', 'a1'], ignore_index=True),
                session_executor)
            try:  # for python3.5
                want = small.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # pandas rejected ``ignore_index``; renumber rows by hand.
                want = small.sort_values(['a0', 'a1'])
                want.index = pd.RangeIndex(len(want))
            pd.testing.assert_frame_equal(got, want)

            # test inplace
            inplace_df = DataFrame(small)
            inplace_df.sort_values('a0', inplace=True)
            got = execute(inplace_df)
            small.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(got, small)

            # test unknown shape
            source = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            wrapped = DataFrame(source, chunk_size=4)
            filtered = wrapped[wrapped['a'] > 2]
            got = execute(filtered.sort_values(by='b'))
            pd.testing.assert_frame_equal(
                got, source[source['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw_series = pd.Series(np.random.rand(10))
            series_cases = (
                ({}, lambda s: s.sort_values()),
                ({'chunk_size': 3}, lambda s: s.sort_values()),
                ({'chunk_size': 2},
                 lambda s: s.sort_values(ascending=False)),
            )
            for chunk_kwargs, op in series_cases:
                check_series(Series(raw_series, **chunk_kwargs),
                             raw_series, op)

        # Only '0' is exercised when sys.platform starts with 'win'.
        if sys.platform.lower().startswith('win'):
            distinct_opts = ['0']
        else:
            distinct_opts = ['0', '1']

        env_key = 'PSRS_DISTINCT_COL'
        saved_value = os.environ.get(env_key)
        try:
            for add_distinct in distinct_opts:
                os.environ[env_key] = add_distinct
                run_all_checks()
        finally:
            # Put the environment back exactly as it was found, even when an
            # assertion above fails.
            if saved_value is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = saved_value
Exemplo n.º 24
0
    def testDataFrameQuantileExecution(self):
        """Compare ``DataFrame.quantile`` results with pandas on the same data.

        Exercised variants: default ``q`` on both axes (Series results),
        list-valued ``q`` on both axes and with ``interpolation='midpoint'``
        (DataFrame results), a tensor-valued ``q`` run inside a test context,
        and ``numeric_only=False`` on a frame that has a timestamp column.
        """
        columns = {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)],
        }
        pdf = pd.DataFrame(columns, index=pd.RangeIndex(1, 11))
        mdf = DataFrame(pdf, chunk_size=3)

        # kept as a tuple; a fresh list is built for every call that needs it
        fractions = (0.3, 0.7)

        def run(lazy):
            # evaluate with the shared test executor and take the first item
            return self.executor.execute_dataframe(lazy, concat=True)[0]

        # default q: axis=0 first, then axis=1 -- both compared as Series
        for kwargs in ({}, {'axis': 1}):
            actual = run(mdf.quantile(**kwargs))
            pd.testing.assert_series_equal(actual, pdf.quantile(**kwargs))

        # q given as a list: axis=0, axis=1, then a non-default interpolation
        # -- all compared as DataFrames
        for kwargs in ({}, {'axis': 1}, {'interpolation': 'midpoint'}):
            actual = run(mdf.quantile(list(fractions), **kwargs))
            pd.testing.assert_frame_equal(
                actual, pdf.quantile(list(fractions), **kwargs))

        # q given as a tensor, executed through the context-bound executor
        ctx, ctx_executor = self._create_test_context(self.executor)
        with ctx:
            q_tensor = tensor(list(fractions))
            lazy = mdf.quantile(q_tensor)
            actual = ctx_executor.execute_dataframes([lazy])[0]
            pd.testing.assert_frame_equal(
                actual, pdf.quantile(list(fractions)))

        # numeric_only=False on a frame mixing numeric and timestamp columns
        pdf2 = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp(f'201{i}') for i in range(10)],
            },
            index=pd.RangeIndex(1, 11))
        mdf2 = DataFrame(pdf2, chunk_size=3)

        actual = run(mdf2.quantile(list(fractions), numeric_only=False))
        pd.testing.assert_frame_equal(
            actual, pdf2.quantile(list(fractions), numeric_only=False))

        actual = run(mdf2.quantile(numeric_only=False))
        pd.testing.assert_series_equal(
            actual, pdf2.quantile(numeric_only=False))