Пример #1
0
def test_to_parquet_fast_parquet_execution():
    """Write a chunked DataFrame to parquet with the fastparquet engine.

    Builds a 100-row frame with float, integer and string columns, wraps it
    in a ``DataFrame`` with ``chunk_size=33`` and executes ``to_parquet``
    with gzip compression into a temporary directory.  Nothing is read back:
    the test passes as long as the write completes without raising.
    """
    n_rows = 100
    source = pd.DataFrame({
        'col1': np.random.rand(n_rows),
        'col2': np.arange(n_rows),
        'col3': np.random.choice(['a', 'b', 'c'], (n_rows, )),
    })
    chunked = DataFrame(source, chunk_size=33)

    with tempfile.TemporaryDirectory() as tmp_dir:
        # test fastparquet
        # NOTE(review): the '*' in the file name is presumably expanded per
        # chunk by to_parquet -- confirm against the project's documentation.
        target = os.path.join(tmp_dir, 'out-fastparquet-*.parquet')
        write_op = chunked.to_parquet(
            target, engine='fastparquet', compression='gzip')
        write_op.execute()
Пример #2
0
    def testToParquetArrowExecution(self):
        """Round-trip a chunked DataFrame through parquet files.

        Steps, all inside one temporary directory:

        1. Write the frame to a wildcard path without naming an engine.
        2. Read it back twice and check each result equals the input once
           sorted by index.
        3. Read the files and write them back to the same wildcard path.
        4. Write with ``partition_cols=['col3']``, read back, and compare
           against the input after sorting both frames by ``col1``.
        """
        row_count = 100
        expected = pd.DataFrame({
            'col1': np.random.rand(row_count),
            'col2': np.arange(row_count),
            'col3': np.random.choice(['a', 'b', 'c'], (row_count, )),
        })
        chunked = DataFrame(expected, chunk_size=33)

        def read_back(location):
            # Load the parquet data at ``location`` and return the single
            # concatenated result produced by the executor.
            loaded = md.read_parquet(location)
            return self.executor.execute_dataframe(loaded, concat=True)[0]

        def ordered_by_col1(frame):
            # Sort by 'col1' and reset the index so frames compare row by row.
            return frame.sort_values('col1').reset_index(drop=True)

        with tempfile.TemporaryDirectory() as tmp_dir:
            # DATAFRAME TESTS
            wildcard_path = os.path.join(tmp_dir, 'out-*.parquet')
            write_op = chunked.to_parquet(wildcard_path)
            self.executor.execute_dataframe(write_op)

            # The read-back check is performed twice on the same files.
            for _ in range(2):
                restored = read_back(wildcard_path)
                restored = restored.sort_index()
                pd.testing.assert_frame_equal(restored, expected)

            # test read_parquet then to_parquet
            reloaded = md.read_parquet(wildcard_path)
            rewrite_op = reloaded.to_parquet(wildcard_path)
            self.executor.execute_dataframes([rewrite_op])

            # test partition_cols
            partitioned_path = os.path.join(tmp_dir, 'out-partitioned')
            partitioned_op = chunked.to_parquet(
                partitioned_path, partition_cols=['col3'])
            self.executor.execute_dataframe(partitioned_op)

            restored = read_back(partitioned_path)
            # NOTE(review): the cast suggests the partition column does not
            # come back with object dtype -- confirm against read_parquet.
            restored['col3'] = restored['col3'].astype('object')
            pd.testing.assert_frame_equal(
                ordered_by_col1(restored), ordered_by_col1(expected))