def parquet_writer_test_rowgroup_index_compression(
    pdf, compression, row_group_size
):
    """Write ``pdf`` to Parquet via pandas and cudf, then cross-read and compare.

    Both writers receive the same ``compression`` and ``row_group_size``.
    Output goes to the fixed relative paths "cpu_pdf.parquet" (pandas) and
    "gpu_pdf.parquet" (cudf); this function does not delete them.

    Two comparisons are handed to ``compare_dataframe``:
    1. cudf reading the cudf-written file vs. pandas reading the
       pandas-written file.
    2. cudf reading the pandas-written file vs. pandas reading the
       cudf-written file, with ``nullable=False``.
    """
    cpu_path = "cpu_pdf.parquet"
    gpu_path = "gpu_pdf.parquet"
    writer_options = {
        "compression": compression,
        "row_group_size": row_group_size,
    }

    gpu_frame = cudf.from_pandas(pdf)
    pdf.to_parquet(cpu_path, **writer_options)
    gpu_frame.to_parquet(gpu_path, **writer_options)

    # Each library reads back the file it produced itself.
    compare_dataframe(cudf.read_parquet(gpu_path), pd.read_parquet(cpu_path))

    # Each library reads the file produced by the other one.
    compare_dataframe(
        cudf.read_parquet(cpu_path),
        pd.read_parquet(gpu_path),
        nullable=False,
    )
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
    """Read an ORC buffer with cudf and compare against the source frame.

    ``input_tuple`` unpacks to ``(pandas DataFrame, ORC bytes)``. The
    expected frame is built from the pandas DataFrame by applying the same
    row/column selection that is passed to ``cudf.read_orc``.

    Note that the incoming ``skiprows`` value is currently ignored: it is
    overwritten with 0 (see the TODO below).
    """
    # TODO: Stop forcing skiprows to 0 once the following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6563
    skiprows = 0

    source_pdf, orc_bytes = input_tuple

    expected = source_pdf.iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected.reset_index(drop=True, inplace=True)
    if columns is not None:
        expected = expected[columns]
    if use_index is False:
        expected.reset_index(drop=True, inplace=True)

    actual = cudf.read_orc(
        io.BytesIO(orc_bytes),
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
        use_index=use_index,
    )
    compare_dataframe(expected, actual)
def avro_reader_test(input_tuple, columns, skiprows, num_rows):
    """Read an Avro buffer with cudf and compare against the source frame.

    ``input_tuple`` unpacks to ``(pandas DataFrame, buffer)``; the buffer is
    passed to ``cudf.read_avro``. The expected frame is the pandas DataFrame
    sliced from ``skiprows`` and truncated to ``num_rows`` (when given), with
    its index reset if either of those is not ``None``.

    ``columns`` is only forwarded to the reader; the expected frame is not
    column-filtered here.
    """
    pdf, avro_buffer = input_tuple

    expected = pdf[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected = expected.reset_index(drop=True)

    actual = cudf.read_avro(
        avro_buffer,
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
    )
    compare_dataframe(expected, actual)
def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata):
    """Read ``parquet_buffer`` with pandas and cudf using identical options.

    ``columns`` and ``use_pandas_metadata`` are forwarded unchanged to both
    ``pd.read_parquet`` and ``cudf.read_parquet``; the two results are then
    passed to ``compare_dataframe`` (cudf result first).
    """
    reader_options = {
        "columns": columns,
        "use_pandas_metadata": use_pandas_metadata,
    }
    # pandas reads first, then cudf, matching the order the buffer is consumed.
    pandas_result = pd.read_parquet(parquet_buffer, **reader_options)
    cudf_result = cudf.read_parquet(parquet_buffer, **reader_options)
    compare_dataframe(cudf_result, pandas_result)
def orc_writer_test(pdf, compression, enable_statistics):
    """Round-trip ``pdf`` through cudf's ORC writer and reader in memory.

    The frame is converted with ``cudf.from_pandas``, written to an
    ``io.BytesIO`` using the given ``compression`` and ``enable_statistics``,
    read back with ``cudf.read_orc``, and compared to the original ``pdf``
    via ``compare_dataframe``.
    """
    orc_buffer = io.BytesIO()
    cudf.from_pandas(pdf).to_orc(
        orc_buffer,
        compression=compression,
        enable_statistics=enable_statistics,
    )

    # Rewind so the reader starts at the beginning of what was just written.
    orc_buffer.seek(0)
    round_tripped = cudf.read_orc(orc_buffer)
    compare_dataframe(pdf, round_tripped)
def orc_reader_stripes_test(input_tuple, columns, stripes):
    """Compare cudf's stripe-selective ORC read against ``orc_to_pandas``.

    Only the second element of ``input_tuple`` (the ORC bytes) is used. The
    expected frame comes from ``orc_to_pandas`` with the same ``stripes``;
    the actual frame comes from ``cudf.read_orc`` with ``columns`` and
    ``stripes``.
    """
    _unused_pdf, orc_bytes = input_tuple

    expected = orc_to_pandas(
        file_io_obj=io.BytesIO(orc_bytes),
        stripes=stripes,
    )

    # The ORC reader only applies a column selection when `columns`
    # actually contains elements, so mirror that on the expected side.
    wants_column_subset = columns is not None and len(columns) > 0
    if wants_column_subset:
        expected = expected[columns]

    actual = cudf.read_orc(
        io.BytesIO(orc_bytes),
        columns=columns,
        stripes=stripes,
    )
    compare_dataframe(expected, actual)
def parquet_writer_test(pdf):
    """Write ``pdf`` to Parquet via pandas and cudf, then cross-read and compare.

    Output goes to the fixed relative paths "cpu_pdf.parquet" (pandas) and
    "gpu_pdf.parquet" (cudf); this function does not delete them.

    ``compare_dataframe`` is called twice: once with each library reading
    the file it wrote itself, and once with each library reading the file
    written by the other.
    """
    cpu_path = "cpu_pdf.parquet"
    gpu_path = "gpu_pdf.parquet"

    gpu_frame = cudf.from_pandas(pdf)
    pdf.to_parquet(cpu_path)
    gpu_frame.to_parquet(gpu_path)

    # Each library reads back the file it produced itself.
    compare_dataframe(cudf.read_parquet(gpu_path), pd.read_parquet(cpu_path))

    # Each library reads the file produced by the other one.
    compare_dataframe(cudf.read_parquet(cpu_path), pd.read_parquet(gpu_path))
def parquet_reader_columns(
    parquet_buffer, columns, use_pandas_metadata, skiprows, num_rows
):
    """Compare cudf's row-limited Parquet read against a sliced pandas read.

    pandas reads ``parquet_buffer`` with ``columns`` and
    ``use_pandas_metadata``; the result is then sliced from ``skiprows`` and
    truncated to ``num_rows`` (when not ``None``). cudf receives all four
    options directly in ``cudf.read_parquet``. The two frames are passed to
    ``compare_dataframe`` (cudf result first).
    """
    expected = pd.read_parquet(
        parquet_buffer,
        columns=columns,
        use_pandas_metadata=use_pandas_metadata,
    ).iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)

    actual = cudf.read_parquet(
        parquet_buffer,
        columns=columns,
        use_pandas_metadata=use_pandas_metadata,
        skiprows=skiprows,
        num_rows=num_rows,
    )
    compare_dataframe(actual, expected)
def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
    """Read an ORC buffer with cudf and compare against the source frame.

    ``input_tuple`` unpacks to ``(pandas DataFrame, ORC bytes)``. The
    expected frame is derived from the pandas DataFrame by slicing from
    ``skiprows``, truncating to ``num_rows`` (when given), and selecting
    ``columns`` (when non-empty). The same options, plus ``use_index``, are
    passed to ``cudf.read_orc``.
    """
    source_pdf, orc_bytes = input_tuple

    expected = source_pdf.iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected.reset_index(drop=True, inplace=True)

    # The ORC reader only applies a column selection when `columns`
    # actually contains elements, so mirror that on the expected side.
    wants_column_subset = columns is not None and len(columns) > 0
    if wants_column_subset:
        expected = expected[columns]

    if use_index is False:
        expected.reset_index(drop=True, inplace=True)

    actual = cudf.read_orc(
        io.BytesIO(orc_bytes),
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
        use_index=use_index,
    )
    compare_dataframe(expected, actual)
def parquet_reader_test(parquet_buffer):
    """Read ``parquet_buffer`` with pandas and cudf and compare the results.

    Both readers are called with default options; the frames are passed to
    ``compare_dataframe`` with the cudf result first.
    """
    # pandas reads first, then cudf, matching the order the buffer is consumed.
    pandas_result = pd.read_parquet(parquet_buffer)
    cudf_result = cudf.read_parquet(parquet_buffer)
    compare_dataframe(cudf_result, pandas_result)