def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
    """Round-trip a table written in two chunks with a ``statistics`` setting.

    The reference file ``TestOrcFile.test1.orc`` is read with pyarrow, the
    selected columns are written twice through one ``ORCWriter`` constructed
    with ``statistics=stats_freq``, and the file is read back with pyarrow.
    The result must equal the source frame concatenated with itself.

    Parameters
    ----------
    datadir
        Directory-like object; ``datadir / reference_file`` locates the input.
    tmpdir
        Temporary directory object providing ``join`` for the output path.
    stats_freq
        Value forwarded to ``ORCWriter`` as its ``statistics`` argument.

    Raises
    ------
    Exception
        Any error raised while opening the reference file, other than one
        whose type is named ``ArrowIOError`` (which skips the test).
    """
    reference_file = "TestOrcFile.test1.orc"
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        # The exception is matched by class name, not by class object.
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else is unexpected. Propagate it instead of falling
        # through with ``orcfile`` unbound, which would surface as an
        # unrelated UnboundLocalError and hide the real cause.
        raise

    columns = [
        "boolean1",
        "byte1",
        "short1",
        "int1",
        "long1",
        "float1",
        "double1",
    ]
    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two identical chunks are written, so the expected frame is the source
    # stacked on itself with a fresh index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, statistics=stats_freq)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
    assert_eq(expect, got)
def test_chunked_orc_writer(
    datadir, tmpdir, reference_file, columns, compression
):
    """Round-trip a reference ORC file written in two chunks.

    The reference file is read with pyarrow (restricted to ``columns``),
    written twice through one ``ORCWriter`` constructed with the given
    ``compression``, and read back with pyarrow using the same ``columns``.
    The result must equal the source frame concatenated with itself.

    Parameters
    ----------
    datadir
        Directory-like object; ``datadir / reference_file`` locates the input.
    tmpdir
        Temporary directory object providing ``join`` for the output path.
    reference_file
        Name of the input ORC file inside ``datadir``.
    columns
        Column selection passed to both pyarrow ``read`` calls.
    compression
        Value forwarded to ``ORCWriter`` as its ``compression`` argument.

    Raises
    ------
    Exception
        Any error raised while opening the reference file, other than one
        whose type is named ``ArrowIOError`` (which skips the test).
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        # The exception is matched by class name, not by class object.
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else is unexpected. Propagate it instead of falling
        # through with ``orcfile`` unbound, which would surface as an
        # unrelated UnboundLocalError and hide the real cause.
        raise

    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two identical chunks are written, so the expected frame is the source
    # stacked on itself with a fresh index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, compression=compression)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
    assert_eq(expect, got)
def test_chunked_orc_writer_strings(tmpdir, dtypes):
    """Round-trip a generated frame written in two chunks.

    A 10-row frame is generated with ``cudf.datasets.randomdata`` for the
    given ``dtypes`` (fixed seed), written twice through one ``ORCWriter``,
    and read back with pyarrow. The result must equal the pandas version of
    the frame concatenated with itself.

    Parameters
    ----------
    tmpdir
        Temporary directory object providing ``join`` for the output path.
    dtypes
        Forwarded to ``cudf.datasets.randomdata`` as its ``dtypes`` argument.
    """
    out_path = tmpdir.join("chunked_gdf_strings.orc")

    source_gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    source_pdf = source_gdf.to_pandas()
    # The same chunk is written twice, so expect the frame stacked on itself.
    expected = pd.concat([source_pdf] * 2).reset_index(drop=True)

    orc_writer = ORCWriter(out_path)
    for _ in range(2):
        orc_writer.write_table(source_gdf)
    orc_writer.close()

    roundtrip = pa.orc.ORCFile(out_path).read().to_pandas()
    assert_eq(expected, roundtrip)
def test_chunked_orc_writer_lists():
    """Round-trip list columns written in two chunks to an in-memory buffer.

    Builds a 12345-row pandas frame with two list-valued columns: ``"ls"``
    holds two-element lists of strings, and ``"ld"`` holds five-element lists
    of ``dec(i / 2)`` values. The frame is converted to cudf, written twice
    through one ``ORCWriter`` targeting a ``BytesIO``, and read back with
    pyarrow. The result must equal the input concatenated with itself.
    """
    num_rows = 12345
    string_lists = [[str(n), str(2 * n)] for n in range(num_rows)]
    decimal_lists = [[dec(n / 2)] * 5 for n in range(num_rows)]
    pdf_in = pd.DataFrame({"ls": string_lists, "ld": decimal_lists})

    gdf = cudf.from_pandas(pdf_in)
    # The same chunk is written twice, so expect the frame stacked on itself.
    expected = pd.concat([pdf_in] * 2).reset_index(drop=True)

    sink = BytesIO()
    orc_writer = ORCWriter(sink)
    for _ in range(2):
        orc_writer.write_table(gdf)
    orc_writer.close()

    roundtrip = pa.orc.ORCFile(sink).read().to_pandas()
    assert_eq(expected, roundtrip)