def test_no_data(tmpdir):
    filepath = tmpdir + "no_data.avro"
    schema = {
        "name": "Weather",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    parsed_schema = fa.parse_schema(schema)
    with open(filepath, "wb") as out:
        fa.writer(out, parsed_schema, [])

    df = cudf.read_avro(filepath)

    # fastavro returns an empty dataframe here, so verify the shape,
    # dtypes, and column names manually
    assert_eq(df.shape, (0, 3))
    dtypes = df.dtypes.values.tolist()
    assert_eq(dtypes, [np.dtype("O"), np.dtype("int64"), np.dtype("int32")])
    col_names = df.columns.tolist()
    assert_eq(col_names, ["station", "time", "temp"])

def cudf_from_avro_util(schema, records):
    schema = [] if schema is None else fastavro.parse_schema(schema)
    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records)
    buffer.seek(0)
    return cudf.read_avro(buffer)

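# A minimal usage sketch of the helper above (not part of the original
# suite): round-trip two records and compare against pandas. The "Example"
# schema, its field names, and the record values are illustrative
# assumptions; check_dtype=False because avro "int" decodes as int32 in
# cuDF while pandas infers int64.
def test_cudf_from_avro_util_roundtrip():
    schema = {
        "name": "Example",
        "type": "record",
        "fields": [
            {"name": "id", "type": "int"},
            {"name": "label", "type": "string"},
        ],
    }
    records = [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}]
    got = cudf_from_avro_util(schema, records)
    expect = pd.DataFrame.from_records(records)
    assert_eq(expect, got, check_dtype=False)
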
def test_empty_dataframe(tmpdir):
    filepath = tmpdir + "empty.avro"

    # write an avro file with no schema fields and no records
    with open(filepath, "wb") as out:
        fa.writer(out, [], [])

    df = cudf.read_avro(filepath)

    assert_eq(df, cudf.DataFrame())

def avro_reader_test(input_tuple, columns, skiprows, num_rows):
    pdf, avro_buffer = input_tuple
    expected_pdf = pdf[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)
    if skiprows is not None or num_rows is not None:
        expected_pdf = expected_pdf.reset_index(drop=True)
    gdf = cudf.read_avro(
        avro_buffer, columns=columns, skiprows=skiprows, num_rows=num_rows
    )
    compare_dataframe(expected_pdf, gdf)

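# A hedged sketch of how avro_reader_test might be driven via pytest
# parametrization; the avro_records fixture name and the parameter grids
# below are assumptions, not part of the original suite.
@pytest.mark.parametrize("columns", [None, ["0"]])
@pytest.mark.parametrize("skiprows", [None, 1])
@pytest.mark.parametrize("num_rows", [None, 10])
def test_avro_reader_params(avro_records, columns, skiprows, num_rows):
    avro_reader_test(avro_records, columns, skiprows, num_rows)
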
def test_avro_compression(rows, codec):
    schema = {
        "name": "root",
        "type": "record",
        "fields": [
            {"name": "0", "type": "int"},
            {"name": "1", "type": "string"},
        ],
    }
    df = rand_dataframe(
        [
            {"dtype": "int32", "null_frequency": 0, "cardinality": 1000},
            {
                "dtype": "str",
                "null_frequency": 0,
                "cardinality": 100,
                "max_string_length": 10,
            },
        ],
        rows,
    )
    expected_df = cudf.DataFrame.from_arrow(df)

    records = df.to_pandas().to_dict(orient="records")

    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records, codec=codec)
    buffer.seek(0)

    got_df = cudf.read_avro(buffer)

    assert_eq(expected_df, got_df)

def test_read_avro(datadir, hdfs, test_url):
    fname = datadir / "avro" / "example.avro"

    # Read from local file system as a buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())

    # Write to hdfs; host, port, and basedir are module-level test settings
    hdfs.upload(basedir + "/file.avro", buffer)
    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)

    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    # align pandas dtypes with the dtypes cuDF decoded before comparing
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got)

def test_avro_reader_basic(datadir, inputfile, columns, engine):
    path = datadir / inputfile
    try:
        reader = fa.reader(open(path, "rb"))
    except FileNotFoundError:
        pytest.skip(".avro file is not found")

    expect = pd.DataFrame.from_records(reader)
    got = cudf.read_avro(path, engine=engine, columns=columns)

    # pandas uses NaN to represent invalid data, which forces a float
    # dtype, and fastavro decodes avro int32 columns as int64, so cast
    # each column back to the dtype cuDF produced before comparing
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got, check_categorical=False)
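
# A small illustrative check (an assumption, not in the original suite):
# cudf.read_avro's columns= argument should restrict the result to the
# requested columns. The schema and record values are made up for the
# example.
def test_avro_reader_columns_subset(tmpdir):
    filepath = tmpdir + "subset.avro"
    schema = fa.parse_schema(
        {
            "name": "Weather",
            "type": "record",
            "fields": [
                {"name": "station", "type": "string"},
                {"name": "temp", "type": "int"},
            ],
        }
    )
    records = [{"station": "011990-99999", "temp": 22}]
    with open(filepath, "wb") as out:
        fa.writer(out, schema, records)

    df = cudf.read_avro(filepath, columns=["temp"])
    assert_eq(df.columns.tolist(), ["temp"])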