def test_no_data(tmpdir):
    filepath = tmpdir + "no_data.avro"
    schema = {
        "name": "Weather",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    parsed_schema = fa.parse_schema(schema)
    with open(filepath, "wb") as out:
        fa.writer(out, parsed_schema, [])

    df = cudf.read_avro(filepath)

    # fastavro returns an empty dataframe here, so verify the shape,
    # dtypes, and column names manually
    assert_eq(df.shape, (0, 3))
    dtypes = df.dtypes.values.tolist()
    assert_eq(dtypes, [np.dtype("O"), np.dtype("int64"), np.dtype("int32")])
    col_names = df.columns.tolist()
    assert_eq(col_names, ["station", "time", "temp"])

def cudf_from_avro_util(schema, records):
    schema = [] if schema is None else fastavro.parse_schema(schema)
    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records)
    buffer.seek(0)
    return cudf.read_avro(buffer)

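# A minimal usage sketch of the helper above (not part of the original
# suite): round-trip two records and compare against pandas. The "Example"
# schema, its field names, and the record values are illustrative
# assumptions; check_dtype=False because avro "int" decodes as int32 in
# cuDF while pandas infers int64.
def test_cudf_from_avro_util_roundtrip():
    schema = {
        "name": "Example",
        "type": "record",
        "fields": [
            {"name": "id", "type": "int"},
            {"name": "label", "type": "string"},
        ],
    }
    records = [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}]
    got = cudf_from_avro_util(schema, records)
    expect = pd.DataFrame.from_records(records)
    assert_eq(expect, got, check_dtype=False)
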
def test_empty_dataframe(tmpdir):
    filepath = tmpdir + "empty.avro"

    # write an avro file with no schema fields and no records
    with open(filepath, "wb") as out:
        fa.writer(out, [], [])

    df = cudf.read_avro(filepath)

    assert_eq(df, cudf.DataFrame())

def avro_reader_test(input_tuple, columns, skiprows, num_rows):
    pdf, avro_buffer = input_tuple
    expected_pdf = pdf[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)
    if skiprows is not None or num_rows is not None:
        expected_pdf = expected_pdf.reset_index(drop=True)
    gdf = cudf.read_avro(
        avro_buffer, columns=columns, skiprows=skiprows, num_rows=num_rows
    )
    compare_dataframe(expected_pdf, gdf)

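# A hedged sketch of how avro_reader_test might be driven via pytest
# parametrization; the avro_records fixture name and the parameter grids
# below are assumptions, not part of the original suite.
@pytest.mark.parametrize("columns", [None, ["0"]])
@pytest.mark.parametrize("skiprows", [None, 1])
@pytest.mark.parametrize("num_rows", [None, 10])
def test_avro_reader_params(avro_records, columns, skiprows, num_rows):
    avro_reader_test(avro_records, columns, skiprows, num_rows)
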
def test_avro_compression(rows, codec):
    schema = {
        "name": "root",
        "type": "record",
        "fields": [
            {"name": "0", "type": "int"},
            {"name": "1", "type": "string"},
        ],
    }
    df = rand_dataframe(
        [
            {"dtype": "int32", "null_frequency": 0, "cardinality": 1000},
            {
                "dtype": "str",
                "null_frequency": 0,
                "cardinality": 100,
                "max_string_length": 10,
            },
        ],
        rows,
    )
    expected_df = cudf.DataFrame.from_arrow(df)

    records = df.to_pandas().to_dict(orient="records")

    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records, codec=codec)
    buffer.seek(0)

    got_df = cudf.read_avro(buffer)

    assert_eq(expected_df, got_df)

def test_read_avro(datadir, hdfs, test_url):
    fname = datadir / "avro" / "example.avro"

    # Read from local file system as a buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())

    # Write to hdfs; host, port, and basedir are module-level test settings
    hdfs.upload(basedir + "/file.avro", buffer)
    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)

    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    # align pandas dtypes with the dtypes cuDF decoded before comparing
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got)

def test_avro_reader_basic(datadir, inputfile, columns, engine):
    path = datadir / inputfile
    try:
        reader = fa.reader(open(path, "rb"))
    except FileNotFoundError:
        pytest.skip(".avro file is not found")

    expect = pd.DataFrame.from_records(reader)
    got = cudf.read_avro(path, engine=engine, columns=columns)

    # pandas uses NaN to represent invalid data, which forces a float
    # dtype, and fastavro decodes avro int32 columns as int64, so cast
    # each column back to the dtype cuDF produced before comparing
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got, check_categorical=False)
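
# A small illustrative check (an assumption, not in the original suite):
# cudf.read_avro's columns= argument should restrict the result to the
# requested columns. The schema and record values are made up for the
# example.
def test_avro_reader_columns_subset(tmpdir):
    filepath = tmpdir + "subset.avro"
    schema = fa.parse_schema(
        {
            "name": "Weather",
            "type": "record",
            "fields": [
                {"name": "station", "type": "string"},
                {"name": "temp", "type": "int"},
            ],
        }
    )
    records = [{"station": "011990-99999", "temp": 22}]
    with open(filepath, "wb") as out:
        fa.writer(out, schema, records)

    df = cudf.read_avro(filepath, columns=["temp"])
    assert_eq(df.columns.tolist(), ["temp"])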