예제 #1
0
def orc_type(field):
    """Return the pyorc type object that corresponds to ``field``.

    ``field`` is handed directly to the ``pa.types.is_*`` predicates
    (presumably a ``pyarrow.DataType`` -- confirm against callers).
    Decimals carry over their ``precision`` and ``scale``; lists are
    converted recursively through their ``value_type``.

    Raises:
        ValueError: if none of the handled predicates accepts ``field``.
    """
    checks = pa.types

    # Boolean / integer / floating types: each maps to a pyorc type that
    # is constructed without arguments.
    numeric_like = (
        (checks.is_boolean, pyorc.Boolean),
        (checks.is_int8, pyorc.TinyInt),
        (checks.is_int16, pyorc.SmallInt),
        (checks.is_int32, pyorc.Int),
        (checks.is_int64, pyorc.BigInt),
        (checks.is_float32, pyorc.Float),
        (checks.is_float64, pyorc.Double),
    )
    for matches, make in numeric_like:
        if matches(field):
            return make()

    # Parameterised / nested types need information from the input type.
    if checks.is_decimal(field):
        return pyorc.Decimal(field.precision, field.scale)
    if checks.is_list(field):
        return pyorc.Array(orc_type(field.value_type))

    # Remaining argument-less mappings, tested after the nested types to
    # keep the same evaluation order as before.
    temporal_and_bytes = (
        (checks.is_timestamp, pyorc.Timestamp),
        (checks.is_date, pyorc.Date),
        (checks.is_binary, pyorc.Binary),
        (checks.is_string, pyorc.String),
    )
    for matches, make in temporal_and_bytes:
        if matches(field):
            return make()

    raise ValueError('Cannot Convert %s' % field)
예제 #2
0
def test_empty_statistics():
    """Check the ORC statistics reported for columns holding only nulls.

    A single row is written with pyorc in which every column except the
    last one is null. The statistics are then read back through
    ``cudf.io.orc.read_orc_statistics`` and the first entry of every
    returned statistics group is checked column by column.
    """
    column_types = {
        "a": po.BigInt(),
        "b": po.Double(),
        "c": po.String(),
        "d": po.Decimal(11, 2),
        "e": po.Date(),
        "f": po.Timestamp(),
        "g": po.Boolean(),
        "h": po.Binary(),
        # One column with non null value, else cudf/pyorc readers crash
        "i": po.BigInt(),
    }
    orc_schema = po.Struct(**column_types)

    # Every column is null except the trailing one, which holds 1.
    null_columns = len(orc_schema.fields) - 1
    row = (None,) * null_columns + (1,)

    buff = BytesIO()
    with po.Writer(buff, orc_schema) as writer:
        writer.write(row)

    got = cudf.io.orc.read_orc_statistics([buff])

    # Check for both file and stripe stats
    for stats in got:
        first = stats[0]

        # Similar expected stats for the first 6 columns in this case
        for col_name in ascii_lowercase[:6]:
            col_stats = first[col_name]
            assert col_stats.get("number_of_values") == 0
            assert col_stats.get("has_null") is True
            assert col_stats.get("minimum") is None
            assert col_stats.get("maximum") is None
        for col_name in ascii_lowercase[:3]:
            assert first[col_name].get("sum") == 0
        # Sum for decimal column is a string
        assert first["d"].get("sum") == "0"

        bool_stats = first["g"]
        assert bool_stats.get("number_of_values") == 0
        assert bool_stats.get("has_null") is True
        assert bool_stats.get("true_count") == 0
        assert bool_stats.get("false_count") == 0

        binary_stats = first["h"]
        assert binary_stats.get("number_of_values") == 0
        assert binary_stats.get("has_null") is True
        assert binary_stats.get("sum") == 0

        # The only column that actually holds a value.
        filled_stats = first["i"]
        assert filled_stats.get("number_of_values") == 1
        assert filled_stats.get("has_null") is False
        assert filled_stats.get("minimum") == 1
        assert filled_stats.get("maximum") == 1
        assert filled_stats.get("sum") == 1
예제 #3
0
파일: utils.py 프로젝트: TravisHester/cudf
    cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
}

# Lookup table from dtype objects to the pyorc type instance paired with them.
# Keys are either the result of ``cudf.dtype(...)`` or a pandas nullable
# extension dtype; both flavours of a given width map to the same pyorc type.
# Only the dtypes listed here are covered (e.g. no unsigned integers).
PANDAS_TO_ORC_TYPES = {
    cudf.dtype("int8"): pyorc.TinyInt(),
    # pandas nullable extension dtypes
    pd.Int8Dtype(): pyorc.TinyInt(),
    pd.Int16Dtype(): pyorc.SmallInt(),
    pd.Int32Dtype(): pyorc.Int(),
    pd.Int64Dtype(): pyorc.BigInt(),
    pd.BooleanDtype(): pyorc.Boolean(),
    cudf.dtype("bool_"): pyorc.Boolean(),
    cudf.dtype("int16"): pyorc.SmallInt(),
    cudf.dtype("int32"): pyorc.Int(),
    cudf.dtype("int64"): pyorc.BigInt(),
    # object and pandas string dtypes are both written as ORC strings
    cudf.dtype("O"): pyorc.String(),
    pd.StringDtype(): pyorc.String(),
    cudf.dtype("float32"): pyorc.Float(),
    cudf.dtype("float64"): pyorc.Double(),
    # the ns, ms and us datetime64 resolutions all share pyorc.Timestamp()
    cudf.dtype("<M8[ns]"): pyorc.Timestamp(),
    cudf.dtype("<M8[ms]"): pyorc.Timestamp(),
    cudf.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
    pyorc.TinyInt().name: pd.Int8Dtype(),
    pyorc.Int().name: pd.Int32Dtype(),
    pyorc.Boolean().name: pd.BooleanDtype(),
    pyorc.SmallInt().name: pd.Int16Dtype(),
    pyorc.BigInt().name: pd.Int64Dtype(),
    pyorc.String().name: cudf.dtype("O"),
예제 #4
0
        "logicalType": "timestamp-micros"
    },
}

# Lookup table from dtype objects to the pyorc type instance paired with them.
# Keys are either ``np.dtype(...)`` objects or pandas nullable extension
# dtypes; both flavours of a given width map to the same pyorc type.
# Only the dtypes listed here are covered (e.g. no unsigned integers).
PANDAS_TO_ORC_TYPES = {
    np.dtype("int8"): pyorc.TinyInt(),
    # pandas nullable extension dtypes
    pd.Int8Dtype(): pyorc.TinyInt(),
    pd.Int16Dtype(): pyorc.SmallInt(),
    pd.Int32Dtype(): pyorc.Int(),
    pd.Int64Dtype(): pyorc.BigInt(),
    pd.BooleanDtype(): pyorc.Boolean(),
    np.dtype("bool_"): pyorc.Boolean(),
    np.dtype("int16"): pyorc.SmallInt(),
    np.dtype("int32"): pyorc.Int(),
    np.dtype("int64"): pyorc.BigInt(),
    # object and pandas string dtypes are both written as ORC strings
    np.dtype("O"): pyorc.String(),
    pd.StringDtype(): pyorc.String(),
    np.dtype("float32"): pyorc.Float(),
    np.dtype("float64"): pyorc.Double(),
    # the ns, ms and us datetime64 resolutions all share pyorc.Timestamp()
    np.dtype("<M8[ns]"): pyorc.Timestamp(),
    np.dtype("<M8[ms]"): pyorc.Timestamp(),
    np.dtype("<M8[us]"): pyorc.Timestamp(),
}


def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []
예제 #5
0
def gen_map_buff(size=10000):
    """Build an in-memory ORC file whose three columns are map types.

    The columns are ``lvl1_map`` (string -> bigint), ``lvl2_map``
    (string -> array of bigint) and ``lvl2_struct_map`` (string -> struct
    with bigint members ``a`` and ``b``). Rows, map values and nested
    values are each randomly replaced by ``None``. Both random sources are
    seeded with 1, so the generated content is reproducible for a given
    ``size``.

    Args:
        size: number of rows to generate.

    Returns:
        The ``BytesIO`` object the ORC data was written to.
    """
    from string import ascii_letters as al

    rd = random.Random(1)
    np.random.seed(seed=1)

    def maybe(value):
        # Randomly swap an already generated value for None. The value is
        # always produced first, so the amount of randomness consumed does
        # not depend on the outcome of the pick.
        return rd.choice([None, value])

    def maybe_int():
        # Nullable integer drawn from numpy's global generator.
        return maybe(np.random.randint(1, 1500))

    def map_rows(make_value):
        # One nullable map per row; a map is a list of two (key, value)
        # pairs whose key is a random ASCII letter.
        return [
            maybe([(rd.choice(al), make_value()) for _ in range(2)])
            for _ in range(size)
        ]

    schema = po.Struct(
        lvl1_map=po.Map(key=po.String(), value=po.BigInt()),
        lvl2_map=po.Map(key=po.String(), value=po.Array(po.BigInt())),
        lvl2_struct_map=po.Map(
            key=po.String(),
            value=po.Struct(a=po.BigInt(), b=po.BigInt()),
        ),
    )

    # The columns must be generated in this order: they share the same
    # random streams, so reordering would change the produced data.
    lvl1_map = map_rows(maybe_int)
    lvl2_map = map_rows(lambda: maybe([maybe_int() for _ in range(5)]))
    lvl2_struct_map = map_rows(lambda: maybe((maybe_int(), maybe_int())))

    pdf = pd.DataFrame({
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    })

    # NOTE(review): a row whose first column is pd.NA is replaced by a
    # one-element tuple; the columns above are built from None/list values,
    # so it is unclear whether this ever triggers -- kept unchanged.
    rows = [
        (None, ) if row[0] is pd.NA else row
        for row in pdf.itertuples(index=False, name=None)
    ]

    buff = BytesIO()
    # The context manager closes the writer even if writing the rows fails.
    with po.Writer(buff,
                   schema,
                   stripe_size=1024,
                   compression=po.CompressionKind.NONE) as writer:
        writer.writerows(rows)

    return buff