Пример #1
0
def test_map_from_dicts():
    """Map arrays round-trip when built from lists of {'key':, 'value':} dicts."""
    pairs = [
        [(b'a', 1), (b'b', 2)],
        [(b'c', 3)],
        [(b'd', 4), (b'e', 5), (b'f', None)],
        [(b'g', 7)],
    ]
    data = [[{'key': k, 'value': v} for k, v in entry] for entry in pairs]
    expected = [[(d['key'], d['value']) for d in entry] for entry in data]

    arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
    assert arr.to_pylist() == expected

    # With omitted values
    data[1] = None
    expected[1] = None

    arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
    assert arr.to_pylist() == expected

    # Invalid dictionary
    bad_entries = ([{'value': 5}], [{}], [{'k': 1, 'v': 2}])
    for entry in bad_entries:
        with pytest.raises(ValueError, match="Invalid Map"):
            pa.array([entry], type=pa.map_('i4', 'i4'))

    # Invalid dictionary types
    badly_typed = ([{'key': '1', 'value': 5}], [{'key': {'value': 2}}])
    for entry in badly_typed:
        with pytest.raises(TypeError, match="integer is required"):
            pa.array([entry], type=pa.map_('i4', 'i4'))
Пример #2
0
def pyarrow_datatype_from_dict(json_dict: Dict) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    :raises KeyError: if a required key ("type", "name", ...) is missing
    """
    # Arrow-JSON unit names -> pyarrow alias suffixes.
    unit_aliases = {
        "MICROSECOND": "us",
        "NANOSECOND": "ns",
        "MILLISECOND": "ms",
        "SECOND": "s",
    }
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # NOTE(review): mapping a dictionary-encoded field onto pyarrow.map_
        # (rather than pyarrow.dictionary) looks suspicious; preserved as-is --
        # confirm against the producer of this JSON schema.
        key_type = pyarrow_datatype_from_dict(json_dict["dictionary"]["indexType"])
        value_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.map_(key_type, value_type)
    elif type_class == "list":
        # Single child field describes the element type.
        element_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.list_(element_type)
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class == "int" or type_class == "float" or type_class == "date":
        # e.g. "int" + 32 -> "int32"
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "time":
        type_info = json_dict["type"]
        # Unknown units default to seconds, matching the original fallback.
        unit = unit_aliases.get(type_info["unit"], "s")
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # Bug fix: a present-but-unrecognized "unit" used to leave `unit`
        # unbound and raise UnboundLocalError.  Unknown or absent units now
        # default to nanoseconds (the original behavior for an absent unit).
        unit = unit_aliases.get(type_info.get("unit"), "ns")
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    else:
        # Plain alias such as "bool", "utf8", "binary", ...
        return pyarrow.type_for_alias(type_class)
Пример #3
0
def test_map_from_tuples():
    """Map arrays round-trip when built from lists of (key, value) tuples."""
    expected = [
        [(b'a', 1), (b'b', 2)],
        [(b'c', 3)],
        [(b'd', 4), (b'e', 5), (b'f', None)],
        [(b'g', 7)],
    ]
    map_type = pa.map_(pa.binary(), pa.int32())

    arr = pa.array(expected, type=map_type)
    assert arr.to_pylist() == expected

    # With omitted values
    expected[1] = None

    arr = pa.array(expected, type=map_type)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for entry in ([(5, )], [()], [('5', 'foo', True)]):
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([entry], type=pa.map_('i4', 'i4'))
Пример #4
0
def test_type_schema_pickling():
    """Types, fields and schemas survive a pickle round trip unchanged."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([pa.field('a', 'int8'),
                   pa.field('b', 'string')]),
        pa.union([pa.field('a', pa.int8()),
                  pa.field('b', pa.int16())], pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.int8()),
                  pa.field('b', pa.int16())], pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8()))
    ]

    def roundtrip(obj):
        # Serialize and immediately restore through pickle.
        return pickle.loads(pickle.dumps(obj))

    for val in cases:
        assert val == roundtrip(val)

    # Wrap bare types in fields so they can be assembled into a schema.
    fields = [
        f if isinstance(f, pa.Field) else pa.field('_f{}'.format(i), f)
        for i, f in enumerate(cases)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    assert schema == roundtrip(schema)
Пример #5
0
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    from distutils.version import LooseVersion
    import pyarrow as pa

    # Spark types that map 1:1 onto a parameterless pyarrow constructor.
    simple_types = {
        BooleanType: pa.bool_,
        ByteType: pa.int8,
        ShortType: pa.int16,
        IntegerType: pa.int32,
        LongType: pa.int64,
        FloatType: pa.float32,
        DoubleType: pa.float64,
        StringType: pa.string,
        BinaryType: pa.binary,
        DateType: pa.date32,
        NullType: pa.null,
    }

    spark_type = type(dt)
    if spark_type in simple_types:
        return simple_types[spark_type]()
    if spark_type == DecimalType:
        return pa.decimal128(dt.precision, dt.scale)
    if spark_type == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        return pa.timestamp('us', tz='UTC')
    if spark_type == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        return pa.list_(to_arrow_type(dt.elementType))
    if spark_type == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        return pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType))
    if spark_type == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError("Nested StructType not supported in conversion to Arrow")
        fields = [pa.field(field.name, to_arrow_type(field.dataType),
                           nullable=field.nullable)
                  for field in dt]
        return pa.struct(fields)
    raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
Пример #6
0
def test_map():
    """MapScalar supports len(), as_py(), and positive/negative indexing."""
    items = [('a', 1), ('b', 2)]
    s = pa.scalar(items, type=pa.map_(pa.string(), pa.int8()))

    assert len(s) == 2
    assert isinstance(s, pa.MapScalar)
    assert isinstance(s.values, pa.Array)
    assert repr(s) == "<pyarrow.MapScalar: [('a', 1), ('b', 2)]>"
    assert s.as_py() == items

    # Entries come back as (key scalar, value scalar) pairs.
    last_entry = (pa.scalar('b', type=pa.string()),
                  pa.scalar(2, type=pa.int8()))
    assert s[1] == last_entry
    assert s[-1] == s[1]
    assert s[-2] == s[0]

    # Out-of-range access, in either direction, raises IndexError.
    for bad_index in (-3, 2):
        with pytest.raises(IndexError):
            s[bad_index]
Пример #7
0
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements,too-many-branches
    """Athena to PyArrow data types conversion.

    :param dtype: Athena type string (e.g. "bigint", "decimal(10,2)",
        "array<int>", "map<string, int>").
    :return: the equivalent PyArrow DataType.
    :raises exceptions.UnsupportedType: for unrecognized Athena types.
    """
    # Keep the original string: complex types (array/struct/map) are parsed
    # positionally from it below.  Bug fix: previously this was only assigned
    # when the *original-case* string started with the prefix, so inputs like
    # "ARRAY<int>" raised NameError further down.
    orig_dtype: str = dtype
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    # Bug fix: was `dtype in ("binary" or "varbinary")`, which evaluates to a
    # substring test against "binary" alone and never matched "varbinary".
    if dtype in ("binary", "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal"):
        precision, scale = dtype.replace("decimal(",
                                         "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array"):
        # Strip "array<" and ">" to recurse on the element type.
        return pa.list_(value_type=athena2pyarrow(dtype=orig_dtype[6:-1]),
                        list_size=-1)
    if dtype.startswith("struct"):
        # Each "name:type" member becomes a struct field.
        return pa.struct([(f.split(":",
                                   1)[0], athena2pyarrow(f.split(":", 1)[1]))
                          for f in _split_struct(orig_dtype[7:-1])])
    if dtype.startswith("map"):
        # "map<key, value>" -> [key, value].
        parts: List[str] = _split_map(s=orig_dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
Пример #8
0
def test_read_pandas_map_fields(tempdir):
    # ARROW-10140 - table created from Pandas with mapping fields
    map_values = [
        [('id', 'something'), ('value2', 'else')],
        [('id', 'something2'), ('value', 'else2')],
    ]
    df = pd.DataFrame({'col1': pd.Series(map_values),
                       'col2': pd.Series(['foo', 'bar'])})

    # Explicit schema so col1 is stored as a map rather than a list of tuples.
    udt = pa.map_(pa.string(), pa.string())
    schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
    arrow_table = pa.Table.from_pandas(df, schema)

    filename = tempdir / 'data.parquet'
    _write_table(arrow_table, filename)

    result = pq.read_pandas(filename).to_pandas()
    tm.assert_frame_equal(result, df)
Пример #9
0
def test_to_column_info():
    """to_column_info maps every supported Arrow field to the expected
    name/nullability/precision/scale/type-name record."""
    # Schema covering one field per supported Arrow type class.
    schema = pa.schema([
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array", pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map", pa.map_(pa.int32(),
                                    pa.field("entries", pa.int32()))),
        pa.field(
            "col_struct",
            pa.struct([pa.field("a", pa.int32()),
                       pa.field("b", pa.int32())]),
        ),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ])
    # Expected records, in schema order; precision/scale are type-specific
    # constants (e.g. 2147483647 for varchar, 1073741824 for varbinary).
    assert to_column_info(schema) == (
        {
            "Name": "col_boolean",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "boolean",
        },
        {
            "Name": "col_tinyint",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_smallint",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_int",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_bigint",
            "Nullable": "NULLABLE",
            "Precision": 19,
            "Scale": 0,
            "Type": "bigint",
        },
        {
            "Name": "col_float",
            "Nullable": "NULLABLE",
            "Precision": 17,
            "Scale": 0,
            "Type": "float",
        },
        {
            "Name": "col_double",
            "Nullable": "NULLABLE",
            "Precision": 17,
            "Scale": 0,
            "Type": "double",
        },
        {
            "Name": "col_string",
            "Nullable": "NULLABLE",
            "Precision": 2147483647,
            "Scale": 0,
            "Type": "varchar",
        },
        {
            "Name": "col_varchar",
            "Nullable": "NULLABLE",
            "Precision": 2147483647,
            "Scale": 0,
            "Type": "varchar",
        },
        {
            "Name": "col_timestamp",
            "Nullable": "NULLABLE",
            "Precision": 3,
            "Scale": 0,
            "Type": "timestamp",
        },
        {
            "Name": "col_date",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "date",
        },
        {
            "Name": "col_binary",
            "Nullable": "NULLABLE",
            "Precision": 1073741824,
            "Scale": 0,
            "Type": "varbinary",
        },
        {
            "Name": "col_array",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "array",
        },
        {
            "Name": "col_map",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "map",
        },
        {
            "Name": "col_struct",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "row",
        },
        {
            "Name": "col_decimal",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 1,
            "Type": "decimal",
        },
    )
Пример #10
0
def map_types(draw, key_strategy=primitive_types,
              item_strategy=primitive_types):
    """Hypothesis strategy: a pyarrow map type with a non-null key type."""
    drawn_key = draw(key_strategy)
    # Map keys may not be of null type; reject such draws.
    h.assume(not pa.types.is_null(drawn_key))
    drawn_value = draw(item_strategy)
    return pa.map_(drawn_key, drawn_value)
Пример #11
0
def generate_test_parquet():
    """Build a Parquet test file exercising one column per Arrow type
    (booleans, ints, floats, strings, temporals, decimals, lists,
    fixed-size lists, structs, maps, dictionary, WKB geometry) and write it
    to ogr/data/parquet/test.parquet with geo metadata.

    NOTE: columns are collected from locals() by name at the end, so every
    local variable name below must match its entry in `names` exactly.
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # Each array has 5 rows; index 2 is typically None to exercise nulls.
    boolean = pa.array([True, False, None, False, True], type=pa.bool_())
    uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)],
                     type=pa.uint8())
    int8 = pa.array([None if i == 2 else -2 + i for i in range(5)],
                    type=pa.int8())
    uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)],
                      type=pa.uint16())
    int16 = pa.array(
        [None if i == 2 else -20000 + i * 10000 for i in range(5)],
        type=pa.int16())
    uint32 = pa.array(
        [None if i == 2 else 1 + i * 1000000000 for i in range(5)],
        type=pa.uint32())
    int32 = pa.array(
        [None if i == 2 else -2000000000 + i * 1000000000 for i in range(5)],
        type=pa.int32())
    uint64 = pa.array(
        [None if i == 2 else 1 + i * 100000000000 for i in range(5)],
        type=pa.uint64())
    int64 = pa.array([
        None if i == 2 else -200000000000 + i * 100000000000 for i in range(5)
    ],
                     type=pa.int64())
    float32 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float32())
    float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float64())
    string = pa.array(["abcd", "", None, "c", "d"], type=pa.string())
    large_string = pa.array(["abcd", "", None, "c", "d"],
                            type=pa.large_string())
    # Timestamps in several fixed offsets.
    # NOTE(review): nanosecond=500 * 1e6 passes a float for half a second's
    # worth of nanoseconds -- presumably intended as .5s; confirm pandas
    # accepts/truncates this as expected.
    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    timestamp_ms_gmt_plus_2 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_plus_2)
    ] * 5,
                                       type=pa.timestamp('ms', tz=gmt_plus_2))
    gmt = datetime.timezone(datetime.timedelta(hours=0))
    timestamp_ms_gmt = pa.array([
        pd.Timestamp(
            year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6, tz=gmt)
    ] * 5,
                                type=pa.timestamp('ms', tz=gmt))
    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    timestamp_ms_gmt_minus_0215 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_minus_0215)
    ] * 5,
                                           type=pa.timestamp(
                                               'ms', tz=gmt_minus_0215))
    timestamp_s_no_tz = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5,
                                 type=pa.timestamp('s'))
    # Time-of-day and date columns.
    time32_s = pa.array([3600 + 120 + 3, None, 3, 4, 5], type=pa.time32('s'))
    time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5],
                         type=pa.time32('ms'))
    time64_us = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                         type=pa.time64('us'))
    time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5],
                         type=pa.time64('ns'))
    date32 = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    date64 = pa.array([86400 * 1000, 2, 3, 4, 5], type=pa.date64())
    # Durations are built but excluded from `names` below.
    duration_s = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    duration_ms = pa.array([1, 2, 3, 4, 5], type=pa.duration('ms'))
    binary = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary())
    fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2))
    decimal128 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal128(7, 3))
    decimal256 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal256(7, 3))
    # Variable-size list columns: row i holds i elements, first element None.
    list_boolean = pa.array([
        None if i == 2 else [
            None if j == 0 else True if (j % 2) == 0 else False
            for j in range(i)
        ] for i in range(5)
    ],
                            type=pa.list_(pa.bool_()))
    list_uint8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.uint8()))
    list_int8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                         type=pa.list_(pa.int8()))
    list_uint16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint16()))
    list_int16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int16()))
    list_uint32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint32()))
    list_int32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int32()))
    list_uint64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint64()))
    list_int64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int64()))
    list_float32 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float32()))
    list_float64 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float64()))
    list_string = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])
    # Fixed-size (2-element) list columns.
    fixed_size_list_boolean = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]],
        type=pa.list_(pa.bool_(), 2))
    fixed_size_list_uint8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.uint8(), 2))
    fixed_size_list_int8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                    type=pa.list_(pa.int8(), 2))
    fixed_size_list_uint16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint16(), 2))
    fixed_size_list_int16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int16(), 2))
    fixed_size_list_uint32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint32(), 2))
    fixed_size_list_int32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int32(), 2))
    fixed_size_list_uint64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint64(), 2))
    fixed_size_list_int64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int64(), 2))
    fixed_size_list_float32 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float32(), 2))
    fixed_size_list_float64 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float64(), 2))
    fixed_size_list_string = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))
    # Struct column with nested struct and list members.
    struct_field = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)

    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # Map columns: string keys, one per value type; row pattern
    # [2 entries], [1 entry], None, [], [].
    map_boolean = pa.array([[('x', None),
                             ('y', True)], [('z', True)], None, [], []],
                           type=pa.map_(pa.string(), pa.bool_()))
    map_uint8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.uint8()))
    map_int8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                        type=pa.map_(pa.string(), pa.int8()))
    map_uint16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint16()))
    map_int16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int16()))
    map_uint32 = pa.array([[('x', 4 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint32()))
    map_int32 = pa.array([[('x', 2 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int32()))
    map_uint64 = pa.array([[('x', 4 * 1000 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint64()))
    map_int64 = pa.array([[('x', -2 * 1000 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int64()))
    map_float32 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float32()))
    map_float64 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float64()))
    map_string = pa.array([[('x', 'x_val'),
                            ('y', None)], [('z', 'z_val')], None, [], []],
                          type=pa.map_(pa.string(), pa.string()))

    # Dictionary-encoded column.  NOTE: `dict` shadows the builtin, but the
    # name must stay -- it is looked up from locals() via `names` below.
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    dict = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Built but excluded from `names` (see the commented "map_list" entry).
    map_list = pa.array([[('x', []), ('y', [])], [('z', [])], None, [], []],
                        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # WKB point geometries (little-endian, type 1) with one null row.
    geometry = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2)) for i in range(5)
    ],
                        type=pa.binary())

    # Column order of the output table; each name must match a local above.
    names = [
        "boolean",
        "uint8",
        "int8",
        "uint16",
        "int16",
        "uint32",
        "int32",
        "uint64",
        "int64",
        "float32",
        "float64",
        "string",
        "large_string",
        "timestamp_ms_gmt",
        "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215",
        "timestamp_s_no_tz",
        "time32_s",
        "time32_ms",
        "time64_us",
        "time64_ns",
        "date32",
        "date64",
        # "duration_s",
        # "duration_ms",
        "binary",
        "large_binary",
        "fixed_size_binary",
        "decimal128",
        "decimal256",
        "list_boolean",
        "list_uint8",
        "list_int8",
        "list_uint16",
        "list_int16",
        "list_uint32",
        "list_int32",
        "list_uint64",
        "list_int64",
        "list_float32",
        "list_float64",
        "list_string",
        "fixed_size_list_boolean",
        "fixed_size_list_uint8",
        "fixed_size_list_int8",
        "fixed_size_list_uint16",
        "fixed_size_list_int16",
        "fixed_size_list_uint32",
        "fixed_size_list_int32",
        "fixed_size_list_uint64",
        "fixed_size_list_int64",
        "fixed_size_list_float32",
        "fixed_size_list_float64",
        "fixed_size_list_string",
        "struct_field",
        "map_boolean",
        "map_uint8",
        "map_int8",
        "map_uint16",
        "map_int16",
        "map_uint32",
        "map_int32",
        "map_uint64",
        "map_int64",
        "map_float32",
        "map_float64",
        "map_string",
        # "map_list",
        "dict",
        "geometry",
    ]

    # Assemble the table by pulling each named array out of locals().
    locals_ = locals()
    table = pa.table([locals_[x] for x in names], names=names)

    # Attach GeoParquet-style "geo" metadata for the geometry column.
    # NOTE(review): wkt_epsg_4326 is defined elsewhere in this module --
    # presumably the EPSG:4326 CRS as WKT.
    my_schema = table.schema.with_metadata({
        "geo":
        json.dumps({
            "version": "0.1.0",
            "primary_column": "geometry",
            "columns": {
                "geometry": {
                    'crs': wkt_epsg_4326,
                    'bbox': [0, 2, 4, 2],
                    'encoding': 'WKB'
                }
            }
        })
    })

    table = table.cast(my_schema)
    HERE = pathlib.Path(__file__).parent
    # Uncompressed, small row groups so readers exercise multi-group paths.
    pq.write_table(table,
                   HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE',
                   row_group_size=3)
Пример #12
0
        ]
    ),
    pa.struct(
        [
            pa.field("a", pa.int32(), nullable=False),
            pa.field("b", pa.int8(), nullable=False),
            pa.field("c", pa.string()),
        ]
    ),
    pa.dictionary(pa.int8(), pa.string()),
]

_unsupported_pyarrow_types = [
    pa.decimal256(76, 38),
    pa.duration("s"),
    pa.map_(pa.string(), pa.int32()),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_DENSE,
    ),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_DENSE,
        type_codes=[4, 8],
    ),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_SPARSE,
    ),
    pa.union(
        [
Пример #13
0
class TestAbstractFileParserStatics:
    """Unit tests for the static type/schema conversion helpers on AbstractFileParser."""

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        """Each JSON-schema primitive type maps to the expected PyArrow type."""
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        """Every PyArrow type in each group maps back to the same JSON-schema type."""
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        """Whole-schema conversion JSON -> PyArrow; a None expectation means the call must raise."""
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
            # BUGFIX: this debug call previously sat inside the `with` block
            # *after* the raising statement, so it could never execute --
            # pytest.raises exits the block as soon as the exception fires.
            LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        """Whole-schema conversion PyArrow -> JSON; a None expectation means the call must raise."""
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
            # BUGFIX: moved out of the `with` block (see note above in the
            # forward-direction test); previously unreachable.
            LOGGER.debug(str(e_info))
Пример #14
0
    "id":
    3,
    "val": [("a", {
        "weight": 22.5,
        "temp": 33.1
    }), ("b", {
        "weight": 33.6,
        "temp": 44.5
    }), ("c", {
        "weight": 44.6,
        "temp": 55.5
    })],
    "val2": [("vb", {
        "weight": 5,
        "temp": 10
    })]
}]
# Build a frame from `rows` (defined above); each map-typed column is
# represented in pandas as a list of (key, value) tuples per row.
df2 = pd.DataFrame(rows)
# Value type for the map entries: a struct of two float32 measurements.
mystruct = pa.struct(
    [pa.field("weight", pa.float32()),
     pa.field("temp", pa.float32())])
# map<string, struct<weight: float, temp: float>> — shared by both columns.
# NOTE(review): the integer values in `rows` (e.g. 5, 10) are cast to
# float32 by this schema.
mymap = pa.map_(pa.string(), mystruct)
schema = pa.schema([
    pa.field('id', pa.int32()),
    pa.field('val', mymap),
    pa.field("val2", mymap)
])
print(schema)
# Convert with the explicit schema (from_pandas would not infer map types)
# and write the nested test fixture.
table = pa.Table.from_pandas(df2, schema)
pq.write_table(table, 'test/data/nested.parquet')
Пример #15
0
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    :raises ValueError: if a floating-point type has an unrecognized precision
    """
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # Dictionary-encoded type: map of index type -> first child's type.
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)
    elif "dictionary" in json_dict:
        # Node carries dictionary info but is not itself a "dictionary" type:
        # synthesize the key field from the dictionary's index type.
        key_type = {
            "name": "key",
            "type": json_dict["dictionary"]["indexType"],
            "nullable": json_dict["nullable"],
        }
        key = pyarrow_datatype_from_dict(key_type)
        if type_class == "list":
            # map<key, list<struct<val>>> wrapping for dictionary-encoded lists.
            value_type = {
                "name": "val",
                "type": json_dict["dictionary"]["indexType"],
                "nullable": json_dict["nullable"],
            }
            return pyarrow.map_(
                key,
                pyarrow.list_(
                    pyarrow.field(
                        "element",
                        pyarrow.struct([pyarrow_field_from_dict(value_type)
                                        ]))),
            )
        value_type = {
            "name": "value",
            "type": json_dict["type"],
            "nullable": json_dict["nullable"],
        }
        return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type))
    elif type_class == "list":
        # Lists use a single child field named "element".
        field = json_dict["children"][0]
        element_type = pyarrow_datatype_from_dict(field)
        return pyarrow.list_(pyarrow.field("element", element_type))
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class == "int":
        # e.g. bitWidth 32 -> "int32" alias.
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "date":
        type_info = json_dict["type"]
        if type_info["unit"] == "DAY":
            return pyarrow.date32()
        else:
            return pyarrow.date64()
    elif type_class == "time":
        type_info = json_dict["type"]
        if type_info["unit"] == "MICROSECOND":
            unit = "us"
        elif type_info["unit"] == "NANOSECOND":
            unit = "ns"
        elif type_info["unit"] == "MILLISECOND":
            unit = "ms"
        else:
            unit = "s"
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # BUGFIX: default to nanoseconds up front. Previously, when "unit"
        # was present but not one of the four recognized values, `unit` was
        # never assigned and the f-string below raised NameError.
        unit = "ns"
        if "unit" in type_info:
            if type_info["unit"] == "MICROSECOND":
                unit = "us"
            elif type_info["unit"] == "NANOSECOND":
                unit = "ns"
            elif type_info["unit"] == "MILLISECOND":
                unit = "ms"
            elif type_info["unit"] == "SECOND":
                unit = "s"
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    elif type_class.startswith("floatingpoint"):
        type_info = json_dict["type"]
        if type_info["precision"] == "HALF":
            return pyarrow.float16()
        elif type_info["precision"] == "SINGLE":
            return pyarrow.float32()
        elif type_info["precision"] == "DOUBLE":
            return pyarrow.float64()
        # BUGFIX: previously fell through and implicitly returned None,
        # which callers treating the result as a DataType would choke on.
        raise ValueError(
            f'Unsupported floating-point precision: {type_info["precision"]}')
    else:
        # Fall back to PyArrow's alias resolution ("bool", "utf8", ...).
        return pyarrow.type_for_alias(type_class)
Пример #16
0
                             datetime(2018, 1, 5),
                             datetime(2018, 1, 7),
                             datetime(2018, 1, 9)],
                    'five': [date(2018, 1, 1),
                             date(2018, 1, 3),
                             date(2018, 1, 5),
                             date(2018, 1, 7),
                             date(2018, 1, 9)],
                    'six': [True, False, True, False, True]})
# Convert the frame built above into an Arrow table (schema inferred).
table3 = pa.Table.from_pandas(df3)

# Write example2 via a context-managed writer (ensures the footer is
# flushed and the file is closed on exit).
with pq.ParquetWriter('simple/example2.parquet', table3.schema) as writer:
    writer.write_table(table3)

# example3.parquet file
# Map column types: map<int32, string> and map<date32, int16>.
mdt1 = pa.map_(pa.int32(), pa.string())
mdt2 = pa.map_(pa.date32(), pa.int16())
# Map values are represented in pandas as lists of (key, value) tuples.
df = pd.DataFrame({
        'one': pd.Series([
            [(1, 'foo'), (2, 'bar'), (3, 'baz')],
            [(4, 'test1'), (5,'test2')],
        ]),
        'two': pd.Series([
            [(date(2018, 1, 1), 10), (date(2018, 1, 2), 15)],
            [(date(2018, 1, 3), 20), (date(2018, 1, 4), 25)],
        ]),
        'three': pd.Series([1, 2]),
    }
)
schema = pa.schema([
Пример #17
0
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar,
     pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar,
     pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue),
    ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar,
     pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
    assert s != "else"
    assert hash(s) == hash(s)
    assert s.is_valid is True
    with pytest.warns(FutureWarning):
        assert isinstance(s, deprecated)

    s = pa.scalar(None, type=s.type)