def test_timestamps_notimezone_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                None,
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
            })
        field = pa.field('datetime64', pa.timestamp('ms'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=True,
            expected_schema=schema,
        )

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                None,
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
            })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=False,
            expected_schema=schema,
        )
Example #2
def make_recordbatch(length):
    schema = pa.schema([pa.field('f0', pa.int16()),
                        pa.field('f1', pa.int16())])
    a0 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    a1 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    batch = pa.RecordBatch.from_arrays([a0, a1], schema)
    return batch
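A minimal sketch of pushing such a batch through the Arrow IPC stream format; the pa.ipc helpers assume a reasonably recent pyarrow and that make_recordbatch above constructs the batch successfully on the version in use.

import pyarrow as pa

batch = make_recordbatch(32)

# Serialize the batch to an in-memory IPC stream...
sink = pa.BufferOutputStream()
writer = pa.ipc.new_stream(sink, batch.schema)
writer.write_batch(batch)
writer.close()

# ...and read it back, verifying the round trip.
reader = pa.ipc.open_stream(sink.getvalue())
assert reader.read_next_batch().equals(batch)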
Example #3
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
Example #4
File: test_types.py  Project: rok/arrow
def test_field_add_remove_metadata():
    import collections

    f0 = pa.field('foo', pa.int32())

    assert f0.metadata is None

    metadata = {b'foo': b'bar', b'pandas': b'badger'}
    metadata2 = collections.OrderedDict([
        (b'a', b'alpha'),
        (b'b', b'beta')
    ])

    f1 = f0.add_metadata(metadata)
    assert f1.metadata == metadata

    f2 = f0.add_metadata(metadata2)
    assert f2.metadata == metadata2

    with pytest.raises(TypeError):
        f0.add_metadata([1, 2, 3])

    f3 = f1.remove_metadata()
    assert f3.metadata is None

    # idempotent
    f4 = f3.remove_metadata()
    assert f4.metadata is None

    f5 = pa.field('foo', pa.int32(), True, metadata)
    f6 = f0.add_metadata(metadata)
    assert f5.equals(f6)
Example #5
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
Example #6
File: test_schema.py  Project: rok/arrow
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
Example #7
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
Example #8
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()),
                    pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
                                   {'a': 5, 'b': 6.5}]
Example #9
def test_struct_array_field():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x0 = a.field(0)
    y0 = a.field(1)
    x1 = a.field(-2)
    y1 = a.field(-1)
    x2 = a.field('x')
    y2 = a.field('y')

    assert isinstance(x0, pa.lib.Int16Array)
    assert isinstance(y1, pa.lib.FloatArray)
    assert x0.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x0.equals(x1)
    assert x0.equals(x2)
    assert y0.equals(y1)
    assert y0.equals(y2)

    for invalid_index in [None, pa.int16()]:
        with pytest.raises(TypeError):
            a.field(invalid_index)

    for invalid_index in [3, -3]:
        with pytest.raises(IndexError):
            a.field(invalid_index)

    for invalid_name in ['z', '']:
        with pytest.raises(KeyError):
            a.field(invalid_name)
Example #10
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
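For contrast with the nested cases above, a small illustrative sketch (not part of the original test) of the buffer layout of a flat primitive array: a validity bitmap followed by a values buffer. Only the first bytes are checked, since trailing padding is an implementation detail.

import pyarrow as pa

a = pa.array([1, None, 3], type=pa.int8())
bufs = a.buffers()                    # [validity bitmap, values]
assert len(bufs) == 2
assert bytearray(bufs[0].to_pybytes())[0] == 0b00000101   # slots 0 and 2 valid
assert bufs[1].to_pybytes()[0] == 1                       # first value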
Example #11
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #12
File: test_types.py  Project: dremio/arrow
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    for a, b in zip(ty, fields):
        a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b
Example #13
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
Example #14
File: test_table.py  Project: dremio/arrow
def test_recordbatch_basics():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
    assert batch.to_pydict() == OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16()),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)
    assert batch.schema == schema
Example #15
File: test_types.py  Project: rok/arrow
def test_field_metadata():
    f1 = pa.field('a', pa.int8())
    f2 = pa.field('a', pa.int8(), metadata={})
    f3 = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'})

    assert f1.metadata is None
    assert f2.metadata == {}
    assert f3.metadata[b'bizz'] == b'bazz'
Example #16
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
Example #17
def test_field_equality_operators():
    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('b', pa.int8(), nullable=True)
    f4 = pa.field('b', pa.int8(), nullable=False)

    assert f1 == f2
    assert f1 != f3
    assert f3 != f4
    assert f1 != 'foo'
Example #18
def test_fields_hashable():
    in_dict = {}
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i
Example #19
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
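A hedged usage sketch for the helper above: the round trip through pa.Table.from_pandas is expected to honour the constructed schema (the from_pandas keywords assume a reasonably recent pyarrow).

df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.equals(schema, check_metadata=False)
assert table.num_rows == len(df)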
Example #20
def test_field():
    t = pa.string()
    f = pa.field('foo', t)

    assert f.name == 'foo'
    assert f.nullable
    assert f.type is t
    assert repr(f) == "pyarrow.Field<foo: string>"

    f = pa.field('foo', t, False)
    assert not f.nullable
Example #21
File: test_schema.py  Project: apache/arrow
    def test_field(self):
        t = arrow.string()
        f = arrow.field('foo', t)

        assert f.name == 'foo'
        assert f.nullable
        assert f.type is t
        assert repr(f) == "Field('foo', type=string)"

        f = arrow.field('foo', t, False)
        assert not f.nullable
Example #22
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    for a, b in zip(ty, fields):
        a == b
Example #23
File: test_types.py  Project: rok/arrow
def test_struct_type():
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Duplicate
    with pytest.warns(UserWarning):
        with pytest.raises(KeyError):
            ty['a']

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    for a, b in zip(ty, fields):
        a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
Example #24
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example #25
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
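As a small illustrative sketch (not part of the original helper) of what these list columns become on the Arrow side: a Python list of lists containing a None converts to a ListArray whose null slot round-trips as None.

arr = pa.array([[0, 1, 2], None, []], type=pa.list_(pa.int64()))
assert arr.type == pa.list_(pa.int64())
assert arr.to_pylist() == [[0, 1, 2], None, []]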
Example #26
File: test_schema.py  Project: rok/arrow
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
Example #27
File: test_schema.py  Project: rok/arrow
def test_schema_repr_with_dictionaries():
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = (
        """\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32""")

    assert repr(sch) == expected
Example #28
def test_table_from_arrays_preserves_column_metadata():
    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
    arr0 = pa.array([1, 2])
    arr1 = pa.array([3, 4])
    field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
    field1 = pa.field('field2', pa.int64(), nullable=False)
    columns = [
        pa.column(field0, arr0),
        pa.column(field1, arr1)
    ]
    table = pa.Table.from_arrays(columns)
    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False
Example #29
File: test_table.py  Project: dremio/arrow
def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
Example #30
File: test_table.py  Project: dremio/arrow
def test_recordbatch_pickle():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    schema = pa.schema([pa.field('ints', pa.int8()),
                        pa.field('floats', pa.float32()),
                        ]).add_metadata({b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)

    result = pickle.loads(pickle.dumps(batch))
    assert result.equals(batch)
    assert result.schema == schema
Example #31
 def test_boolean_object_nulls(self):
     arr = np.array([False, None, True] * 100, dtype=object)
     df = pd.DataFrame({'bools': arr})
     field = pa.field('bools', pa.bool_())
     schema = pa.schema([field])
     self._check_pandas_roundtrip(df, expected_schema=schema)
Example #32
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()), pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, {'a': 5, 'b': 6.5}]
Example #33
    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* randomly generated Python objects compatible
        with the arrow type.
        """
        size = None

        if type_name in ('bool', 'ascii', 'unicode', 'int64 list', 'struct'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            size = int(type_name[6:])
            assert size > 0
        else:
            raise ValueError("unrecognized type %r" % (type_name, ))

        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind == 'struct':
            ty = pa.struct([
                pa.field('u', pa.int64()),
                pa.field('v', pa.float64()),
                pa.field('w', pa.bool_())
            ])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
        }
        data = factories[kind](n)
        return ty, data
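For context, a hypothetical standalone illustration (not part of the benchmark class) of how the returned (type, data) pair is meant to be fed straight into pa.array, shown here for the 'struct' kind:

ty = pa.struct([pa.field('u', pa.int64()),
                pa.field('v', pa.float64()),
                pa.field('w', pa.bool_())])
data = [{'u': i, 'v': i * 0.5, 'w': i % 2 == 0} for i in range(1000)]
arr = pa.array(data, type=ty)
assert len(arr) == 1000 and arr.type == ty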
Example #34
def make_meta(obj, origin, partition_keys=None):
    """
    Create metadata object for DataFrame.

    .. note::
        For convenience, this function can also be applied to schema objects,
        in which case they are returned unchanged.

    .. warning::
        Information for categoricals will be stripped!

    :meth:`normalize_type` will be applied to normalize type information and :meth:`normalize_column_order` will be
    applied to reorder column information.

    Parameters
    ----------
    obj: Union[DataFrame, Schema]
        Object to extract metadata from.
    origin: str
        Origin of the schema data, used for debugging and error reporting.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.
    """
    if isinstance(obj, SchemaWrapper):
        return obj
    if isinstance(obj, pa.Schema):
        return SchemaWrapper(obj, origin)

    if not isinstance(obj, pd.DataFrame):
        raise ValueError(
            "Input must be a pyarrow schema, or a pandas dataframe")

    if ARROW_LARGER_EQ_0130:
        schema = pa.Schema.from_pandas(obj)
    else:
        table = pa.Table.from_pandas(obj)
        schema = table.schema
        del table
    pandas_metadata = _pandas_meta_from_schema(schema)

    # normalize types
    fields = dict([(field.name, field.type) for field in schema])
    for cmd in pandas_metadata["columns"]:
        name = cmd.get("name")
        if name is None:
            continue
        field_name = cmd["field_name"]
        field_idx = schema.get_field_index(field_name)
        field = schema[field_idx]
        fields[field_name], cmd["pandas_type"], cmd["numpy_type"], cmd[
            "metadata"] = normalize_type(field.type, cmd["pandas_type"],
                                         cmd["numpy_type"], cmd["metadata"])
    metadata = schema.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    schema = pa.schema([pa.field(n, t) for n, t in fields.items()], metadata)
    return normalize_column_order(SchemaWrapper(schema, origin),
                                  partition_keys)
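A hedged usage sketch, assuming the surrounding kartothek-style module functions (SchemaWrapper, normalize_type, normalize_column_order, and friends) are importable alongside make_meta:

df = pd.DataFrame({"part": ["a", "a", "b"], "value": [1, 2, 3]})
schema = make_meta(df, origin="example", partition_keys=["part"])
# schema wraps a pyarrow schema whose field types have been normalized and
# which carries the serialized pandas metadata under the b"pandas" key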
Example #35
def get_pyarrow_translated_schema(string_schema):
    """
    Converts string schema dict to pyarrow schema for writing to parquet.
    :param string_schema: BigQuery schema as a dict with a top-level 'fields' list.
    :return: pyarrow schema
    """
    def _bq_to_pa_type(field):
        """
        A function to convert BigQuery types to pyarrow types.
        :param field: BigQuery field definition dict (with 'name', 'type',
            'mode' and, for RECORD fields, 'fields').
        :return: pa.DataType
        """
        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
            'BYTE': None,
            'INTEGER': pa.int64(),
            'FLOAT': pa.float64(),
            'BOOLEAN': pa.bool_(),
            'TIMESTAMP': pa.timestamp('us'),
            'DATE': pa.date32(),
            'TIME': pa.time64('us'),
            'DATETIME': pa.timestamp('us'),
            'GEOGRAPHY': None,
        }

        try:
            if field['mode'] == 'REPEATED':
                if field['type'] == 'RECORD':
                    nested_fields = field['fields']
                    # Recursively call to convert the next nested layer.
                    return pa.list_(
                        pa.struct([(fld['name'], _bq_to_pa_type(fld))
                                   for fld in nested_fields]))
                else:
                    # REPEATED scalar field: wrap the converted element type
                    # in a list.
                    return pa.list_(type_conversions[field['type']])
            elif field['type'] == 'RECORD':
                nested_fields = field['fields']
                # Recursively call to convert the next nested layer.
                return pa.struct([(fld['name'], _bq_to_pa_type(fld))
                                  for fld in nested_fields])
            else:
                return type_conversions.get(field.get('type'))
        except KeyError as err:
            raise KeyError(
                """Type {} is not a valid BigQuery type and not supported by this
                utility.""".format(field['type']))

    pa_schema_list = []
    for field in string_schema['fields']:
        field_type = field['type']
        field_name = field['name']
        field_mode = field['mode']
        converted_type = _bq_to_pa_type(field)
        if converted_type is None:
            error_message = 'Error: json schema included a {0:s} field. ' \
                            'BYTE and GEOGRAPHY types cannot ' \
                            'currently be used when outputting to ' \
                            'parquet.'.format(field_type)
            logging.error(error_message)
            raise ValueError(error_message)
        else:
            nullable = False if field_mode == 'REQUIRED' else True
            pa_field = pa.field(name=field_name,
                                type=converted_type
                                #nullable=nullable
                                )
            pa_schema_list.append(pa_field)

    return pa.schema(pa_schema_list)
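A sketch of the expected input shape, assuming the JSON schema mirrors BigQuery's REST representation (a dict with a top-level 'fields' list whose entries carry 'name', 'type', and 'mode'):

string_schema = {
    'fields': [
        {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'created', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
    ]
}
pa_schema = get_pyarrow_translated_schema(string_schema)
assert pa_schema[0].type == pa.int64()
assert pa_schema[1].type == pa.string()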
Example #36
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols and ncols > 1.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100 and ncols > 1:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {!s} with type {!s}".format(
                    col.name, col.dtype), )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), result.null_count))
        return result

    def _can_definitely_zero_copy(arr):
        return (isinstance(arr, np.ndarray) and arr.flags.contiguous
                and issubclass(arr.dtype.type, np.integer))

    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        arrays = []
        with futures.ThreadPoolExecutor(nthreads) as executor:
            for c, f in zip(columns_to_convert, convert_fields):
                if _can_definitely_zero_copy(c.values):
                    arrays.append(convert_column(c, f))
                else:
                    arrays.append(executor.submit(convert_column, c, f))

        for i, maybe_fut in enumerate(arrays):
            if isinstance(maybe_fut, futures.Future):
                arrays[i] = maybe_fut.result()

    types = [x.type for x in arrays]

    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    pandas_metadata = construct_metadata(columns_to_convert, df, column_names,
                                         index_columns, index_descriptors,
                                         preserve_index, types)
    metadata = deepcopy(schema.metadata) if schema.metadata else dict()
    metadata.update(pandas_metadata)
    schema = schema.with_metadata(metadata)

    # If dataframe is empty but with RangeIndex ->
    # remember the length of the indexes
    n_rows = None
    if len(arrays) == 0:
        try:
            kind = index_descriptors[0]["kind"]
            if kind == "range":
                start = index_descriptors[0]["start"]
                stop = index_descriptors[0]["stop"]
                step = index_descriptors[0]["step"]
                n_rows = len(range(start, stop, step))
        except IndexError:
            pass

    return arrays, schema, n_rows
Example #37
def generate(
    path,
    parameters,
    format={
        "name": "parquet",
        "row_group_size": 64
    },
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    use_threads : bool
        Whether to generate the column data in parallel using a worker pool
    """

    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(type(next(iter(
                column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)
Example #38
def test_empty_table():
    schema = pa.schema([pa.field('oneField', pa.int64())])
    table = schema.empty_table()
    assert isinstance(table, pa.Table)
    assert table.num_rows == 0
    assert table.schema == schema
Example #39
                          ('string', ['a', 'b', 'c']),
                          ('binary', [b'a', b'b', b'c']),
                          (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
        assert array.equals(result)


@h.given(
    past.arrays(past.all_types, size=st.integers(min_value=0, max_value=10)))
Example #40
 def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
     values = [b'foo', None, b'ba', None, None, b'hey']
     df = pd.DataFrame({'strings': values})
     schema = pa.schema([pa.field('strings', pa.binary(3))])
     with self.assertRaises(pa.ArrowInvalid):
         pa.Table.from_pandas(df, schema=schema)
Example #41
    def __init__(
        self,
        schema: Optional[pa.Schema] = None,
        features: Optional[Features] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        fingerprint: Optional[str] = None,
        writer_batch_size: Optional[int] = None,
        hash_salt: Optional[str] = None,
        check_duplicates: Optional[bool] = False,
        disable_nullable: bool = False,
        update_features: bool = False,
        with_metadata: bool = True,
        unit: str = "examples",
    ):
        if path is None and stream is None:
            raise ValueError(
                "At least one of path and stream must be provided.")
        if features is not None:
            self._features = features
            self._schema = pa.schema(features.type)
        elif schema is not None:
            self._schema: pa.Schema = schema
            self._features = Features.from_arrow_schema(self._schema)
        else:
            self._features = None
            self._schema = None

        if hash_salt is not None:
            # Create KeyHasher instance using split name as hash salt
            self._hasher = KeyHasher(hash_salt)
        else:
            self._hasher = KeyHasher("")

        self._check_duplicates = check_duplicates

        if disable_nullable and self._schema is not None:
            self._schema = pa.schema(
                pa.field(field.name, field.type, nullable=False)
                for field in self._schema)

        self._path = path
        if stream is None:
            self.stream = pa.OSFile(self._path, "wb")
            self._closable_stream = True
        else:
            self.stream = stream
            self._closable_stream = False

        self.fingerprint = fingerprint
        self.disable_nullable = disable_nullable
        self.writer_batch_size = writer_batch_size or config.DEFAULT_MAX_BATCH_SIZE
        self.update_features = update_features
        self.with_metadata = with_metadata
        self.unit = unit

        self._num_examples = 0
        self._num_bytes = 0
        self.current_examples: List[Tuple[Dict[str, Any], str]] = []
        self.current_rows: List[pa.Table] = []
        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
        self.hkey_record = []
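A hedged usage sketch, assuming this is the ArrowWriter from the datasets library and that its write() and finalize() methods behave as their names suggest:

writer = ArrowWriter(path="out.arrow", writer_batch_size=1000)
writer.write({"text": "hello", "label": 0})
writer.write({"text": "world", "label": 1})
num_examples, num_bytes = writer.finalize()
assert num_examples == 2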
Example #42
 def _to_arrow_schema(row_type):
     return pa.schema([
         pa.field(n, to_arrow_type(t), t._nullable)
         for n, t in zip(row_type.field_names(), row_type.field_types())
     ])
Example #43
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([])
    schema = pa.schema([pa.field('f0', pa.utf8())])
    with pytest.raises(ValueError):
        pa.record_batch([arr], schema=schema)
Example #44
                 pa.array([[1], None, [3, 4]], type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
         }, {
             "list<utf8>": pa.array([u"abc", None], type=pa.utf8())
         }],
         expected_output={
             "list<utf8>":
             pa.array([None, None, None, None, None, None, None, u"abc", None],
                      type=pa.utf8()),
             "struct<int32, list<int32>>":
             pa.array([
                 None, None, None, None, (1, [1]), (2, None),
                 (None, [3, 4]), None, None
             ],
                      type=pa.struct([
                          pa.field("f1", pa.int32()),
                          pa.field("f2", pa.list_(pa.int32()))
                      ])),
         }),
]

_MERGE_INVALID_INPUT_TEST_CASES = [
    dict(
        testcase_name="not_a_list_of_tables",
        inputs=[pa.Table.from_arrays([pa.array([1])], ["f1"]), 1],
        expected_error_regexp="incompatible function arguments",
    ),
    dict(
        testcase_name="not_a_list",
        inputs=1,
        expected_error_regexp="incompatible function arguments",
Example #45
    def test_arrow_schema_convertion(self):

        arrow_schema = pa.schema([
            pa.field('string', pa.string()),
            pa.field('int8', pa.int8()),
            pa.field('int16', pa.int16()),
            pa.field('int32', pa.int32()),
            pa.field('int64', pa.int64()),
            pa.field('float', pa.float32()),
            pa.field('double', pa.float64()),
            pa.field('bool', pa.bool_(), False),
            pa.field('fixed_size_binary', pa.binary(10)),
            pa.field('variable_size_binary', pa.binary()),
            pa.field('decimal', pa.decimal128(3, 4)),
            pa.field('timestamp_s', pa.timestamp('s')),
            pa.field('timestamp_ns', pa.timestamp('ns')),
            pa.field('date_32', pa.date32()),
            pa.field('date_64', pa.date64()),
            pa.field('timestamp_ns', pa.timestamp('ns')),
        ])

        mock_dataset = _mock_parquet_dataset([], arrow_schema)

        unischema = Unischema.from_arrow_schema(mock_dataset)
        for name in arrow_schema.names:
            assert getattr(unischema, name).name == name
            assert isinstance(getattr(unischema, name).codec, ScalarCodec)
            if name == 'bool':
                assert not getattr(unischema, name).nullable
            else:
                assert getattr(unischema, name).nullable
Example #46
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.large_string(), pa.large_binary(),
            pa.list_(pa.int32()), pa.large_list(pa.uint16()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_DENSE),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))
Example #47
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype), )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), result.null_count))
        return result

    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_fields))

    types = [x.type for x in arrays]

    if schema is not None:
        # add index columns
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            name = name if name is not None else 'None'
            schema = schema.append(pa.field(name, type_))
    else:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)
    schema = schema.with_metadata(metadata)

    return arrays, schema
Example #48
File: coders.py  Project: zoushihua/flink
 def _to_arrow_type(field):
     if field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TINYINT:
         return pa.field(field.name, pa.int8(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.SMALLINT:
         return pa.field(field.name, pa.int16(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.INT:
         return pa.field(field.name, pa.int32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BIGINT:
         return pa.field(field.name, pa.int64(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BOOLEAN:
         return pa.field(field.name, pa.bool_(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.FLOAT:
         return pa.field(field.name, pa.float32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DOUBLE:
         return pa.field(field.name, pa.float64(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARCHAR:
         return pa.field(field.name, pa.utf8(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARBINARY:
         return pa.field(field.name, pa.binary(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DECIMAL:
         return pa.field(
             field.name,
             pa.decimal128(field.type.decimal_info.precision,
                           field.type.decimal_info.scale),
             field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DATE:
         return pa.field(field.name, pa.date32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TIME:
         if field.type.time_info.precision == 0:
             return pa.field(field.name, pa.time32('s'),
                             field.type.nullable)
         elif 1 <= field.type.time_info.precision <= 3:
             return pa.field(field.name, pa.time32('ms'),
                             field.type.nullable)
         elif 4 <= field.type.time_info.precision <= 6:
             return pa.field(field.name, pa.time64('us'),
                             field.type.nullable)
         else:
             return pa.field(field.name, pa.time64('ns'),
                             field.type.nullable)
     else:
         raise ValueError("field_type %s is not supported." %
                          field.type)
Example #49
    assert not pa.types.is_float_value(1)
    assert pa.types.is_float_value(1.)
    assert pa.types.is_float_value(np.float64(1))
    assert not pa.types.is_float_value('1.0')


def test_is_boolean_value():
    assert not pa.types.is_boolean_value(1)
    assert pa.types.is_boolean_value(True)
    assert pa.types.is_boolean_value(False)
    assert pa.types.is_boolean_value(np.bool_(True))
    assert pa.types.is_boolean_value(np.bool_(False))


@h.given(past.all_types | past.all_fields | past.all_schemas)
@h.example(pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}))
def test_pickling(field):
    data = pickle.dumps(field)
    assert pickle.loads(data) == field


@h.given(
    st.lists(past.all_types) | st.lists(past.all_fields)
    | st.lists(past.all_schemas))
def test_hashing(items):
    h.assume(
        # well, this is still O(n^2), but makes the input unique
        all(not a.equals(b) for i, a in enumerate(items) for b in items[:i]))

    container = {}
    for i, item in enumerate(items):
Example #50
 def clone_field(table, name, datatype):
     f = table.schema.field_by_name(name)
     return pa.field(f.name, datatype, f.nullable, f.metadata)
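A sketch of the intended use, assuming a pyarrow version that still provides Schema.field_by_name (newer releases spell it Schema.field): clone a column's field while swapping in a wider type, e.g. before a cast.

table = pa.table({'x': pa.array([1, 2, 3], type=pa.int32())})
f = clone_field(table, 'x', pa.int64())
assert f.name == 'x' and f.type == pa.int64() and f.nullable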
Example #51
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name",
                               bq_type,
                               mode="REPEATED",
                               fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct((
        pyarrow.field("field01", pyarrow.string()),
        pyarrow.field("field02", pyarrow.binary()),
        pyarrow.field("field03", pyarrow.int64()),
        pyarrow.field("field04", pyarrow.int64()),
        pyarrow.field("field05", pyarrow.float64()),
        pyarrow.field("field06", pyarrow.float64()),
        pyarrow.field("field07", module_under_test.pyarrow_numeric()),
        pyarrow.field("field08", pyarrow.bool_()),
        pyarrow.field("field09", pyarrow.bool_()),
        pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
        pyarrow.field("field11", pyarrow.date32()),
        pyarrow.field("field12", module_under_test.pyarrow_time()),
        pyarrow.field("field13", module_under_test.pyarrow_datetime()),
        pyarrow.field("field14", pyarrow.string()),
    ))
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)