Example #1
def test_sequence_double():
    data = [1.5, 1., None, 2.5, None, None]
    arr = pa.array(data)
    assert len(arr) == 6
    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == data
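A minimal companion sketch (nothing beyond pyarrow itself is assumed): passing an explicit type= to pa.array overrides the float64 inference shown above, while None values still become nulls.

import pyarrow as pa

# Explicitly request float32 instead of the inferred float64
arr32 = pa.array([1.5, 1., None, 2.5, None, None], type=pa.float32())
assert arr32.type == pa.float32()
assert arr32.null_count == 3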
Example #2
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
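A short sketch building on the example above (assumes only pyarrow/numpy, and that iterating a Schema yields Field objects): to_pandas_dtype() can be applied across a whole schema to build a name-to-NumPy-dtype mapping.

import numpy as np
import pyarrow as pa

schema = pa.schema([('x', pa.int32()), ('y', pa.float64())])
# Each Field exposes .name and .type; to_pandas_dtype() gives the NumPy equivalent
dtypes = {field.name: field.type.to_pandas_dtype() for field in schema}
assert dtypes == {'x': np.int32, 'y': np.float64}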
Example #3
def test_double(self):
    data = [1.5, 1, None, 2.5, None, None]
    arr = pa.from_pylist(data)
    assert len(arr) == 6
    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == data
Example #4
File: test_orc.py Project: dremio/arrow
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
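A hedged write-side counterpart (assumption: a recent pyarrow build with the optional ORC component, which exposes orc.write_table in newer releases; the file name is illustrative):

import pyarrow as pa
from pyarrow import orc

# Round-trip an empty table, mirroring the empty-file read above
table = pa.table({'int1': pa.array([], type=pa.int32())})
orc.write_table(table, 'empty.orc')
assert orc.ORCFile('empty.orc').read().num_rows == 0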
Example #5
def test_sequence_numpy_double(seq, np_scalar):
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, None]
    arr = pa.array(seq(data))
    assert len(arr) == 6
    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == data
Example #6
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
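One concrete cell of the cast matrix above, spelled out (pyarrow only):

import pyarrow as pa

# An empty int8 array casts cleanly to float64; unsupported pairs raise
# pa.lib.ArrowNotImplementedError, which the loop above simply skips
empty = pa.array([], type=pa.int8())
assert empty.cast(pa.float64()).type == pa.float64()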
Example #7
File: test_types.py Project: rok/arrow
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
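A related sketch (pyarrow only): dictionary types usually arise from dictionary_encode() rather than being constructed by hand.

import pyarrow as pa

arr = pa.array(['a', 'b', 'a', None])
dict_arr = arr.dictionary_encode()
# The resulting type pairs an index type with the original value type
assert pa.types.is_dictionary(dict_arr.type)
assert dict_arr.type.value_type == pa.string()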
Example #8
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
Example #9
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
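The same safe/unsafe distinction exists at the Array level; a minimal sketch (pyarrow only):

import pyarrow as pa

arr = pa.array([1.1, 2.2])
# arr.cast(pa.int64()) would raise pa.ArrowInvalid, as in Example #8
truncated = arr.cast(pa.int64(), safe=False)
assert truncated.to_pylist() == [1, 2]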
Example #10
def test_cast_integers_safe():
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]

    for case in safe_cases:
        _check_cast_case(case)

    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)

        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
Example #11
    def test_float_nulls(self):
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
        names = ['f4', 'f8']
        expected_cols = []

        arrays = []
        fields = []
        for name, arrow_dtype in dtypes:
            values = np.random.randn(num_values).astype(name)

            arr = pa.array(values, from_pandas=True, mask=null_mask)
            arrays.append(arr)
            fields.append(pa.field(name, arrow_dtype))
            values[null_mask] = np.nan

            expected_cols.append(values)

        ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = pa.Table.from_arrays(arrays, names)
        assert table.schema.equals(pa.schema(fields))
        result = table.to_pandas()
        tm.assert_frame_equal(result, ex_frame)
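A minimal sketch of the mask= mechanism used above (numpy and pyarrow only): True entries in the mask mark null slots regardless of the underlying values.

import numpy as np
import pyarrow as pa

values = np.array([1.5, 2.5, 3.5])
arr = pa.array(values, mask=np.array([False, True, False]))
assert arr.null_count == 1
assert arr.to_pylist() == [1.5, None, 3.5]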
Example #12
def test_float_object_nulls(self):
    arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
    df = pd.DataFrame({'floats': arr})
    expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
    field = pa.field('floats', pa.float64())
    schema = pa.schema([field])
    self._check_pandas_roundtrip(df, expected=expected,
                                 expected_schema=schema)
Example #13
    def do_get(self, ticket):
        data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())]
        data2 = [pa.array([-10.0, -5.0, 0.0, 5.0, 10.0], type=pa.float64())]
        assert data1[0].type != data2[0].type
        table1 = pa.Table.from_arrays(data1, names=['a'])
        table2 = pa.Table.from_arrays(data2, names=['a'])
        assert table1.schema == self.schema

        return flight.GeneratorStream(self.schema, [table1, table2])
Example #14
File: test_schema.py Project: rok/arrow
def test_field_flatten():
    f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
    assert f0.flatten() == [f0]

    f1 = pa.field('bar', pa.float64(), nullable=False)
    ff = pa.field('ff', pa.struct([f0, f1]), nullable=False)
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64(), nullable=False)]  # XXX

    # Nullable parent makes flattened child nullable
    ff = pa.field('ff', pa.struct([f0, f1]))
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64())]

    fff = pa.field('fff', pa.struct([ff]))
    assert fff.flatten() == [pa.field('fff.ff', pa.struct([f0, f1]))]
Example #15
def test_cast_column():
    arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]

    col = pa.column('foo', arrays)

    target = pa.float64()
    casted = col.cast(target)

    expected = pa.column('foo', [x.cast(target) for x in arrays])
    assert casted.equals(expected)
Example #16
    def test_all_nulls_cast_numeric(self):
        arr = np.array([None], dtype=object)

        def _check_type(t):
            a2 = pa.array(arr, type=t)
            assert a2.type == t
            assert a2[0].as_py() is None

        _check_type(pa.int32())
        _check_type(pa.float64())
Example #17
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
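A hypothetical usage sketch for the helper above (assumes the same pandas/pyarrow imports as the helper itself):

# The returned schema is meant to line up with the constructed DataFrame
df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.names == schema.names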
Example #18
def json_to_parquet(data, output, schema):
    column_data = {}
    array_data = []

    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col

    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ms')))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int64().id:
            array_data.append(pa.array([int(ele) for ele in _col], type=pa.int64()))
        else:
            array_data.append(pa.array(_col, type=column.type))

    data = pa.RecordBatch.from_arrays(array_data, schema.names)

    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])

    pq.write_table(table, output, compression='SNAPPY', coerce_timestamps='ms')
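A hypothetical invocation of json_to_parquet (the row contents, field names, and output file are made up for illustration; pa, pd, and pq are assumed imported as in the function's module):

rows = [
    {'name': 'a', 'value': 1.5, 'count': 3},
    {'name': 'b', 'value': 2.5, 'count': 4},
]
schema = pa.schema([('name', pa.string()),
                    ('value', pa.float64()),
                    ('count', pa.int64())])
json_to_parquet(rows, 'out.parquet', schema)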
Example #19
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Example #20
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #21
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
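A small follow-on sketch (pyarrow only): the same aliases are accepted directly wherever a DataType is expected, as in Example #40's pa.array([0, 1, 2], type='i8').

import pyarrow as pa

arr = pa.array([1.0, 2.0], type='float64')
assert arr.type == pa.float64()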
Example #22
def test_simple_varied(self):
    # Infer various kinds of data
    rows = b"a,b,c\n1,2,3\n4.0,-5,foo\n"
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, 4.0],
        'b': [2, -5],
        'c': [u"3", u"foo"],
        }
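A standalone sketch of the same inference through the public CSV reader (assumption: the read_bytes helper above wraps pyarrow.csv in a similar way):

import io
from pyarrow import csv

rows = b"a,b,c\n1,2,3\n4.0,-5,foo\n"
table = csv.read_csv(io.BytesIO(rows))
assert table.column_names == ['a', 'b', 'c']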
Example #23
File: test_table.py Project: rok/arrow
def test_table_from_pydict():
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)
Example #24
    def test_float_no_nulls(self):
        data = {}
        fields = []
        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
        num_values = 100

        for numpy_dtype, arrow_dtype in dtypes:
            values = np.random.randn(num_values)
            data[numpy_dtype] = values.astype(numpy_dtype)
            fields.append(pa.field(numpy_dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #25
File: test_compute.py Project: rok/arrow
def test_take_indices_types():
    arr = pa.array(range(5))

    for indices_type in ['uint8', 'int8', 'uint16', 'int16',
                         'uint32', 'int32', 'uint64', 'int64']:
        indices = pa.array([0, 4, 2, None], type=indices_type)
        result = arr.take(indices)
        expected = pa.array([0, 4, 2, None])
        assert result.equals(expected)

    for indices_type in [pa.float32(), pa.float64()]:
        indices = pa.array([0, 4, 2], type=indices_type)
        with pytest.raises(TypeError):
            arr.take(indices)
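The same operation is also exposed as a compute function; a minimal sketch (pyarrow only):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(range(5))
indices = pa.array([0, 4, 2, None], type='int32')
# pc.take and Array.take should agree, including the null index slot
assert pc.take(arr, indices).equals(arr.take(indices))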
Example #26
def test_sequence_numpy_double(seq, np_scalar, from_pandas):
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    assert len(arr) == 6
    if from_pandas:
        assert arr.null_count == 3
    else:
        assert arr.null_count == 2
    assert arr.type == pa.float64()

    assert arr.to_pylist()[:4] == data[:4]
    if from_pandas:
        assert arr.to_pylist()[5] is None
    else:
        assert np.isnan(arr.to_pylist()[5])
Example #27
File: test_json.py Project: rok/arrow
def test_simple_varied(self):
    # Infer various kinds of data
    rows = (b'{"a": 1,"b": 2, "c": "3", "d": false}\n'
            b'{"a": 4.0, "b": -5, "c": "foo", "d": true}\n')
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, 4.0],
        'b': [2, -5],
        'c': [u"3", u"foo"],
        'd': [False, True],
        }
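A standalone sketch through the public JSON reader (assumption: read_bytes above wraps pyarrow.json similarly):

import io
from pyarrow import json

rows = (b'{"a": 1, "d": false}\n'
        b'{"a": 4.0, "d": true}\n')
table = json.read_json(io.BytesIO(rows))
assert table.column_names == ['a', 'd']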
Example #28
def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
    # ARROW-2806
    data = np.array([
        inner_seq([1., 2.]),
        inner_seq([1., 2., 3.]),
        inner_seq([np.nan]),
        None
    ])
    arr = pa.array(data, from_pandas=from_pandas)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.float64())
    if from_pandas:
        assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
    else:
        np.testing.assert_equal(arr.to_pylist(),
                                [[1., 2.], [1., 2., 3.], [np.nan], None])
Example #29
    def test_buffer_bounds_error(self):
        # ARROW-1676
        path = random_path()
        self.test_files.append(path)

        for i in range(16, 256):
            values = pa.array([None] + list(range(i)), type=pa.float64())

            writer = FeatherWriter()
            writer.open(path)

            writer.write_array('arr', values)
            writer.close()

            result = read_feather(path)
            expected = pd.DataFrame({'arr': values.to_pandas()})
            assert_frame_equal(result, expected)

            self._check_pandas_roundtrip(expected, null_counts=[1])
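The public feather API equivalent of the internal FeatherWriter used above, as a minimal sketch (pandas and pyarrow only; the file path is illustrative):

import pandas as pd
import pyarrow.feather as feather

df = pd.DataFrame({'arr': [None, 0.0, 1.0]})
feather.write_feather(df, 'example.feather')
result = feather.read_feather('example.feather')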
Example #30
File: jvm.py Project: rok/arrow
def _from_jvm_float_type(jvm_type):
    """
    Convert a JVM float type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$FloatingPoint

    Returns
    -------
    typ: pyarrow.DataType
    """
    precision = jvm_type.getPrecision().toString()
    if precision == 'HALF':
        return pa.float16()
    elif precision == 'SINGLE':
        return pa.float32()
    elif precision == 'DOUBLE':
        return pa.float64()
Example #31
    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
Example #32
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == TimestampNTZType:
        arrow_type = pa.timestamp('us', tz=None)
    elif type(dt) == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    elif type(dt) == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.map_(to_arrow_type(dt.keyType),
                             to_arrow_type(dt.valueType))
    elif type(dt) == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError(
                "Nested StructType not supported in conversion to Arrow")
        fields = [
            pa.field(field.name,
                     to_arrow_type(field.dataType),
                     nullable=field.nullable) for field in dt
        ]
        arrow_type = pa.struct(fields)
    elif type(dt) == NullType:
        arrow_type = pa.null()
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
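A hypothetical usage sketch for the converter above, assuming pyspark is installed alongside pyarrow:

import pyarrow as pa
from pyspark.sql.types import ArrayType, DoubleType, StringType

assert to_arrow_type(DoubleType()) == pa.float64()
# Element types other than struct/timestamp are allowed inside arrays
assert to_arrow_type(ArrayType(StringType())) == pa.list_(pa.string())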
Example #33
def test_mixed_sequence_errors():
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])


@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar,pa_type", [(np.float16, pa.float16()),
                                               (np.float32, pa.float32()),
                                               (np.float64, pa.float64())])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    assert len(arr) == 6
    if from_pandas:
        assert arr.null_count == 3
    else:
        assert arr.null_count == 2
    if from_pandas:
        # The NaN is skipped in type inference, otherwise it forces a
        # float64 promotion
        assert arr.type == pa_type
    else:
        assert arr.type == pa.float64()
Example #34
import pyarrow as pa
import pyarrow.types as types

MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
Example #35
from pandas.core.dtypes.common import infer_dtype_from_object

import cudf
from cudf.api.types import (  # noqa: F401
    _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, infer_dtype,
    is_categorical_dtype, is_datetime_dtype as is_datetime_dtype,
    is_decimal32_dtype, is_decimal64_dtype, is_decimal_dtype, is_integer,
    is_integer_dtype, is_interval_dtype, is_list_dtype, is_list_like,
    is_numeric_dtype as is_numerical_dtype, is_scalar, is_string_dtype,
    is_struct_dtype, is_timedelta_dtype, pandas_dtype,
)
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}
Example #36
except ImportError:
    sparse = None


tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
Example #37
        "TIMESTAMP": pyarrow_timestamp,
    }
    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
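A small sketch of how the lookup table above collapses Arrow's float widths (assumes the mapping is in scope as defined):

import pyarrow

# float16, float32 and float64 all share the BigQuery FLOAT64 type
assert ARROW_SCALAR_IDS_TO_BQ[pyarrow.float32().id] == "FLOAT64"
assert ARROW_SCALAR_IDS_TO_BQ[pyarrow.float64().id] == "FLOAT64"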
Example #38
@pytest.mark.parametrize(
    "dtype,value,expected",
    [
        (pa.bool_(), True, True),
        (pa.bool_(), False, False),
        (pa.bool_(), 1, True),
        (pa.bool_(), 0, False),
        (pa.bool_(), "True", True),
        (pa.bool_(), "False", False),
        (pa.bool_(), "true", True),
        (pa.bool_(), "false", False),
        (pa.int64(), 1, 1),
        (pa.int64(), "1", 1),
        (pa.int64(), 1.0, 1),
        (pa.float64(), 1.1, 1.1),
        (pa.float64(), "1.1", 1.1),
        (pa.float64(), 1, 1.0),
        (pa.binary(), "x", b"x"),
        (pa.string(), "x", "x"),
        (pa.string(), "ö", "ö"),
        (pa.string(), 1, "1"),
        (pa.string(), "ö".encode("utf8"), "ö"),
        (
            pa.timestamp("ns"),
            pd.Timestamp("2018-01-01"),
            pd.Timestamp("2018-01-01").to_datetime64(),
        ),
        (
            pa.timestamp("ns"),
            pd.Timestamp("2018-01-01").to_datetime64(),
Example #39
File: service.py Project: skg-net/suzieq
    def clean_data_common(self, processed_data, raw_data):
        """Fix the type and default value of of each extracted field

        This routine is common to all services. It ensures that all the missing
        fields, as defined by the schema, are added to the records extracted.
        Furthermore, each field is set to the specified type.
        """

        # Build default data structure
        schema_rec = {}
        def_vals = self._get_default_vals()

        ptype_map = {
            pa.string(): str,
            pa.int32(): int,
            pa.int64(): int,
            pa.float32(): float,
            pa.float64(): float,
            pa.date64(): float,
            pa.list_(pa.string()): list,
            pa.list_(pa.int64()): list,
            pa.bool_(): bool,
        }

        for fld in self.schema:
            default = def_vals[fld.type]
            schema_rec.update({fld.name: default})

        if isinstance(raw_data, list):
            read_from = raw_data[0]
        else:
            read_from = raw_data

        # pylint: disable=too-many-nested-blocks
        for entry in processed_data or []:
            entry.update({"hostname": read_from["hostname"]})
            entry.update({"namespace": read_from["namespace"]})
            entry.update({"timestamp": read_from["timestamp"]})
            entry.update({"sqvers": self.version})
            for fld, val in schema_rec.items():
                if fld not in entry:
                    if fld == "active":
                        entry.update({fld: True})
                    else:
                        entry.update({fld: val})
                else:
                    fld_type = self.schema.field(fld).type
                    if not isinstance(entry[fld], ptype_map[fld_type]):
                        try:
                            entry[fld] = ptype_map[fld_type](entry[fld])
                        except (ValueError, TypeError):
                            entry[fld] = val
                    elif isinstance(entry[fld], list):
                        for i, ele in enumerate(entry[fld]):
                            if not isinstance(ele,
                                              ptype_map[fld_type.value_type]):
                                try:
                                    if ptype_map[fld_type.value_type] == int:
                                        entry[fld][i] = int(entry[fld][i])
                                    elif ptype_map[fld_type.value_type] == str:
                                        entry[fld][i] = str(entry[fld][i])
                                    else:
                                        raise ValueError
                                except (ValueError, TypeError):
                                    entry[fld][i] = val
        return processed_data
Example #40
def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], None),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)
Example #41
def create_null_float64():
    import pyarrow as pa
    return convert(pa.array([np.float64(1), None], type=pa.float64()))
Example #42
try:
    from scipy.sparse import csr_matrix, coo_matrix
except ImportError:
    coo_matrix = None
    csr_matrix = None

try:
    import sparse
except ImportError:
    sparse = None

tensor_type_pairs = [('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()),
                     ('i8', pa.int64()), ('u1', pa.uint8()),
                     ('u2', pa.uint16()), ('u4', pa.uint32()),
                     ('u8', pa.uint64()), ('f2', pa.float16()),
                     ('f4', pa.float32()), ('f8', pa.float64())]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
Example #43
                          ('uint32', range(0, 10)), ('int32', range(0, 10)),
                          ('uint64', range(0, 10)), ('int64', range(0, 10)),
                          ('float', [0.0, 0.1, 0.2]),
                          ('double', [0.0, 0.1, 0.2]),
                          ('string', ['a', 'b', 'c']),
                          ('binary', [b'a', b'b', b'c']),
                          (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([[4, 5], [6]], pa.large_list(pa.int16())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
Example #44
    assert result2.equals(arr)


def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'), type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize('narr', [
    np.arange(10, dtype=np.int64),
Example #45
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow_metadata.proto.v0 import schema_pb2

_TF_TYPE_TO_ARROW_TYPE = {
    tf.int8: pa.int8(),
    tf.int16: pa.int16(),
    tf.int32: pa.int32(),
    tf.int64: pa.int64(),
    tf.uint8: pa.uint8(),
    tf.uint16: pa.uint16(),
    tf.uint32: pa.uint32(),
    tf.uint64: pa.uint64(),
    tf.float32: pa.float32(),
    tf.float64: pa.float64(),
    tf.string: pa.large_binary(),
}

_ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32}


def _make_2d_dense_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]], dtype=tf.string)
            expected_array = pa.array([[b"1", b"2"], [b"3", b"4"]],
                                      type=pa.large_list(arrow_type))
        else:
            tensor = tf.constant([[1, 2], [3, 4]], dtype=tf_type)
Example #46
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data; to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN:
        pa.bool_(),
        _meta.BasicType.INTEGER:
        pa.int64(),
        _meta.BasicType.FLOAT:
        pa.float64(),
        _meta.BasicType.DECIMAL:
        pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING:
        pa.utf8(),
        _meta.BasicType.DATE:
        pa.date32(),
        _meta.BasicType.DATETIME:
        pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
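A hypothetical usage sketch for the two type-mapping classmethods above (assumes the module's own imports, with pyarrow as pa):

# Round trip between Python and Arrow types, per the mappings shown
assert DataMapping.python_to_arrow_type(float) == pa.float64()
assert DataMapping.arrow_to_python_type(pa.float64()) is float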
Example #47
def main():
    # https://arrow.apache.org/docs/python/api/datatypes.html
    my_schema = pa.schema([
        # skip null

        ('c_bool', pa.bool_()),

        ('c_int8', pa.int8()),
        ('c_int16', pa.int16()),
        ('c_int32', pa.int32()),
        ('c_int64', pa.int64()),

        ('c_uint8', pa.uint8()),
        ('c_uint16', pa.uint16()),
        ('c_uint32', pa.uint32()),
        ('c_uint64', pa.uint64()),

        # skip ('c_float16', pa.float16()),
        ('c_float32', pa.float32()),
        ('c_float64', pa.float64()),

        ('c_time32', pa.time32('ms')),
        ('c_time64', pa.time64('ns')),
        ('c_timestamp', pa.timestamp('ms')),
        ('c_date32', pa.date32()),
        ('c_date64', pa.date64()),

        # skip binary

        ('c_string', pa.string()),

        # skip utf8
        # skip large_binary
        # skip large_string
        # skip large_utf8

        ('c_decimal128_8_3', pa.decimal128(8, 3))

        # skip list_
        # skip  large_list
        # skip struct
        # skip dictionary
        # skip field
        # skip schema
        # skip from_numpy_dtype
    ])

    c_bool = pa.array([False, True, False], type=pa.bool_())

    c_int8 = pa.array([1, 2, 3], type=pa.int8())
    c_int16 = pa.array([1, 2, 3], type=pa.int16())
    c_int32 = pa.array([1, 2, 3], type=pa.int32())
    c_int64 = pa.array([1, 2, 3], type=pa.int64())

    c_uint8 = pa.array([1, 2, 3], type=pa.uint8())
    c_uint16 = pa.array([1, 2, 3], type=pa.uint16())
    c_uint32 = pa.array([1, 2, 3], type=pa.uint32())
    c_uint64 = pa.array([1, 2, 3], type=pa.uint64())

    # c_float16 = pa.array([np.float16(1.0), np.float16(2.0), np.float16(3.0)], type=pa.float16())
    c_float32 = pa.array([1.0, 2.0, 3.0], type=pa.float32())
    c_float64 = pa.array([1.0, 2.0, 3.0], type=pa.float64())

    c_time32 = pa.array([1, 2, 3], type=pa.time32('ms'))
    c_time64 = pa.array([1, 2, 3], type=pa.time64('ns'))
    c_timestamp = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.timestamp('ms'))
    c_date32 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date32())
    c_date64 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date64())

    c_string = pa.array(
        ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
        type=pa.string()
    )

    c_decimal128_8_3 = pa.array([1, 2, 3], type=pa.decimal128(8, 3))

    batch = pa.RecordBatch.from_arrays(
        [c_bool,
         c_int8, c_int16, c_int32, c_int64,
         c_uint8, c_uint16, c_uint32, c_uint64,
         # c_float16,
         c_float32, c_float64,
         c_time32, c_time64, c_timestamp, c_date32, c_date64,
         c_string,
         c_decimal128_8_3
         ],
        schema=my_schema
    )

    table = pa.Table.from_batches([batch])
    pq.write_table(table, 'example.parquet')
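A short read-back sketch for the file written above (pyarrow.parquet only; the column name is taken from the schema in main()):

import pyarrow.parquet as pq

table = pq.read_table('example.parquet')
assert table.column('c_float64').to_pylist() == [1.0, 2.0, 3.0]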
Example #48
    from tfx_bsl.cc.tfx_bsl_extension.arrow.table_util import TotalByteSize as _TotalByteSize
except ImportError as err:
    import sys
    sys.stderr.write(
        "Error importing tfx_bsl_extension.arrow.table_util. "
        "Some tfx_bsl functionalities are not available: {}".format(err))
# pylint: enable=g-import-not-at-top
# pytype: enable=import-error
# pylint: enable=unused-import

_EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])

_NUMPY_KIND_TO_ARROW_TYPE = {
    "i": pa.int64(),
    "u": pa.uint64(),
    "f": pa.float64(),
    "b": pa.int8(),
    "S": pa.binary(),
    "O": pa.binary(),
    "U": pa.binary(),
}


def TotalByteSize(table_or_batch: Union[pa.Table, pa.RecordBatch],
                  ignore_unsupported=False):
    """Returns the in-memory size of a record batch or a table."""
    if isinstance(table_or_batch, pa.Table):
        return sum([
            _TotalByteSize(b, ignore_unsupported)
            for b in table_or_batch.to_batches(max_chunksize=None)
        ])
Example #49
File: __init__.py Project: tnir/pandas
]

if not pa_version_under1p01:
    import pyarrow as pa

    UNSIGNED_INT_PYARROW_DTYPES = [
        pa.uint8(), pa.uint16(),
        pa.uint32(), pa.uint64()
    ]
    SIGNED_INT_PYARROW_DTYPES = [
        pa.int8(), pa.int16(),
        pa.int32(), pa.int64()
    ]
    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
Example #50
         ("d", np.array([4, 5, 6])),
         ("e", b""),
     ]),
 ],
 "type_schema":
 OrderedDict([
     ("a", int),
     ("b", float),
     ("c", unicode),
     ("d", np.ndarray),
     ("e", bytes),
 ]),
 "pyarrow_schema":
 pa.schema([
     ("a", pa.int64()),
     ("b", pa.float64()),
     ("c", pa.string()),
     ("d", pa.list_(pa.int64())),
     ("e", pa.binary()),
 ]),
 "avro_schema": {
     "namespace":
     "example.avro",
     "name":
     "User",
     "type":
     "record",
     "fields": [
         {
             "name": "a",
             "type": "int"
Example #51
import argparse
import csv
import re
import sys
from datetime import datetime
from base64 import standard_b64decode
import pyarrow as pa
import pyarrow.parquet as pq

PA_BOOL = pa.bool_()
PA_FLOAT32 = pa.float32()
PA_FLOAT64 = pa.float64()
PA_INT8 = pa.int8()
PA_INT16 = pa.int16()
PA_INT32 = pa.int32()
PA_INT64 = pa.int64()
PA_STRING = pa.string()
PA_TIMESTAMP = pa.timestamp('ns')
PA_BINARY = pa.binary()


def get_delimiter(csv_file):
    """Infer the delimiter from the file extension: tab for .tsv, else comma."""
    if csv_file.endswith('.tsv'):
        return '\t'
    return ','


def sanitize_column_name(name):
    """Lower-case a header and collapse anything non-alphanumeric to '_'."""
    cleaned = re.sub('[^a-z0-9]', '_', name.lower())
    cleaned = re.sub('__*', '_', cleaned)
    cleaned = re.sub('^_*', '', cleaned)
    return cleaned
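

# e.g. sanitize_column_name('First Name (2020)') -> 'first_name_2020_'
# (leading underscores are stripped; a trailing underscore may remain)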
Example #52
"""
Copyright (C) 2018 Anthony Potappel, The Netherlands. All Rights Reserved.
This work is licensed under the terms of the MIT license (for details, see attached LICENSE file).
"""

import pyarrow as pa

_ENDIANNESS = '<'

_DTYPES_CONV = {
    _ENDIANNESS + 'f2': pa.float16(),
    _ENDIANNESS + 'f4': pa.float32(),
    _ENDIANNESS + 'f8': pa.float64(),
    _ENDIANNESS + 'i2': pa.int16(),
    _ENDIANNESS + 'i4': pa.int32(),
    _ENDIANNESS + 'i8': pa.int64(),
    _ENDIANNESS + 'u2': pa.uint16(),
    _ENDIANNESS + 'u4': pa.uint32(),
    _ENDIANNESS + 'u8': pa.uint64(),
    '|i1': pa.int8(),
    '|u1': pa.uint8(),
}

_DTYPES_CONV_STR = {
    "float16": pa.float16(),
    "float32": pa.float32(),
    "float64": pa.float64(),
    "int16": pa.int16(),
    "int32": pa.int32(),
    "int64": pa.int64(),
    "uint16": pa.uint16(),
    "uint32": pa.uint32(),
    "uint64": pa.uint64(),
    "int8": pa.int8(),
    "uint8": pa.uint8(),
}
Example #53
     [2, 100, -10],
     [2, None, -10],
     lambda: choices(list(range(100)), k=10),
 ),
 FletcherTestType(
     pa.int64(),
     # Use small values here so that np.prod stays in int64
     [2, 1, 3, 2, 1] * 20,
     [None, 1],
     [2, 2, None, None, -100, -100, 2, 100],
     [2, 100, -10],
     [2, None, -10],
     lambda: choices(list(range(100)), k=10),
 ),
 FletcherTestType(
     pa.float64(),
     [2, 1.0, 1.0, 5.5, 6.6] * 20,
     [None, 1.1],
     [2.5, 2.5, None, None, -100.1, -100.1, 2.5, 100.1],
     [2.5, 100.99, -10.1],
     [2.5, None, -10.1],
     lambda: choices([2.5, 1.0, -1.0, 0, 66.6], k=10),
 ),
 # Most of the tests fail as assert_extension_array_equal casts to numpy object
 # arrays and on them equality is not defined.
 pytest.param(
     FletcherTestType(
         pa.list_(pa.string()),
         [["B", "C"], ["A"], [None], ["A", "A"], []] * 20,
         [None, ["A"]],
         [["B"], ["B"], None, None, ["A"], ["A"], ["B"], ["C"]],
Example #54
def create_float64():
    import numpy as np
    import pyarrow as pa
    return convert(pa.array([np.float64(1), np.float64(2)], type=pa.float64()))
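# Note: `convert` is assumed to be defined elsewhere in the source module.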
Example #55
@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected
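(For context: get_logical_type here is presumably the helper from pyarrow's pandas-compat layer that reports the pandas-facing logical type name of an Arrow type.)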
Example #56
    def __init__(
        self,
        name: str,
        values: "Union[np.array, List[Optional[Any]]]" = None,
        nullable: bool = True,
        dtype: "Optional[DataType]" = None,
    ):
        """

        Parameters
        ----------
        name
            Name of the series
        values
            Values of the series
        nullable
            If nullable:
                None values in a list will be interpreted as missing.
                NaN values in a numpy array will be interpreted as missing.
                Note that missing and NaN are not the same in Polars.
            Series creation may be faster if set to False and there are no null values.
        """
        # assume the first argument holds the values
        if values is None and not isinstance(name, str):
            values = name
            name = ""
        if values.__class__ == self.__class__:
            values.rename(name)
            self._s = values._s
            return

        self._s: PySeries
        # series path
        if isinstance(values, Series):
            self._from_pyseries(values)
            return
        elif isinstance(values, dict):
            raise ValueError(
                f"Constructing a Series with a dict is not supported for {values}"
            )
        elif isinstance(values, pa.Array):
            self._s = self.from_arrow(name, values)._s
            return

        # castable to numpy
        if not isinstance(values, np.ndarray) and not nullable:
            values = np.array(values)

        if dtype is not None:
            if dtype == Int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == Int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == Int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == Int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == UInt8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == UInt16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == UInt32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == UInt64:
                self._s = PySeries.new_u64(name, values)
            elif dtype == Float32:
                self._s = PySeries.new_f32(name, values)
            elif dtype == Float64:
                self._s = PySeries.new_f64(name, values)
            elif dtype == Boolean:
                self._s = PySeries.new_bool(name, values)
            elif dtype == Utf8:
                self._s = PySeries.new_str(name, values)
            else:
                raise ValueError(
                    f"dtype {dtype} not yet supported when creating a Series"
                )
            return

        # numpy path
        if isinstance(values, np.ndarray):
            if not values.data.contiguous:
                values = np.array(values)
            if len(values.shape) > 1:
                self._s = PySeries.new_object(name, values)
                return
            dtype = values.dtype
            if dtype == np.int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == np.int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == np.int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == np.int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == np.float32:
                self._s = PySeries.new_f32(name, values, nullable)
            elif dtype == np.float64:
                self._s = PySeries.new_f64(name, values, nullable)
            elif isinstance(values[0], str):
                self._s = PySeries.new_str(name, values)
            elif dtype == np.bool_:
                self._s = PySeries.new_bool(name, values)
            elif dtype == np.uint8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == np.uint16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == np.uint32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == np.uint64:
                self._s = PySeries.new_u64(name, values)
            else:
                self._s = PySeries.new_object(name, values)
            return
        # list path
        else:
            dtype = _find_first_non_none(values)
            # Order matters: booleans are instances of int in Python.
            if isinstance(dtype, bool):
                self._s = PySeries.new_opt_bool(name, values)
            elif isinstance(dtype, int):
                self._s = PySeries.new_opt_i64(name, values)
            elif isinstance(dtype, float):
                self._s = PySeries.new_opt_f64(name, values)
            elif isinstance(dtype, str):
                self._s = PySeries.new_str(name, values)
            # make list array
            elif isinstance(dtype, (list, tuple)):
                value_dtype = _find_first_non_none(dtype)

                # we can expect a failure if we pass `[[12], "foo", 9]`
                # in that case we catch the exception and create an object type
                try:
                    if isinstance(value_dtype, bool):
                        arrow_array = pa.array(values, pa.large_list(pa.bool_()))
                    elif isinstance(value_dtype, int):
                        arrow_array = pa.array(values, pa.large_list(pa.int64()))
                    elif isinstance(value_dtype, float):
                        arrow_array = pa.array(values, pa.large_list(pa.float64()))
                    elif isinstance(value_dtype, str):
                        arrow_array = pa.array(values, pa.large_list(pa.large_utf8()))
                    else:
                        self._s = PySeries.new_object(name, values)
                        return
                    self._s = Series.from_arrow(name, arrow_array)._s

                except pa.lib.ArrowInvalid:
                    self._s = PySeries.new_object(name, values)
            else:
                self._s = PySeries.new_object(name, values)
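
For reference, a sketch of the construction paths the dispatch above covers; dtype names like `Int32` are the polars dtype objects imported elsewhere in this module (illustration only):

s1 = Series("a", [1, 2, None])            # list path -> PySeries.new_opt_i64
s2 = Series("b", np.array([1.0, 2.0]))    # numpy path -> PySeries.new_f64
s3 = Series("c", [1, 2, 3], dtype=Int32)  # explicit dtype path -> PySeries.new_i32
s4 = Series("d", [["x"], ["y", "z"]])     # nested list path -> pa.large_list(pa.large_utf8())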
Example #57
 "INT64",
 pyarrow.int64().id: "INT64",
 pyarrow.uint8().id: "INT64",
 pyarrow.uint16().id: "INT64",
 pyarrow.uint32().id: "INT64",
 pyarrow.uint64().id: "INT64",
 pyarrow.float16().id: "FLOAT64",
 pyarrow.float32().id: "FLOAT64",
 pyarrow.float64().id: "FLOAT64",
 pyarrow.time32("ms").id: "TIME",
 pyarrow.time64("ns").id: "TIME",
 pyarrow.timestamp("ns").id: "TIMESTAMP",
 pyarrow.date32().id: "DATE",
 pyarrow.date64().id: "DATETIME",  # because millisecond resolution
 pyarrow.binary().id: "BYTES",
 pyarrow.string().id: "STRING",  # also an alias for pyarrow.utf8()
Example #58
def test_sql(parameters, db_type):
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}")
    wr.db.to_sql(
        df=df,
        con=engine,
        name="test_sql",
        schema=parameters[db_type]["schema"],
        if_exists="replace",
        index=False,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=parameters[db_type]["host"],
        port=parameters[db_type]["port"],
        database=parameters[db_type]["database"],
        user=parameters["user"],
        password=parameters["password"],
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name="test_sql",
            schema=parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table="test_sql",
                                  schema=schema,
                                  index_col="index")
        assert len(df.index) == 3
        assert len(df.columns) == 1
Example #59
def test_is_floating():
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert types.is_floating(t)

    assert not types.is_floating(pa.int32())
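
Neighboring predicates in pyarrow.types behave analogously (a reference sketch):

import pyarrow as pa
from pyarrow import types

assert types.is_integer(pa.int32())
assert not types.is_floating(pa.decimal128(18, 3))  # decimals are not floating-point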
Example #60
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
duration_types = st.builds(pa.duration,