def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass:
        pass

    # One call per raises block, otherwise only the first conversion
    # would ever be exercised.
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=ns)
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date32())
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date64())
def test_timestamps_notimezone_nulls(self):
    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123',
            None,
            '2010-08-13T05:46:57.437'],
            dtype='datetime64[ms]')
    })
    field = pa.field('datetime64', pa.timestamp('ms'))
    schema = pa.schema([field])
    self._check_pandas_roundtrip(
        df,
        timestamps_to_ms=True,
        expected_schema=schema,
    )

    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123456789',
            None,
            '2010-08-13T05:46:57.437699912'],
            dtype='datetime64[ns]')
    })
    field = pa.field('datetime64', pa.timestamp('ns'))
    schema = pa.schema([field])
    self._check_pandas_roundtrip(
        df,
        timestamps_to_ms=False,
        expected_schema=schema,
    )
def test_sequence_timestamp_with_unit():
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_cast_timestamp_unit():
    # ARROW-1680
    val = datetime.datetime.now()
    s = pd.Series([val])
    s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')

    arr = pa.Array.from_pandas(s_nyc, type=us_with_tz)

    # ARROW-1906
    assert arr.type == us_with_tz

    arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us'))

    assert arr[0].as_py() == s_nyc[0]
    assert arr2[0].as_py() == s[0]

    # Disallow truncation
    arr = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    expected = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        arr.cast(target)

    result = arr.cast(target, safe=False)
    assert result.equals(expected)
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
def test_timestamp():
    for unit in ('s', 'ms', 'us', 'ns'):
        for tz in (None, 'UTC', 'Europe/Paris'):
            ty = pa.timestamp(unit, tz=tz)
            assert ty.unit == unit
            assert ty.tz == tz

    for invalid_unit in ('m', 'arbit', 'rary'):
        with pytest.raises(ValueError, match='Invalid TimeUnit string'):
            pa.timestamp(invalid_unit)
def test_type_from_numpy_dtype_timestamps():
    cases = [
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt
def test_cast_timestamp_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.timestamp('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)
def test_timestamp(self):
    import pandas as pd
    arr = pd.date_range('2000-01-01 12:34:56', periods=10).values

    units = ['ns', 'us', 'ms', 's']

    for i, unit in enumerate(units):
        dtype = 'datetime64[{0}]'.format(unit)
        arrow_arr = pa.Array.from_pandas(arr.astype(dtype))
        expected = pd.Timestamp('2000-01-01 12:34:56')

        assert arrow_arr[0].as_py() == expected
        assert arrow_arr[0].value * 1000**i == expected.value

        tz = 'America/New_York'
        arrow_type = pa.timestamp(unit, tz=tz)

        dtype = 'datetime64[{0}]'.format(unit)
        arrow_arr = pa.Array.from_pandas(arr.astype(dtype),
                                         type=arrow_type)
        expected = (pd.Timestamp('2000-01-01 12:34:56')
                    .tz_localize('utc')
                    .tz_convert(tz))

        assert arrow_arr[0].as_py() == expected
        assert arrow_arr[0].value * 1000**i == expected.value
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):
        if col_meta['pandas_type'] == 'datetimetz':
            col = table[i]
            converted = col.to_pandas()
            tz = col_meta['metadata']['timezone']
            tz_aware_type = pa.timestamp('ns', tz=tz)
            with_metadata = pa.Array.from_pandas(converted.values,
                                                 type=tz_aware_type)

            field = pa.field(schema[i].name, tz_aware_type)
            modified_columns[i] = pa.Column.from_array(field,
                                                       with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
def test_datetime_subclassing():
    class MyDate(datetime.date):
        pass

    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    class MyDatetime(datetime.datetime):
        pass

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
def test_timestamp_units_from_list(unit):
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)

    a1 = pa.array([x])
    a2 = pa.array([x], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_simple_timestamps(self):
    # Infer a timestamp column
    rows = b"a,b\n1970,1970-01-01\n1989,1989-07-14\n"
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.int64()),
                        ('b', pa.timestamp('s'))])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1970, 1989],
        'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
    }
def test_array_from_list_of_timestamps(unit):
    n = np.datetime64('NaT', unit)
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
    y = np.datetime64('2018-11-22 12:24:48.111111111', unit)

    a1 = pa.array([n, x, y])
    a2 = pa.array([n, x, y], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
    _assert_unsupported(a7)
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def _from_jvm_timestamp_type(jvm_type):
    """
    Convert a JVM timestamp type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Timestamp

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    timezone = jvm_type.getTimezone()
    if time_unit == 'SECOND':
        return pa.timestamp('s', tz=timezone)
    elif time_unit == 'MILLISECOND':
        return pa.timestamp('ms', tz=timezone)
    elif time_unit == 'MICROSECOND':
        return pa.timestamp('us', tz=timezone)
    elif time_unit == 'NANOSECOND':
        return pa.timestamp('ns', tz=timezone)
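# The branch chain above is a one-to-one mapping from JVM TimeUnit names to
# Arrow unit strings. A minimal table-driven sketch of the same idea; the
# helper name and dict below are illustrative, not part of the original
# module:
_JVM_UNIT_TO_ARROW = {
    'SECOND': 's',
    'MILLISECOND': 'ms',
    'MICROSECOND': 'us',
    'NANOSECOND': 'ns',
}


def _from_jvm_unit_name(unit_name, timezone=None):
    # e.g. _from_jvm_unit_name('MILLISECOND', 'UTC') -> timestamp[ms, tz=UTC]
    return pa.timestamp(_JVM_UNIT_TO_ARROW[unit_name], tz=timezone)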
def test_timestamps_notimezone_no_nulls(self):
    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123456789',
            '2006-01-13T12:34:56.432539784',
            '2010-08-13T05:46:57.437699912'],
            dtype='datetime64[ns]')
    })
    field = pa.field('datetime64', pa.timestamp('ns'))
    schema = pa.schema([field])
    self._check_pandas_roundtrip(
        df,
        expected_schema=schema,
    )
def get_datetimetz_type(values, dtype, type_):
    if values.dtype.type != np.datetime64:
        return values, type_

    if _pandas_api.is_datetimetz(dtype) and type_ is None:
        # If no user type passed, construct a tz-aware timestamp type
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
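# Hedged usage sketch of the helper above: a tz-aware pandas dtype carries
# both the unit and the zone, which map directly onto pa.timestamp(unit, tz).
# Assumes pandas and the surrounding module's imports are available.
s = pd.Series(pd.date_range('2020-01-01', periods=2, tz='Europe/Paris'))
_, inferred = get_datetimetz_type(s.values, s.dtype, None)
assert inferred == pa.timestamp('ns', tz='Europe/Paris')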
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def test_timestamp(self):
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699),
    ]
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 4
    assert arr.type == pyarrow.timestamp()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.datetime(2007, 7, 13,
                                               1, 23, 34, 123456)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.datetime(2006, 1, 13,
                                               12, 34, 56, 432539)
    assert arr[3].as_py() == datetime.datetime(2010, 8, 13,
                                               5, 46, 57, 437699)
def test_timestamps_notimezone_no_nulls(self):
    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123',
            '2006-01-13T12:34:56.432',
            '2010-08-13T05:46:57.437'],
            dtype='datetime64[ms]')
    })
    field = A.Field.from_py('datetime64', A.timestamp('ms'))
    schema = A.Schema.from_fields([field])
    self._check_pandas_roundtrip(df, timestamps_to_ms=True,
                                 expected_schema=schema)

    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123456789',
            '2006-01-13T12:34:56.432539784',
            '2010-08-13T05:46:57.437699912'],
            dtype='datetime64[ns]')
    })
    field = A.Field.from_py('datetime64', A.timestamp('ns'))
    schema = A.Schema.from_fields([field])
    self._check_pandas_roundtrip(df, timestamps_to_ms=False,
                                 expected_schema=schema)
def get_datetimetz_type(values, dtype, type_):
    from pyarrow.compat import DatetimeTZDtype

    if values.dtype.type != np.datetime64:
        return values, type_

    if isinstance(dtype, DatetimeTZDtype):
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    index_columns = pandas_metadata['index_columns']
    n_index_levels = len(index_columns)
    n_columns = len(pandas_metadata['columns']) - n_index_levels

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):

        raw_name = col_meta.get('field_name')
        if not raw_name:
            # deal with metadata written with arrow < 0.8
            raw_name = col_meta['name']
            if i >= n_columns:
                # index columns
                raw_name = index_columns[i - n_columns]
            if raw_name is None:
                raw_name = 'None'

        idx = schema.get_field_index(raw_name)
        if idx != -1:
            if col_meta['pandas_type'] == 'datetimetz':
                col = table[idx]
                converted = col.to_pandas()
                tz = col_meta['metadata']['timezone']
                tz_aware_type = pa.timestamp('ns', tz=tz)
                with_metadata = pa.Array.from_pandas(converted.values,
                                                     type=tz_aware_type)

                field = pa.field(schema[idx].name, tz_aware_type)
                modified_columns[idx] = pa.Column.from_array(field,
                                                             with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
def test_sequence_numpy_timestamp():
    data = [
        np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
        None,
        np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
        np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
    ]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.timestamp('us')
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
    assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
def test_coerce_timestamps(tmpdir):
    from collections import OrderedDict
    # ARROW-622
    arrays = OrderedDict()
    fields = [pa.field('datetime64', pa.list_(pa.timestamp('ms')))]
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='us')
    table_read = _read_table(filename.strpath)
    df_read = table_read.to_pandas()

    df_expected = df.copy()
    for i, x in enumerate(df_expected['datetime64']):
        if isinstance(x, np.ndarray):
            df_expected['datetime64'][i] = x.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename.strpath, version="2.0",
                     coerce_timestamps='unknown')
def _parquet_schema(dataframe: pd.DataFrame, custom_redshift_columns: dict = None):
    """Translates pandas dtypes to PyArrow types and creates a Schema from them.

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional): This dictionary contains
            custom column data type definitions for redshift. The params
            should be formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        PyArrow Schema of the given dataframe
        Potentially modified Dataframe
    """
    fields = []
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal
                # objects.
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the
                    # precision and scale from the custom_redshift_columns
                    # dictionary to use when converting to PyArrow's
                    # decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(
                        s[s.find('DECIMAL(') + len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(
                        s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        elif dtype.startswith('Int32'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int32()
        elif dtype.startswith('Int64'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int64()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped "
                "to Parquet using s3parq.")
        fields.append(pa.field(col, pa_type))
    return (pa.schema(fields=fields), dataframe)
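# Hedged usage sketch (column names are illustrative): an int64 column and a
# datetime64[ns] column resolve through the dtype branches above.
df = pd.DataFrame({
    'id': [1, 2, 3],
    'when': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
})
schema, df = _parquet_schema(df)
assert schema.field('id').type == pa.int64()
assert schema.field('when').type == pa.timestamp('ns')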
def test_type_timestamp_with_tz():
    tz = 'America/Los_Angeles'
    t = pa.timestamp('ns', tz=tz)
    assert t.unit == 'ns'
    assert t.tz == tz
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)
floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=0, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
timestamp_types = st.sampled_from([
    pa.timestamp('s'),
    pa.timestamp('ms'),
    pa.timestamp('us'),
    pa.timestamp('ns')
])
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())


@st.defines_strategy
def fields(type_strategy=primitive_types):
    return st.builds(pa.field,
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Specifies the pyarrow data types mapping to corresponding Pandas data types.

    Args:
        pd_boolean: if not None, use the new Pandas bool type. Defaults to None.
        pd_integer: if not None, use the new Pandas nullable integer type
            rather than defaulting to floats. Defaults to None.
        pd_string: if not None, use the new Pandas str type. Defaults to None.
        pd_date_type: Defaults to None.
        pd_timestamp_type: Defaults to None.

    Returns:
        Type mappings between pyarrow and pandas data types.
    """
    tm = {}
    if pd_boolean:
        bool_map = {pa.bool_(): pd.BooleanDtype()}
        tm = {**tm, **bool_map}
    if pd_string:
        string_map = {pa.string(): pd.StringDtype()}
        tm = {**tm, **string_map}

    if pd_integer:
        int_map = {
            pa.int8(): pd.Int64Dtype(),
            pa.int16(): pd.Int64Dtype(),
            pa.int32(): pd.Int64Dtype(),
            pa.int64(): pd.Int64Dtype(),
            pa.uint8(): pd.Int64Dtype(),
            pa.uint16(): pd.Int64Dtype(),
            pa.uint32(): pd.Int64Dtype(),
            pa.uint64(): pd.Int64Dtype(),
        }
        tm = {**tm, **int_map}
    else:
        # No brackets for either keys or values in this dictionary
        # This lets types_mapper understand the numpy data type
        float_map = {
            pa.int8: np.float64,
            pa.int16: np.float64,
            pa.int32: np.float64,
            pa.int64: np.float64,
            pa.uint8: np.float64,
            pa.uint16: np.float64,
            pa.uint32: np.float64,
            pa.uint64: np.float64,
        }
        tm = {**tm, **float_map}

    if pd_date_type == "pd_period":
        date_map = {pa.date64(): pd.PeriodDtype("ms")}
        tm = {**tm, **date_map}

    if pd_timestamp_type == "pd_period":
        datetime_map = {
            pa.timestamp("s"): pd.PeriodDtype("s"),
            pa.timestamp("ms"): pd.PeriodDtype("ms"),
            pa.timestamp("us"): pd.PeriodDtype("us"),
            pa.timestamp("ns"): pd.PeriodDtype("ns"),
        }
        tm = {**tm, **datetime_map}

    if tm:
        return tm.get
    else:
        return None
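# Hedged usage sketch: the returned `tm.get` plugs straight into pyarrow's
# Table.to_pandas(types_mapper=...), which calls it once per Arrow type and
# falls back to the default conversion when it returns None. Column names
# below are illustrative.
mapper = generate_type_mapper(pd_boolean=True, pd_string=True,
                              pd_integer=True)
tbl = pa.table({'ok': [True, None], 'name': ['a', 'b'], 'n': [1, 2]})
df = tbl.to_pandas(types_mapper=mapper)
assert str(df['ok'].dtype) == 'boolean'
assert str(df['name'].dtype) == 'string'
assert str(df['n'].dtype) == 'Int64'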
"Trying to update an index with the wrong column. Got `another_col` but expected `col`" ) @pytest.mark.parametrize( "dtype", [ pa.binary(), pa.bool_(), pa.date32(), pa.float32(), pa.float64(), pa.int64(), pa.int8(), pa.string(), pa.timestamp("ns"), ], ) def test_index_empty(store, dtype): storage_key = "dataset_uuid/some_index.parquet" index1 = ExplicitSecondaryIndex(column="col", index_dct={}, dtype=dtype, index_storage_key=storage_key) key1 = index1.store(store, "dataset_uuid") index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store) assert index1 == index2 index3 = pickle.loads(pickle.dumps(index1))
def __init__(  # pylint: disable=too-many-locals,too-many-branches
    self,
    data: DbapiResult,
    cursor_description: DbapiDescription,
    db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
):
    self.db_engine_spec = db_engine_spec
    data = data or []
    column_names: List[str] = []
    pa_data: List[pa.Array] = []
    deduped_cursor_desc: List[Tuple[Any, ...]] = []
    numpy_dtype: List[Tuple[str, ...]] = []
    stringified_arr: np.ndarray

    if cursor_description:
        # get deduped list of column names
        column_names = dedup([col[0] for col in cursor_description])

        # fix cursor descriptor with the deduped names
        deduped_cursor_desc = [
            tuple([column_name, *list(description)[1:]])
            for column_name, description in zip(column_names, cursor_description)
        ]

        # generate numpy structured array dtype
        numpy_dtype = [(column_name, "object") for column_name in column_names]

    # only do expensive recasting if datatype is not standard list of tuples
    if data and (not isinstance(data, list) or not isinstance(data[0], tuple)):
        data = [tuple(row) for row in data]
    array = np.array(data, dtype=numpy_dtype)
    if array.size > 0:
        for column in column_names:
            try:
                pa_data.append(pa.array(array[column].tolist()))
            except (
                pa.lib.ArrowInvalid,
                pa.lib.ArrowTypeError,
                pa.lib.ArrowNotImplementedError,
                TypeError,  # this is super hacky,
                # https://issues.apache.org/jira/browse/ARROW-7855
            ):
                # attempt serialization of values as strings
                stringified_arr = stringify_values(array[column])
                pa_data.append(pa.array(stringified_arr.tolist()))

    if pa_data:  # pylint: disable=too-many-nested-blocks
        for i, column in enumerate(column_names):
            if pa.types.is_nested(pa_data[i].type):
                # TODO: revisit nested column serialization once nested types
                # are added as a natively supported column type in Superset
                # (superset.utils.core.DbColumnType).
                stringified_arr = stringify_values(array[column])
                pa_data[i] = pa.array(stringified_arr.tolist())
            elif pa.types.is_temporal(pa_data[i].type):
                # workaround for bug converting
                # `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
                # related: https://issues.apache.org/jira/browse/ARROW-5248
                sample = self.first_nonempty(array[column])
                if sample and isinstance(sample, datetime.datetime):
                    try:
                        if sample.tzinfo:
                            tz = sample.tzinfo
                            series = pd.Series(array[column],
                                               dtype="datetime64[ns]")
                            series = pd.to_datetime(series).dt.tz_localize(tz)
                            pa_data[i] = pa.Array.from_pandas(
                                series, type=pa.timestamp("ns", tz=tz))
                    except Exception as ex:  # pylint: disable=broad-except
                        logger.exception(ex)

    self.table = pa.Table.from_arrays(pa_data, names=column_names)
    self._type_dict: Dict[str, Any] = {}
    try:
        # The driver may not be passing a cursor.description
        self._type_dict = {
            col: db_engine_spec.get_datatype(deduped_cursor_desc[i][1])
            for i, col in enumerate(column_names)
            if deduped_cursor_desc
        }
    except Exception as ex:  # pylint: disable=broad-except
        logger.exception(ex)
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    'total_amt': 'total_amount',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'tpep_pickup_datetime': 'pickup_datetime',
    'trip_distance': 'trip_distance',
    'trip_dropoff_datetime': 'dropoff_datetime',
    'trip_pickup_datetime': 'pickup_datetime',
    'vendor_id': 'vendor',
    'vendor_name': 'vendor',
    'vendorid': 'vendor',
    'trip_type': 'trip_type',
    'lpep_dropoff_datetime': 'dropoff_datetime',
    'lpep_pickup_datetime': 'pickup_datetime'
}

arrow_schema = pa.schema([
    ('pickup_datetime', pa.timestamp('ns')),
    ('dropoff_datetime', pa.timestamp('ns')),
    ('store_and_forward', pa.int8()),
    ('passenger_count', pa.int8()),
    ('trip_distance', pa.float32()),
    ('fare_amount', pa.float32()),
    ('tip_amount', pa.float32()),
    ('total_amount', pa.float32()),
    ('payment_type', pa.string()),
    ('trip_type', pa.string()),
    ('company', pa.string()),
    ('trip_duration_minutes', pa.float32()),
    ('year', pa.int16()),
    ('pickup_borough', pa.string()),
    ('pickup_zone', pa.string()),
    ('pickup_location_id', pa.int16()),
def test_timestamp_restore_timezone():
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp('ms', tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)
def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]', 'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2],
                                 ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
import vaex.utils

supported_arrow_array_types = (pa.Array, pa.ChunkedArray)
supported_array_types = (np.ndarray, ) + supported_arrow_array_types
string_types = [pa.string(), pa.large_string()]

_type_names_int = ["int8", "int16", "int32", "int64",
                   "uint8", "uint16", "uint32", "uint64"]
_type_names = ["float64", "float32"] + _type_names_int

map_arrow_to_numpy = {
    getattr(pa, name)(): np.dtype(name) for name in _type_names
}
map_arrow_to_numpy[pa.bool_()] = np.dtype("?")
for unit in 's ms us ns'.split():
    map_arrow_to_numpy[pa.timestamp(unit)] = np.dtype(f"datetime64[{unit}]")


def full(n, value, dtype):
    from .datatype import DataType
    dtype = DataType(dtype)
    values = np.full(n, value, dtype=dtype.numpy)
    if dtype.is_arrow:
        return pa.array(values)
    else:
        return values


def is_arrow_array(ar):
    return isinstance(ar, supported_arrow_array_types)
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {"name": "my_double", "type": "float64", "nullable": True},
            {"name": "my_date", "type": "date64"},
            {"name": "my_decimal", "type": "decimal128(10,2)"},
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [pa.int64(), pa.float64(), pa.date64(),
                      pa.decimal128(10, 2)]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
("int32", pa.int32()), ("int64", pa.int64()), ("uint8", pa.uint8()), ("uint16", pa.uint16()), ("uint32", pa.uint32()), ("uint64", pa.uint64()), ("float16", pa.float16()), ("float32", pa.float32()), ("float64", pa.float64()), ("decimal128(38,1)", pa.decimal128(38, 1)), ("decimal128(1,2)", pa.decimal128(1, 2)), ("time32(s)", pa.time32("s")), ("time32(ms)", pa.time32("ms")), ("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")), ("timestamp(ns)", pa.timestamp("ns")), ("date32", pa.date32()), ("date64", pa.date64()), ("string", pa.string()), ("large_string", pa.large_string()), ("utf8", pa.utf8()), ("large_utf8", pa.large_utf8()), ("binary", pa.binary()), ("binary(128)", pa.binary(128)), ("large_binary", pa.large_binary()), ("struct<num:int64>", pa.struct([("num", pa.int64())])), ("list<int64>", pa.list_(pa.int64())), ("list_<list<int64>>", pa.list_(pa.list_(pa.int64()))),
def test_get_eq_func():
    for t in [
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
    ]:
        assert not get_eq_func(t)(0, 1)
        assert not get_eq_func(t)(None, 1)
        assert get_eq_func(t)(1, 1)
        assert get_eq_func(t)(None, None)

    t = pa.null()
    assert get_eq_func(t)("0", "1")
    assert get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)

    t = pa.string()
    assert not get_eq_func(t)("0", "1")
    assert not get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)

    t = pa.bool_()
    assert not get_eq_func(t)(False, True)
    assert not get_eq_func(t)(None, False)
    assert not get_eq_func(t)(None, True)
    assert get_eq_func(t)(True, True)
    assert get_eq_func(t)(False, False)
    assert get_eq_func(t)(None, None)

    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert not get_eq_func(t)(0.0, 1.1)
        assert get_eq_func(t)(1.1, 1.1)
        assert get_eq_func(t)(None, float("nan"))
        for n in [None, float("nan"), float("inf"), float("-inf")]:
            assert not get_eq_func(t)(None, 1.1)
            assert get_eq_func(t)(None, None)

    for t in [pa.timestamp("ns")]:
        for n in [None, pd.NaT]:
            assert not get_eq_func(t)(datetime(2020, 1, 1, 0),
                                      datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1, 1),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    assert get_eq_func(pa.timestamp("ns"))(None, pd.NaT)

    for t in [pa.date32()]:
        for n in [None, pd.NaT]:
            assert get_eq_func(t)(datetime(2020, 1, 1, 0),
                                  datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(datetime(2020, 1, 1),
                                      datetime(2020, 1, 2).date())
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1).date(),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)

    t = pa.struct([pa.field("a", pa.int32())])
    assert not get_eq_func(t)(dict(a=0), dict(a=1))
    assert not get_eq_func(t)(None, dict(a=1))
    assert get_eq_func(t)(dict(a=1), dict(a=1))
    assert get_eq_func(t)(None, None)

    t = pa.list_(pa.int32())
    assert not get_eq_func(t)([0], [1])
    assert not get_eq_func(t)(None, [1])
    assert get_eq_func(t)([1], [1])
    assert get_eq_func(t)(None, None)
def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.string())
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)
def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2],
                                      pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    # time64 only supports 'us' and 'ns' units
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('us'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]
def _convert_data_with_schema(data, schema, date_format=None,
                              field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []

    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col

    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t,
                                                         format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(
                pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the
        # exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)

    return pa.RecordBatch.from_arrays(array_data, schema_names)
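# Hedged usage sketch (schema and rows are illustrative): timestamp columns
# go through the pandas conversion branch above, floats through the explicit
# float64 branch.
schema = pa.schema([('ts', pa.timestamp('ns')), ('price', pa.float64())])
batch = _convert_data_with_schema(
    [{'ts': '2020-01-01 00:00:00', 'price': 1.5}],
    schema,
    date_format='%Y-%m-%d %H:%M:%S',
)
assert batch.num_rows == 1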
def pyarrow_timestamp():
    return pyarrow.timestamp("us", tz="UTC")
def test_cell_is_null_timestamp():
    _assert_condition_mask(
        {"A": pa.array([datetime.datetime.now(), None], pa.timestamp("ns"))},
        CELL("is_null", "A"),
        "01",
    )
def read_type(doc):
    t = doc[TYPE]
    if PARAM in doc:
        tp = doc[PARAM]
    else:
        tp = None

    if t == 'null':
        return pyarrow.null()
    if t == 'bool':
        return pyarrow.bool_()
    if t == 'int8':
        return pyarrow.int8()
    if t == 'int16':
        return pyarrow.int16()
    if t == 'int32':
        return pyarrow.int32()
    if t == 'int64':
        return pyarrow.int64()
    if t == 'uint8':
        return pyarrow.uint8()
    if t == 'uint16':
        return pyarrow.uint16()
    if t == 'uint32':
        return pyarrow.uint32()
    if t == 'uint64':
        return pyarrow.uint64()
    if t == 'float16':
        return pyarrow.float16()
    if t == 'float32':
        return pyarrow.float32()
    if t == 'float64':
        return pyarrow.float64()
    if t == 'date[d]':
        return pyarrow.date32()
    if t == 'date[ms]':
        return pyarrow.date64()
    if t == 'timestamp[s]':
        return pyarrow.timestamp('s')
    if t == 'timestamp[ms]':
        return pyarrow.timestamp('ms')
    if t == 'timestamp[us]':
        return pyarrow.timestamp('us')
    if t == 'timestamp[ns]':
        return pyarrow.timestamp('ns')
    if t == 'time[s]':
        return pyarrow.time32('s')
    if t == 'time[ms]':
        return pyarrow.time32('ms')
    if t == 'time[us]':
        return pyarrow.time64('us')
    if t == 'time[ns]':
        return pyarrow.time64('ns')
    if t == 'utf8':
        return pyarrow.utf8()
    if t == 'bytes':
        return pyarrow.binary()
    if t == 'factor':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, False)
    if t == 'ordered':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, True)
    if t == 'opaque':
        return pyarrow.binary(tp)
    if t == 'list':
        return pyarrow.list_(read_type(tp))
    if t == 'struct':
        return pyarrow.struct(
            [pyarrow.field(f[NAME], read_type(f)) for f in tp])

    raise ValueError(f'{t} is not supported BSON DataFrame type')
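# Minimal usage sketch, keyed by the module's own TYPE constant; the decoded
# types compare equal to the pyarrow factories:
assert read_type({TYPE: 'timestamp[ms]'}) == pyarrow.timestamp('ms')
assert read_type({TYPE: 'date[d]'}) == pyarrow.date32()
assert read_type({TYPE: 'factor'}) == pyarrow.dictionary(
    pyarrow.int32(), pyarrow.utf8(), False)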
    arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'),
                    type='float32')
    assert arr3.type == 'float32'
    assert arr3.null_count == 0


def test_array_from_numpy_datetimeD():
    arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')

    result = pa.array(arr)
    expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32())
    assert result.equals(expected)


@pytest.mark.parametrize(('dtype', 'type'), [
    ('datetime64[s]', pa.timestamp('s')),
    ('datetime64[ms]', pa.timestamp('ms')),
    ('datetime64[us]', pa.timestamp('us')),
    ('datetime64[ns]', pa.timestamp('ns'))
])
def test_array_from_numpy_datetime(dtype, type):
    data = [
        None,
        datetime.datetime(2017, 4, 4, 12, 11, 10),
        datetime.datetime(2018, 1, 1, 0, 2, 0)
    ]

    # from numpy array
    arr = pa.array(np.array(data, dtype=dtype))
    expected = pa.array(data, type=type)
    assert arr.equals(expected)
def test_complex_as_arrow(self, arrow_cursor):
    table = arrow_cursor.execute(
        """
        SELECT
          col_boolean
          ,col_tinyint
          ,col_smallint
          ,col_int
          ,col_bigint
          ,col_float
          ,col_double
          ,col_string
          ,col_varchar
          ,col_timestamp
          ,CAST(col_timestamp AS time) AS col_time
          ,col_date
          ,col_binary
          ,col_array
          ,CAST(col_array AS json) AS col_array_json
          ,col_map
          ,CAST(col_map AS json) AS col_map_json
          ,col_struct
          ,col_decimal
        FROM one_row_complex
        """
    ).as_arrow()
    assert table.shape[0] == 1
    assert table.shape[1] == 19
    assert table.schema == pa.schema([
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int8()),
        pa.field("col_smallint", pa.int16()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ms")),
        pa.field("col_time", pa.string()),
        pa.field("col_date", pa.timestamp("ms")),
        pa.field("col_binary", pa.string()),
        pa.field("col_array", pa.string()),
        pa.field("col_array_json", pa.string()),
        pa.field("col_map", pa.string()),
        pa.field("col_map_json", pa.string()),
        pa.field("col_struct", pa.string()),
        pa.field("col_decimal", pa.string()),
    ])
    assert [row for row in zip(*table.to_pydict().values())] == [(
        True,
        127,
        32767,
        2147483647,
        9223372036854775807,
        0.5,
        0.25,
        "a string",
        "varchar",
        datetime(2017, 1, 1, 0, 0, 0),
        "00:00:00.000",
        datetime(2017, 1, 2, 0, 0, 0),
        "31 32 33",
        "[1, 2]",
        "[1,2]",
        "{1=2, 3=4}",
        '{"1":2,"3":4}',
        "{a=1, b=2}",
        "0.1",
    )]
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]', 'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
    _assert_unsupported(a7)
def pyarrow_datetime():
    return pyarrow.timestamp("us", tz=None)
def as_column(arbitrary, nan_as_null=True):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary,
                                         dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(
                np.empty(0, dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.core.common.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary))

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            data = as_column(pa.array(arbitrary))

    return data
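# Hedged usage sketch of the pa.TimestampArray branch above: any Arrow
# timestamp input is cast to millisecond precision, so the resulting column
# is backed by an 'M8[ms]' buffer. Assumes the module's own imports;
# `as_column` is the function defined above.
arr = pa.array([pd.Timestamp('2020-01-01 12:00')], type=pa.timestamp('us'))
col = as_column(arr)
assert col.dtype == np.dtype('M8[ms]')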
def test_complex_unload_as_arrow(self, arrow_cursor):
    # NOT_SUPPORTED: Unsupported Hive type: time
    # NOT_SUPPORTED: Unsupported Hive type: json
    table = arrow_cursor.execute(
        """
        SELECT
          col_boolean
          ,col_tinyint
          ,col_smallint
          ,col_int
          ,col_bigint
          ,col_float
          ,col_double
          ,col_string
          ,col_varchar
          ,col_timestamp
          ,col_date
          ,col_binary
          ,col_array
          ,col_map
          ,col_struct
          ,col_decimal
        FROM one_row_complex
        """
    ).as_arrow()
    assert table.shape[0] == 1
    assert table.shape[1] == 16
    assert table.schema == pa.schema([
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array",
                 pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map",
                 pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
        pa.field(
            "col_struct",
            pa.struct([pa.field("a", pa.int32()),
                       pa.field("b", pa.int32())]),
        ),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ])
    assert [row for row in zip(*table.to_pydict().values())] == [(
        True,
        127,
        32767,
        2147483647,
        9223372036854775807,
        0.5,
        0.25,
        "a string",
        "varchar",
        pd.Timestamp(2017, 1, 1, 0, 0, 0),
        datetime(2017, 1, 2).date(),
        b"123",
        [1, 2],
        [(1, 2), (3, 4)],
        {"a": 1, "b": 2},
        Decimal("0.1"),
    )]
def test_make_column_timestamp_interpret_local_datetime_as_utc():
    column = make_column(
        "A", [datetime.datetime(2021, 4, 8, 13, 39, 1, 123456)])
    assert column.array.type == pa.timestamp("ns")  # no TZ info
    assert column.array.cast(pa.int64()) == pa.array([1617889141123456000])