def test_empty_cast():
    types = [
        pa.null(), pa.bool_(), pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(), pa.binary(length=4), pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    with pytest.raises(pa.ArrowException):
        class CustomClass():
            pass

        pa.array([1, CustomClass()], type=ns)
        pa.array([1, CustomClass()], type=pa.date32())
        pa.array([1, CustomClass()], type=pa.date64())
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_type_schema_pickling():
    cases = [
        pa.int8(), pa.string(), pa.binary(), pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([pa.field('a', 'int8'), pa.field('b', 'string')]),
        pa.time32('s'), pa.time64('us'), pa.date32(), pa.date64(),
        pa.timestamp('ms'), pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)
def test_date(self):
    data = [datetime.date(2000, 1, 1),
            None,
            datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.from_pylist(data)
    assert len(arr) == 4
    assert arr.type == pa.date64()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date64()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
def test_dates_from_integers(self):
    t1 = pa.date32()
    t2 = pa.date64()

    arr = np.array([17259, 17260, 17261], dtype='int32')
    arr2 = arr.astype('int64') * 86400000

    a1 = pa.array(arr, type=t1)
    a2 = pa.array(arr2, type=t2)

    expected = date(2017, 4, 3)
    assert a1[0].as_py() == expected
    assert a2[0].as_py() == expected
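# Hedged aside (not part of the test above): a minimal sketch of why these tests
# multiply by 86400000 - date32 stores days since the UNIX epoch while date64
# stores milliseconds, so the same calendar day differs by the ms-per-day factor.
import datetime
import pyarrow as pa

days_since_epoch = 17259
ms_since_epoch = days_since_epoch * 86400000  # 86,400,000 ms per day
assert pa.array([days_since_epoch], type=pa.date32())[0].as_py() == datetime.date(2017, 4, 3)
assert pa.array([ms_since_epoch], type=pa.date64())[0].as_py() == datetime.date(2017, 4, 3)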
def test_date(self):
    df = pd.DataFrame({
        'date': [datetime.date(2000, 1, 1),
                 None,
                 datetime.date(1970, 1, 1),
                 datetime.date(2040, 2, 26)]})
    table = A.Table.from_pandas(df)
    field = A.Field.from_py('date', A.date64())
    schema = A.Schema.from_fields([field])
    assert table.schema.equals(schema)
    result = table.to_pandas()
    expected = df.copy()
    expected['date'] = pd.to_datetime(df['date'])
    tm.assert_frame_equal(result, expected)
def _from_jvm_date_type(jvm_type):
    """
    Convert a JVM date type to its Python equivalent

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Date

    Returns
    -------
    typ: pyarrow.DataType
    """
    day_unit = jvm_type.getUnit().toString()
    if day_unit == 'DAY':
        return pa.date32()
    elif day_unit == 'MILLISECOND':
        return pa.date64()
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()), ('int8', pa.int8()),
        ('i2', pa.int16()), ('int16', pa.int16()),
        ('i4', pa.int32()), ('int32', pa.int32()),
        ('i8', pa.int64()), ('int64', pa.int64()),
        ('u1', pa.uint8()), ('uint8', pa.uint8()),
        ('u2', pa.uint16()), ('uint16', pa.uint16()),
        ('u4', pa.uint32()), ('uint32', pa.uint32()),
        ('u8', pa.uint64()), ('uint64', pa.uint64()),
        ('f4', pa.float32()), ('float32', pa.float32()),
        ('f8', pa.float64()), ('float64', pa.float64()),
        ('date32', pa.date32()), ('date64', pa.date64()),
        ('string', pa.string()), ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')), ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')), ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def test_date_objects_typed(self):
    arr = np.array([
        date(2017, 4, 3), None,
        date(2017, 4, 4), date(2017, 4, 5)], dtype=object)

    arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
    arr_i8 = arr_i4.astype('int64') * 86400000
    mask = np.array([False, True, False, False])

    t32 = pa.date32()
    t64 = pa.date64()

    a32 = pa.array(arr, type=t32)
    a64 = pa.array(arr, type=t64)

    a32_expected = pa.array(arr_i4, mask=mask, type=t32)
    a64_expected = pa.array(arr_i8, mask=mask, type=t64)

    assert a32.equals(a32_expected)
    assert a64.equals(a64_expected)

    # Test converting back to pandas
    colnames = ['date32', 'date64']
    table = pa.Table.from_arrays([a32, a64], colnames)
    table_pandas = table.to_pandas()

    ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
                           '2017-04-05'],
                          dtype='datetime64[D]')
                 .astype('datetime64[ns]'))
    ex_values[1] = pd.NaT.value
    expected_pandas = pd.DataFrame({'date32': ex_values,
                                    'date64': ex_values},
                                   columns=colnames)
    tm.assert_frame_equal(table_pandas, expected_pandas)
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())
[ ("i", pa.int16()), ("my_bool", pa.bool_()), ("my_nullable_bool", pa.bool_()), ("my_date", pa.date32()), ("my_datetime", pa.timestamp("ms")), ("my_int", pa.uint16()), ("my_string", pa.string()), ] ), pa.schema( [ ("i", pa.int32()), ("my_bool", pa.bool_()), ("my_nullable_bool", pa.bool_()), ("my_date", pa.date64()), ("my_datetime", pa.timestamp("us")), ("my_int", pa.uint32()), ("my_string", pa.string()), ] ), pa.schema( [ ("i", pa.int64()), ("my_bool", pa.bool_()), ("my_nullable_bool", pa.bool_()), ("my_date", pa.date64()), ("my_datetime", pa.timestamp("ns")), ("my_int", pa.uint64()), ("my_string", pa.string()), ]
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
        ',"timezone":"UTC"}'),
    (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
        '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
    (pa.date32(), '{"name":"date","unit":"DAY"}'),
    (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
    (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
    (pa.string(), '{"name":"utf8"}'),
    (pa.binary(), '{"name":"binary"}'),
    (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
    # TODO(ARROW-2609): complex types that have children
    # pa.list_(pa.int32()),
    # pa.struct([pa.field('a', pa.int32()),
    #            pa.field('b', pa.int8()),
    #            pa.field('c', pa.string())]),
    # pa.union([pa.field('a', pa.binary(10)),
    #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    # pa.union([pa.field('a', pa.binary(10)),
    #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # TODO: DictionaryType requires a vector in the type
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
def test_date_time_types(): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') a1 = pa.array(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2000-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.array(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') a4 = pa.array(data4, type=t4) t5 = pa.time64('us') a5 = pa.array(data4.astype('int64'), type=t5) t6 = pa.time32('s') a6 = pa.array(data4, type=t6) ex_t6 = pa.time32('ms') ex_a6 = pa.array(data4 * 1000, type=ex_t6) t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value data7 = np.array([start, start + 1000, start + 2000], dtype='int64') a7 = pa.array(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value data7_us = np.array([start, start + 1000, start + 2000], dtype='int64') // 1000 a7_us = pa.array(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) # date64 as date32 # time32[s] to time32[ms] # 'timestamp[ns]' to 'timestamp[us]' expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) _check_roundtrip(table, expected=expected, version='2.0') # date64 as date32 # time32[s] to time32[ms] # 'timestamp[ns]' is saved as INT96 timestamp expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) _check_roundtrip(table, expected=expected, version='2.0', use_deprecated_int96_timestamps=True) # Check that setting flavor to 'spark' uses int96 timestamps _check_roundtrip(table, expected=expected, version='2.0', flavor='spark') # Unsupported stuff def _assert_unsupported(array): table = pa.Table.from_arrays([array], ['unsupported']) buf = io.BytesIO() with pytest.raises(NotImplementedError): _write_table(table, buf, version="2.0") t7 = pa.time64('ns') a7 = pa.array(data4.astype('int64'), type=t7) _assert_unsupported(a7)
    [
        (pa.null(), 'empty'), (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'), (pa.int16(), 'int16'),
        (pa.int32(), 'int32'), (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'), (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'), (pa.date64(), 'date'),
        (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected
import pyarrow as pa

schema_fields = [
    pa.field("timestamp", pa.date64(), False),
    pa.field("timezone", pa.uint64(), False).with_metadata({
        "illex_MIN": "0", "illex_MAX": "1024"
    }),
    pa.field("vin", pa.uint64(), False),
    pa.field("odometer", pa.uint64(), False).with_metadata({
        "illex_MIN": "0", "illex_MAX": "1000"
    }),
    pa.field("hypermiling", pa.bool_(), False),
    pa.field("avgspeed", pa.uint64(), False).with_metadata({
        "illex_MIN": "0", "illex_MAX": "200"
    }),
    pa.field(
        "sec_in_band",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN": "0", "illex_MAX": "4192"
            }), 12),
        False),
    pa.field(
        "miles_in_time_range",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    parquet_compatible: bool
        Exclude types not supported by parquet
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4], None, [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, dtype=np.int64)[::2]]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], [0., 1., 2., 3., 4.],
        None, [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"], None, [b"1"], [b"1", b"2", b"3"], [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"], None, [u"1"], [u"1", u"2", u"3"], [],
    ]

    date_data = [[],
                 [date(2018, 1, 1), date(2032, 12, 30)],
                 [date(2000, 6, 7)],
                 None,
                 [date(1969, 6, 9), date(1972, 7, 3)]]
    time_data = [[time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
                 [],
                 [time(22, 5, 59)],
                 None,
                 [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]]

    temporal_pairs = [(pa.date32(), date_data),
                      (pa.date64(), date_data),
                      (pa.time32('s'), time_data),
                      (pa.time32('ms'), time_data),
                      (pa.time64('us'), time_data)]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
        return self.storage_type.num_fields


pyarrow.register_extension_type(
    AwkwardArrowType(pyarrow.null(), None, None, None, None, None, None))

# order is important; _string_like[:2] vs _string_like[::2]
_string_like = (
    pyarrow.string(),
    pyarrow.large_string(),
    pyarrow.binary(),
    pyarrow.large_binary(),
)

_pyarrow_to_numpy_dtype = {
    pyarrow.date32(): (True, np.dtype("M8[D]")),
    pyarrow.date64(): (False, np.dtype("M8[ms]")),
    pyarrow.time32("s"): (True, np.dtype("M8[s]")),
    pyarrow.time32("ms"): (True, np.dtype("M8[ms]")),
    pyarrow.time64("us"): (False, np.dtype("M8[us]")),
    pyarrow.time64("ns"): (False, np.dtype("M8[ns]")),
    pyarrow.timestamp("s"): (False, np.dtype("M8[s]")),
    pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")),
    pyarrow.timestamp("us"): (False, np.dtype("M8[us]")),
    pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")),
    pyarrow.duration("s"): (False, np.dtype("m8[s]")),
    pyarrow.duration("ms"): (False, np.dtype("m8[ms]")),
    pyarrow.duration("us"): (False, np.dtype("m8[us]")),
    pyarrow.duration("ns"): (False, np.dtype("m8[ns]")),
}

if not ak._v2._util.numpy_at_least("1.17.0"):
def textfsm_data(self, raw_input, fsm_template, schema, data): """Convert unstructured output to structured output""" records = [] fsm_template.Reset() res = fsm_template.ParseText(raw_input) for entry in res: metent = dict(zip(fsm_template.header, entry)) records.append(metent) result = self.clean_data(records, data) fields = [fld.name for fld in schema] ptype_map = { pa.string(): str, pa.int32(): int, pa.int64(): int, pa.float32(): float, pa.float64(): float, pa.date64(): float, pa.list_(pa.string()): list, pa.list_(pa.int64()): list, pa.bool_(): bool, pa.list_(pa.struct([('nexthop', pa.string()), ('oif', pa.string()), ('weight', pa.int32())])): list, } map_defaults = { pa.string(): "", pa.int32(): 0, pa.int64(): 0, pa.float32(): 0.0, pa.float64(): 0.0, pa.date64(): 0.0, pa.bool_(): False, pa.list_(pa.string()): [], pa.list_(pa.int64()): [], pa.list_(pa.struct([('nexthop', pa.string()), ('oif', pa.string()), ('weight', pa.int32())])): [("", "", 1)] } # Ensure the type is set correctly. for entry in result: for cent in entry: if cent in fields: schent_type = schema.field(cent).type if not isinstance(entry[cent], ptype_map[schent_type]): if entry[cent]: entry[cent] = ptype_map[schent_type](entry[cent]) else: entry[cent] = map_defaults[schent_type] elif isinstance(entry[cent], list): for i, ele in enumerate(entry[cent]): if not isinstance(ele, ptype_map[schent_type.value_type]): try: if ptype_map[schent_type.value_type] == int: entry[cent][i] = int(entry[cent][i]) else: raise ValueError except ValueError: entry[cent][i] = ( map_defaults[schent_type.value_type]) return result
def test_date_time_types(tempdir): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') a1 = pa.array(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.array(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') a4 = pa.array(data4, type=t4) t5 = pa.time64('us') a5 = pa.array(data4.astype('int64'), type=t5) t6 = pa.time32('s') a6 = pa.array(data4, type=t6) ex_t6 = pa.time32('ms') ex_a6 = pa.array(data4 * 1000, type=ex_t6) t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value data7 = np.array([start, start + 1000, start + 2000], dtype='int64') a7 = pa.array(data7, type=t7) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [ 'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]' ]) # date64 as date32 # time32[s] to time32[ms] expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [ 'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]' ]) _check_roundtrip(table, expected=expected, version='2.6') t0 = pa.timestamp('ms') data0 = np.arange(4, dtype='int64') a0 = pa.array(data0, type=t0) t1 = pa.timestamp('us') data1 = np.arange(4, dtype='int64') a1 = pa.array(data1, type=t1) t2 = pa.timestamp('ns') data2 = np.arange(4, dtype='int64') a2 = pa.array(data2, type=t2) table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]']) expected = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]']) # int64 for all timestamps supported by default filename = tempdir / 'int64_timestamps.parquet' _write_table(table, filename, version='2.6') parquet_schema = pq.ParquetFile(filename).schema for i in range(3): assert parquet_schema.column(i).physical_type == 'INT64' read_table = _read_table(filename) assert read_table.equals(expected) t0_ns = pa.timestamp('ns') data0_ns = np.array(data0 * 1000000, dtype='int64') a0_ns = pa.array(data0_ns, type=t0_ns) t1_ns = pa.timestamp('ns') data1_ns = np.array(data1 * 1000, dtype='int64') a1_ns = pa.array(data1_ns, type=t1_ns) expected = pa.Table.from_arrays([a0_ns, a1_ns, a2], ['ts[ms]', 'ts[us]', 'ts[ns]']) # int96 nanosecond timestamps produced upon request filename = tempdir / 'explicit_int96_timestamps.parquet' _write_table(table, filename, version='2.6', use_deprecated_int96_timestamps=True) parquet_schema = pq.ParquetFile(filename).schema for i in range(3): assert parquet_schema.column(i).physical_type == 'INT96' read_table = _read_table(filename) assert read_table.equals(expected) # int96 nanosecond timestamps implied by flavor 'spark' filename = tempdir / 'spark_int96_timestamps.parquet' _write_table(table, filename, version='2.6', flavor='spark') parquet_schema = pq.ParquetFile(filename).schema for i in range(3): assert parquet_schema.column(i).physical_type == 'INT96' read_table = _read_table(filename) assert read_table.equals(expected)
_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int, pa.uint8().id: int,
    pa.int16().id: int, pa.uint16().id: int,
    pa.int32().id: int, pa.uint32().id: int,
    pa.int64().id: int, pa.uint64().id: int,
    pa.float16().id: float, pa.float32().id: float, pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}


class FletcherDtype(ExtensionDtype):
    # na_value = pa.Null()

    def __init__(self, arrow_dtype):
        self.arrow_dtype = arrow_dtype
    pa.int8(): dt.Int8, pa.int16(): dt.Int16,
    pa.int32(): dt.Int32, pa.int64(): dt.Int64,
    pa.uint8(): dt.UInt8, pa.uint16(): dt.UInt16,
    pa.uint32(): dt.UInt32, pa.uint64(): dt.UInt64,
    pa.float16(): dt.Float16, pa.float32(): dt.Float32,
    pa.float64(): dt.Float64,
    pa.string(): dt.String,
    pa.binary(): dt.Binary,
    pa.bool_(): dt.Boolean,
    pa.date32(): dt.Date,
    pa.date64(): dt.Date,
}


@dt.dtype.register(pa.DataType)  # type: ignore[misc]
def from_pyarrow_primitive(
    arrow_type: pa.DataType,
    nullable: bool = True,
) -> dt.DataType:
    return _to_ibis_dtypes[arrow_type](nullable=nullable)


@dt.dtype.register(pa.Time32Type)  # type: ignore[misc]
@dt.dtype.register(pa.Time64Type)  # type: ignore[misc]
def from_pyarrow_time(
    arrow_type: pa.TimestampType,
def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'), (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'), (pa.int16(), 'int16'),
     (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'), (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'),
     (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)
pyarrow.int8().id: "INT64", pyarrow.int16().id: "INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", } if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal256 instances. ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" _BIGNUMERIC_SUPPORT = True else: _BIGNUMERIC_SUPPORT = False
_NA_REP = "<NA>" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), np.int64: pa.int64(), np.longlong: pa.int64(), np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(),
def test_basics(fletcher_array): df = pd.DataFrame( { "null": fletcher_array(pa.array([None, None], type=pa.null())), "bool": fletcher_array(pa.array([None, True], type=pa.bool_())), "int8": fletcher_array(pa.array([None, -1], type=pa.int8())), "uint8": fletcher_array(pa.array([None, 1], type=pa.uint8())), "int16": fletcher_array(pa.array([None, -1], type=pa.int16())), "uint16": fletcher_array(pa.array([None, 1], type=pa.uint16())), "int32": fletcher_array(pa.array([None, -1], type=pa.int32())), "uint32": fletcher_array(pa.array([None, 1], type=pa.uint32())), "int64": fletcher_array(pa.array([None, -1], type=pa.int64())), "uint64": fletcher_array(pa.array([None, 1], type=pa.uint64())), "float16": fletcher_array( pa.array([None, np.float16(-0.1)], type=pa.float16()) ), "float32": fletcher_array(pa.array([None, -0.1], type=pa.float32())), "float64": fletcher_array(pa.array([None, -0.1], type=pa.float64())), "date32": fletcher_array( pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32()) ), "date64": fletcher_array( pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64()) ), # https://github.com/pandas-dev/pandas/issues/34986 # "timestamp[s]": fletcher_array( # pa.array( # [None, datetime.datetime(2013, 12, 11, 10, 9, 8)], # type=pa.timestamp("s"), # ) # ), # "timestamp[ms]": fletcher_array( # pa.array( # [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)], # type=pa.timestamp("ms"), # ) # ), # "timestamp[us]": fletcher_array( # pa.array( # [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], # type=pa.timestamp("us"), # ) # ), # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds # 'timestamp[ns]': fletcher_array(pa.array([None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("ns"))), "binary": fletcher_array(pa.array([None, b"122"], type=pa.binary())), "string": fletcher_array(pa.array([None, "🤔"], type=pa.string())), "duration[s]": fletcher_array( pa.array([None, datetime.timedelta(seconds=9)], type=pa.duration("s")) ), "duration[ms]": fletcher_array( pa.array( [None, datetime.timedelta(milliseconds=8)], type=pa.duration("ms") ) ), "duration[us]": fletcher_array( pa.array( [None, datetime.timedelta(microseconds=7)], type=pa.duration("us") ) ), # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds # 'duration[ns]': fletcher_array(pa.array([None, datetime.timedelta(microseconds=7)], type=pa.duration("ns"))), "list[string]": fletcher_array( pa.array([None, [None, "🤔"]], type=pa.list_(pa.string())) ), } ) ddf = dd.from_pandas(df, npartitions=2) meta_nonempty = ddf._meta_nonempty pdt.assert_frame_equal(meta_nonempty, df) result = ddf.compute() pdt.assert_frame_equal(result, df)
def as_column(arbitrary, nan_as_null=True, dtype=None): """Create a Column from an arbitrary object Currently support inputs are: * ``Column`` * ``Buffer`` * ``Series`` * ``Index`` * numba device array * cuda array interface * numpy array * pyarrow array * pandas.Categorical Returns ------- result : subclass of TypedColumnBase - CategoricalColumn for pandas.Categorical input. - DatetimeColumn for datetime input - NumericalColumn for all other inputs. """ from . import numerical, categorical, datetime from cudf.dataframe.series import Series from cudf.dataframe.index import Index if isinstance(arbitrary, Column): if not isinstance(arbitrary, TypedColumnBase): # interpret as numeric data = arbitrary.view(numerical.NumericalColumn, dtype=arbitrary.dtype) else: data = arbitrary elif isinstance(arbitrary, Series): data = arbitrary._column elif isinstance(arbitrary, Index): data = arbitrary._values elif isinstance(arbitrary, Buffer): data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype) elif cuda.devicearray.is_cuda_ndarray(arbitrary): data = as_column(Buffer(arbitrary)) if (data.dtype in [np.float16, np.float32, np.float64] and arbitrary.size > 0): if nan_as_null: mask = cudautils.mask_from_devary(arbitrary) data = data.set_mask(mask) elif cuda.is_cuda_array(arbitrary): # Use cuda array interface to do create a numba device array by # reference new_dev_array = cuda.as_cuda_array(arbitrary) # Allocate new output array using rmm and copy the numba device array # to an rmm owned device array out_dev_array = rmm.device_array_like(new_dev_array) out_dev_array.copy_to_device(new_dev_array) data = as_column(out_dev_array) elif isinstance(arbitrary, np.ndarray): # CUDF assumes values are always contiguous if not arbitrary.flags['C_CONTIGUOUS']: arbitrary = np.ascontiguousarray(arbitrary) if arbitrary.dtype.kind == 'M': data = datetime.DatetimeColumn.from_numpy(arbitrary) elif arbitrary.dtype.kind in ('O', 'U'): raise NotImplementedError("Strings are not yet supported") else: data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null) elif isinstance(arbitrary, pa.Array): if isinstance(arbitrary, pa.StringArray): warnings.warn("Strings are not yet supported, so converting to " "categorical") data = as_column(arbitrary.dictionary_encode()) elif isinstance(arbitrary, pa.NullArray): new_dtype = dtype if (type(dtype) == str and dtype == 'empty') or dtype is None: new_dtype = np.dtype(arbitrary.type.to_pandas_dtype()) if pd.api.types.is_categorical_dtype(new_dtype): arbitrary = arbitrary.dictionary_encode() else: if nan_as_null: arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype)) else: # casting a null array doesn't make nans valid # so we create one with valid nans from scratch: arbitrary = utils.scalar_broadcast_to(np.nan, (len(arbitrary), ), dtype=new_dtype) data = as_column(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, pa.DictionaryArray): pamask, padata = buffers_from_pyarrow(arbitrary) data = categorical.CategoricalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, categories=arbitrary.dictionary.to_pylist(), ordered=arbitrary.type.ordered, ) elif isinstance(arbitrary, pa.TimestampArray): arbitrary = arbitrary.cast(pa.timestamp('ms')) pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]') data = datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date64Array): pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]') data = 
datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date32Array): # No equivalent np dtype and not yet supported warnings.warn( "Date32 values are not yet supported so this will " "be typecast to a Date64 value", UserWarning) arbitrary = arbitrary.cast(pa.date64()) data = as_column(arbitrary) elif isinstance(arbitrary, pa.BooleanArray): # Arrow uses 1 bit per value while we use int8 dtype = np.dtype(np.bool) # Needed because of bug in PyArrow # https://issues.apache.org/jira/browse/ARROW-4766 if len(arbitrary) > 0: arbitrary = arbitrary.cast(pa.int8()) else: arbitrary = pa.array([], type=pa.int8()) pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype) data = numerical.NumericalColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=dtype) else: pamask, padata = buffers_from_pyarrow(arbitrary) data = numerical.NumericalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype(arbitrary.type.to_pandas_dtype())) elif isinstance(arbitrary, pa.ChunkedArray): gpu_cols = [ as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks ] if dtype and dtype != 'empty': new_dtype = dtype else: pa_type = arbitrary.type if pa.types.is_dictionary(pa_type): new_dtype = 'category' else: new_dtype = np.dtype(pa_type.to_pandas_dtype()) data = Column._concat(gpu_cols, dtype=new_dtype) elif isinstance(arbitrary, (pd.Series, pd.Categorical)): if pd.api.types.is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool: # Bug in PyArrow or HDF that requires us to do this data = as_column(pa.array(np.array(arbitrary), from_pandas=True)) else: data = as_column(pa.array(arbitrary, from_pandas=nan_as_null)) elif isinstance(arbitrary, pd.Timestamp): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview): if hasattr(arbitrary, 'dtype'): data_type = _gdf.np_to_pa_dtype(arbitrary.dtype) if data_type in (pa.date64(), pa.date32()): # PyArrow can't construct date64 or date32 arrays from np # datetime types arbitrary = arbitrary.astype('int64') data = as_column(pa.array([arbitrary], type=data_type)) else: data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null) elif isinstance(arbitrary, memoryview): data = as_column(np.array(arbitrary), dtype=dtype, nan_as_null=nan_as_null) else: try: data = as_column(memoryview(arbitrary)) except TypeError: try: pa_type = None if dtype is not None: if pd.api.types.is_categorical_dtype(dtype): raise TypeError else: np_type = np.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() else: pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type) data = as_column(pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null), nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): np_type = None if dtype is not None: if pd.api.types.is_categorical_dtype(dtype): data = as_column(pd.Series(arbitrary, dtype='category'), nan_as_null=nan_as_null) else: np_type = np.dtype(dtype) data = as_column(np.array(arbitrary, dtype=np_type), nan_as_null=nan_as_null) return data
metadata={"type": "TradeTick"}, ) TYPE_TO_SCHEMA[BettingInstrument] = pa.schema( { "venue": pa.string(), "currency": pa.string(), "instrument_id": pa.string(), "event_type_id": pa.string(), "event_type_name": pa.string(), "competition_id": pa.string(), "competition_name": pa.string(), "event_id": pa.string(), "event_name": pa.string(), "event_country_code": pa.string(), "event_open_date": pa.date64(), "betting_type": pa.string(), "market_id": pa.string(), "market_name": pa.string(), "market_start_time": pa.date64(), "market_type": pa.string(), "selection_id": pa.string(), "selection_name": pa.string(), "selection_handicap": pa.string(), "ts_recv_ns": pa.int64(), "ts_event_ns": pa.int64(), }, metadata={"type": "BettingInstrument"}, ) TYPE_TO_SCHEMA[OrderBookData] = pa.schema(
class TestAbstractFileParserStatics: @pytest.mark.parametrize( # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html "input_json_type, output_pyarrow_type", [ ("string", pa.large_string()), ("number", pa.float64()), ("integer", pa.int64()), ("object", pa.large_string()), ("array", pa.large_string()), ("boolean", pa.bool_()), ("null", pa.large_string()), ], ) def test_json_type_to_pyarrow_type(self, input_json_type, output_pyarrow_type): # Json -> PyArrow direction LOGGER.info( f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'..." ) assert AbstractFileParser.json_type_to_pyarrow_type( input_json_type) == output_pyarrow_type @pytest.mark.parametrize( # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html "input_pyarrow_types, output_json_type", [ ((pa.null(), ), "string"), # null type ((pa.bool_(), ), "boolean"), # boolean type ( (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()), "integer", ), # integer types ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128( 5, 10), pa.decimal256(3, 8)), "number"), # number types ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"), # temporal types ((pa.binary(), pa.large_binary()), "string"), # binary types ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"), # string types ((pa.list_(pa.string()), pa.large_list( pa.timestamp("us"))), "string"), # array types ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_( pa.string()))), "string"), # object types ], ) def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types, output_json_type): # PyArrow -> Json direction (reverse=True) for typ in input_pyarrow_types: LOGGER.info( f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'..." 
) assert AbstractFileParser.json_type_to_pyarrow_type( typ, reverse=True) == output_json_type @pytest.mark.parametrize( # if expecting fail, put pyarrow_schema as None "json_schema, pyarrow_schema", [ ( { "a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null" }, { "a": pa.large_string(), "b": pa.float64(), "c": pa.int64(), "d": pa.large_string(), "e": pa.large_string(), "f": pa.bool_(), "g": pa.large_string(), }, ), ({ "single_column": "object" }, { "single_column": pa.large_string() }), ({}, {}), ({ "a": "NOT A REAL TYPE", "b": "another fake type" }, { "a": pa.large_string(), "b": pa.large_string() }), (["string", "object"], None), # bad input type ], ) def test_json_schema_to_pyarrow_schema(self, json_schema, pyarrow_schema): # Json -> PyArrow direction if pyarrow_schema is not None: assert AbstractFileParser.json_schema_to_pyarrow_schema( json_schema) == pyarrow_schema else: with pytest.raises(Exception) as e_info: AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) LOGGER.debug(str(e_info)) @pytest.mark.parametrize( # if expecting fail, put json_schema as None "pyarrow_schema, json_schema", [ ( { "a": pa.utf8(), "b": pa.float16(), "c": pa.uint32(), "d": pa.map_(pa.string(), pa.float32()), "e": pa.bool_(), "f": pa.date64(), }, { "a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string" }, ), ({ "single_column": pa.int32() }, { "single_column": "integer" }), ({}, {}), ({ "a": "NOT A REAL TYPE", "b": "another fake type" }, { "a": "string", "b": "string" }), (["string", "object"], None), # bad input type ], ) def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema, json_schema): # PyArrow -> Json direction (reverse=True) if json_schema is not None: assert AbstractFileParser.json_schema_to_pyarrow_schema( pyarrow_schema, reverse=True) == json_schema else: with pytest.raises(Exception) as e_info: AbstractFileParser.json_schema_to_pyarrow_schema( pyarrow_schema, reverse=True) LOGGER.debug(str(e_info))
def _parquet_schema(dataframe: pd.DataFrame, custom_redshift_columns: dict = None) -> pa.Schema: """ Translates pandas dtypes to PyArrow types and creates a Schema from them Args: dataframe (pd.DataFrame): Dataframe to pull the schema of custom_redshift_columns (dict, Optional): This dictionary contains custom column data type definitions for redshift. The params should be formatted as follows: - column name (str) - data type (str) Returns: PyArrow Schema of the given dataframe """ fields = [] for col, dtype in dataframe.dtypes.items(): dtype = dtype.name if dtype == 'object': if custom_redshift_columns: # Detect if the Pandas object column contains Python decimal objects. if "[Decimal(" in str(dataframe[col].values)[:9]: # If Python decimal objects are present, parse out the precision and scale # from the custom_redshift_columns dictionary to use when converting # to PyArrow's decimal128 data type. s = custom_redshift_columns[col] precision = int(s[s.find('DECIMAL(') + len('DECIMAL('):s.rfind(',')].strip()) scale = int(s[s.find(',') + len(','):s.rfind(')')].strip()) pa_type = pa.decimal128(precision=precision, scale=scale) else: pa_type = pa.string() else: pa_type = pa.string() elif dtype.startswith('int32'): pa_type = pa.int32() elif dtype.startswith('int64'): pa_type = pa.int64() elif dtype.startswith('int8'): pa_type = pa.int8() elif dtype.startswith('float32'): pa_type = pa.float32() elif dtype.startswith('float64'): pa_type = pa.float64() elif dtype.startswith('float16'): pa_type = pa.float16() elif dtype.startswith('datetime'): pa_type = pa.timestamp('ns') elif dtype.startswith('date'): pa_type = pa.date64() elif dtype.startswith('category'): pa_type = pa.string() elif dtype == 'bool': pa_type = pa.bool_() else: raise NotImplementedError( f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq." ) fields.append(pa.field(col, pa_type)) return pa.schema(fields=fields)
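# A minimal usage sketch for the _parquet_schema helper above; the DataFrame,
# its column names, and its values are illustrative assumptions, not from the source.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({
    "name": ["a", "b"],                                    # object dtype -> pa.string()
    "count": [1, 2],                                       # int64 -> pa.int64()
    "when": pd.to_datetime(["2020-01-01", "2020-01-02"]),  # datetime64[ns] -> pa.timestamp('ns')
    "flag": [True, False],                                 # bool -> pa.bool_()
})
schema = _parquet_schema(df)
assert schema.field("name").type == pa.string()
assert schema.field("when").type == pa.timestamp("ns")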
"INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances. } else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8), (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32), (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar,
     pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar,
     pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue),
    ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar,
     pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
def as_column(arbitrary): """Create a Column from an arbitrary object Currently support inputs are: * ``Column`` * ``Buffer`` * numba device array * numpy array * pandas.Categorical Returns ------- result : subclass of TypedColumnBase - CategoricalColumn for pandas.Categorical input. - NumericalColumn for all other inputs. """ from . import numerical, categorical, datetime if isinstance(arbitrary, Column): if not isinstance(arbitrary, TypedColumnBase): # interpret as numeric data = arbitrary.view(numerical.NumericalColumn, dtype=arbitrary.dtype) else: data = arbitrary elif isinstance(arbitrary, Buffer): data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype) elif cuda.devicearray.is_cuda_ndarray(arbitrary): data = as_column(Buffer(arbitrary)) if (data.dtype in [np.float16, np.float32, np.float64] and arbitrary.size > 0): mask = cudautils.mask_from_devary(arbitrary) data = data.set_mask(mask) elif isinstance(arbitrary, np.ndarray): if arbitrary.dtype.kind == 'M': data = datetime.DatetimeColumn.from_numpy(arbitrary) else: data = as_column(rmm.to_device(arbitrary)) elif isinstance(arbitrary, pa.Array): if isinstance(arbitrary, pa.StringArray): raise NotImplementedError("Strings are not yet supported") elif isinstance(arbitrary, pa.NullArray): pamask = Buffer(np.empty(0, dtype='int8')) padata = Buffer(np.empty(0, dtype=arbitrary.type.to_pandas_dtype())) data = numerical.NumericalColumn( data=padata, mask=pamask, null_count=0, dtype=np.dtype(arbitrary.type.to_pandas_dtype())) elif isinstance(arbitrary, pa.DictionaryArray): if arbitrary.buffers()[0]: pamask = Buffer(np.array(arbitrary.buffers()[0])) else: pamask = None padata = Buffer( np.array(arbitrary.buffers()[1]).view( arbitrary.indices.type.to_pandas_dtype())) data = categorical.CategoricalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, categories=arbitrary.dictionary.to_pylist(), ordered=arbitrary.type.ordered, dtype="category" # What's the correct way to specify this? 
) elif isinstance(arbitrary, pa.TimestampArray): arbitrary = arbitrary.cast(pa.timestamp('ms')) if arbitrary.buffers()[0]: pamask = Buffer(np.array(arbitrary.buffers()[0])) else: pamask = None padata = Buffer( np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]'))) data = datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date64Array): if arbitrary.buffers()[0]: pamask = Buffer(np.array(arbitrary.buffers()[0])) else: pamask = None padata = Buffer( np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]'))) data = datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date32Array): # No equivalent np dtype and not yet supported warnings.warn( "Date32 values are not yet supported so this will " "be typecast to a Date64 value", UserWarning) arbitrary = arbitrary.cast(pa.date64()) data = as_column(arbitrary) else: if arbitrary.buffers()[0]: pamask = Buffer(np.array(arbitrary.buffers()[0])) else: pamask = None padata = Buffer( np.array(arbitrary.buffers()[1]).view( np.dtype(arbitrary.type.to_pandas_dtype()))) data = numerical.NumericalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype(arbitrary.type.to_pandas_dtype())) elif isinstance(arbitrary, (pd.Series, pd.Categorical)): data = as_column(pa.array(arbitrary, from_pandas=True)) elif np.isscalar(arbitrary): if hasattr(arbitrary, 'dtype'): data_type = _gdf.np_to_pa_dtype(arbitrary.dtype) if data_type in (pa.date64(), pa.date32()): # PyArrow can't construct date64 or date32 arrays from np # datetime types arbitrary = arbitrary.astype('int64') data = as_column(pa.array([arbitrary], type=data_type)) else: data = as_column(pa.array([arbitrary])) else: data = as_column(pa.array(arbitrary)) return data
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            large_binary_type, large_string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType: """ Create a DataType in PyArrow format from a Schema json format. :param json_dict: the DataType in json format :return: the DataType in PyArrow format """ type_class = json_dict["type"]["name"] if type_class == "dictionary": key_type = json_dict["dictionary"]["indexType"] value_type = json_dict["children"][0] key_type = pyarrow_datatype_from_dict(key_type) value_type = pyarrow_datatype_from_dict(value_type) return pyarrow.map_(key_type, value_type) elif "dictionary" in json_dict: key_type = { "name": "key", "type": json_dict["dictionary"]["indexType"], "nullable": json_dict["nullable"], } key = pyarrow_datatype_from_dict(key_type) if type_class == "list": value_type = { "name": "val", "type": json_dict["dictionary"]["indexType"], "nullable": json_dict["nullable"], } return pyarrow.map_( key, pyarrow.list_( pyarrow.field( "entries", pyarrow.struct([pyarrow_field_from_dict(value_type) ]))), ) value_type = { "name": "value", "type": json_dict["type"], "nullable": json_dict["nullable"], } return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type)) elif type_class == "list": field = json_dict["children"][0] element_type = pyarrow_datatype_from_dict(field) return pyarrow.list_(pyarrow.field("item", element_type)) elif type_class == "struct": fields = [ pyarrow_field_from_dict(field) for field in json_dict["children"] ] return pyarrow.struct(fields) elif type_class == "int": return pyarrow.type_for_alias( f'{type_class}{json_dict["type"]["bitWidth"]}') elif type_class == "date": type_info = json_dict["type"] if type_info["unit"] == "DAY": return pyarrow.date32() else: return pyarrow.date64() elif type_class == "time": type_info = json_dict["type"] if type_info["unit"] == "MICROSECOND": unit = "us" elif type_info["unit"] == "NANOSECOND": unit = "ns" elif type_info["unit"] == "MILLISECOND": unit = "ms" else: unit = "s" return pyarrow.type_for_alias( f'{type_class}{type_info["bitWidth"]}[{unit}]') elif type_class == "timestamp": type_info = json_dict["type"] if "unit" in type_info: if type_info["unit"] == "MICROSECOND": unit = "us" elif type_info["unit"] == "NANOSECOND": unit = "ns" elif type_info["unit"] == "MILLISECOND": unit = "ms" elif type_info["unit"] == "SECOND": unit = "s" else: unit = "ns" return pyarrow.type_for_alias(f"{type_class}[{unit}]") elif type_class.startswith("decimal"): type_info = json_dict["type"] return pyarrow.decimal128(precision=type_info["precision"], scale=type_info["scale"]) elif type_class.startswith("floatingpoint"): type_info = json_dict["type"] if type_info["precision"] == "HALF": return pyarrow.float16() elif type_info["precision"] == "SINGLE": return pyarrow.float32() elif type_info["precision"] == "DOUBLE": return pyarrow.float64() else: return pyarrow.type_for_alias(type_class)
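# A minimal usage sketch for pyarrow_datatype_from_dict above; the field dicts are
# hypothetical but follow the JSON layout the function reads ("type" -> {"name", "unit"}).
import pyarrow

day_field = {"name": "d", "nullable": True, "type": {"name": "date", "unit": "DAY"}}
ms_field = {"name": "d", "nullable": True, "type": {"name": "date", "unit": "MILLISECOND"}}

assert pyarrow_datatype_from_dict(day_field) == pyarrow.date32()  # DAY unit -> date32
assert pyarrow_datatype_from_dict(ms_field) == pyarrow.date64()   # MILLISECOND -> date64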
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]', 'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)


@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]

    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA


def test_date32_overflow():
def clean_data_common(self, processed_data, raw_data):
    """Fix the type and default value of each extracted field

    This routine is common to all services. It ensures that all the
    missing fields, as defined by the schema, are added to the records
    extracted. Furthermore, each field is set to the specified type.
    """

    # Build default data structure
    schema_rec = {}

    def_vals = self._get_default_vals()

    ptype_map = {
        pa.string(): str,
        pa.int32(): int,
        pa.int64(): int,
        pa.float32(): float,
        pa.float64(): float,
        pa.date64(): float,
        pa.list_(pa.string()): list,
        pa.list_(pa.int64()): list,
        pa.bool_(): bool,
    }

    for field in self.schema:
        default = def_vals[field.type]
        schema_rec.update({field.name: default})

    if isinstance(raw_data, list):
        read_from = raw_data[0]
    else:
        read_from = raw_data

    for entry in processed_data:
        entry.update({"hostname": read_from["hostname"]})
        entry.update({"namespace": read_from["namespace"]})
        entry.update({"timestamp": read_from["timestamp"]})
        entry.update({"sqvers": self.version})
        for fld in schema_rec:
            if fld not in entry:
                if fld == "active":
                    entry.update({fld: True})
                else:
                    entry.update({fld: schema_rec[fld]})
            else:
                fld_type = self.schema.field(fld).type
                if not isinstance(entry[fld], ptype_map[fld_type]):
                    try:
                        entry[fld] = ptype_map[fld_type](entry[fld])
                    except (ValueError, TypeError):
                        entry[fld] = schema_rec[fld]
                elif isinstance(entry[fld], list):
                    for i, ele in enumerate(entry[fld]):
                        if not isinstance(ele,
                                          ptype_map[fld_type.value_type]):
                            try:
                                if ptype_map[fld_type.value_type] == int:
                                    entry[fld][i] = int(entry[fld][i])
                                elif ptype_map[fld_type.value_type] == str:
                                    entry[fld][i] = str(entry[fld][i])
                                else:
                                    raise ValueError
                            except (ValueError, TypeError):
                                entry[fld][i] = schema_rec[fld]

    return processed_data
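
# A minimal, self-contained sketch of the coercion step above; the field names,
# types, and record values are made up for illustration and are not part of the
# service schema.
ptype_map = {pa.string(): str, pa.int64(): int}
schema_types = {"ifname": pa.string(), "mtu": pa.int64()}
record = {"ifname": "eth0", "mtu": "1500"}           # mtu arrives as a string
for name, value in record.items():
    expected_type = ptype_map[schema_types[name]]
    if not isinstance(value, expected_type):
        record[name] = expected_type(value)          # coerce "1500" -> 1500
assert record == {"ifname": "eth0", "mtu": 1500}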
def test_fields_hashable():
    in_dict = {}
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

TIME_PYARROW_DTYPES = [
    pa.time32("s"),
    pa.time32("ms"),
    pa.time64("us"),
    pa.time64("ns"),
]
DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
DATETIME_PYARROW_DTYPES = [
    pa.timestamp(unit=unit, tz=tz)
    for unit in ["s", "ms", "us", "ns"]
    for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
]
TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]]

BOOL_PYARROW_DTYPES = [pa.bool_()]

# TODO: Add container like pyarrow types:
#  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
ALL_PYARROW_DTYPES = (
    ALL_INT_PYARROW_DTYPES
    + FLOAT_PYARROW_DTYPES
    + TIME_PYARROW_DTYPES
    + DATE_PYARROW_DTYPES
    + DATETIME_PYARROW_DTYPES
    + TIMEDELTA_PYARROW_DTYPES
    + BOOL_PYARROW_DTYPES
)
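
# A minimal sketch of how these dtype lists are typically consumed: a pytest
# fixture parametrized over every collected pyarrow type (the fixture name is
# illustrative; assumes pytest is imported).
@pytest.fixture(params=ALL_PYARROW_DTYPES, ids=str)
def any_pyarrow_dtype(request):
    """Parametrized fixture yielding each pyarrow DataType in turn."""
    return request.param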