def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid, match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
def test_orcfile_empty():
    from pyarrow import orc

    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0

    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_custom_nulls(self):
    # Infer nulls with custom values
    opts = ConvertOptions(null_values=['Xxx', 'Zzz'])
    rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.null()),
                        ('b', pa.string()),
                        ('c', pa.string()),
                        ('d', pa.int64())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [None, None],
        'b': [u"Xxx", u"#N/A"],
        'c': [u"1", u""],
        'd': [2, None],
    }

    opts = ConvertOptions(null_values=[])
    rows = b"a,b\n#N/A,\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.string()),
                        ('b', pa.string())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [u"#N/A"],
        'b': [u""],
    }
def test_sequence_utf8_to_unicode():
    # ARROW-1225
    data = [b'foo', None, b'bar']
    arr = pa.array(data, type=pa.string())
    assert arr[0].as_py() == u'foo'

    # test a non-utf8 unicode string
    val = (u'mañana').encode('utf-16-le')
    with pytest.raises(pa.ArrowInvalid):
        pa.array([val], type=pa.string())
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
def test_array_mixed_unicode_bytes():
    values = [u'qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = [u'qux', u'foo', u'barz']

    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    arr = pa.array(values, type=pa.string())
    expected = pa.array(u_values, type=pa.string())
    assert arr.type == pa.string()
    assert arr.equals(expected)
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_unicode(self):
    data = [u("foo"), u("bar"), None, u("arrow")]
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pyarrow.string()
    assert arr.to_pylist() == [u("foo"), u("bar"), None, u("arrow")]
def test_field(self):
    t = arrow.string()
    f = arrow.field('foo', t)
    assert f.name == 'foo'
    assert f.type is t
    assert repr(f) == "Field('foo', type=string)"
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))

    assert not types.is_union(pa.list_(pa.int32()))
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported types conversion don't segfault
            # on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_sequence_unicode():
    data = [u'foo', u'bar', None, u'mañana']
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.string()
    assert arr.to_pylist() == data
def test_unicode(self):
    data = [u'foo', u'bar', None, u'mañana']
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pyarrow.string()
    assert arr.to_pylist() == data
def test_struct_from_dicts_inference():
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)

    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
def test_infer_lists(self):
    data = OrderedDict([
        ('nan_ints', [[None, 1], [2, 3]]),
        ('ints', [[0, 1], [2, 3]]),
        ('strs', [[None, u'b'], [u'c', u'd']]),
        ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
    ])
    df = pd.DataFrame(data)

    expected_schema = pa.schema([
        pa.field('nan_ints', pa.list_(pa.int64())),
        pa.field('ints', pa.list_(pa.int64())),
        pa.field('strs', pa.list_(pa.string())),
        pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
    ])

    self._check_pandas_roundtrip(df, expected_schema=expected_schema)
def test_unicode(self):
    repeats = 1000
    values = [u'foo', None, u'bar', u'mañana', np.nan]
    df = pd.DataFrame({'strings': values * repeats})
    field = pa.field('strings', pa.string())
    schema = pa.schema([field])

    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.type == pa.string()
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])

    data = pa.chunked_array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    assert isinstance(data.chunks, list)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
    assert len(data.chunks) == 3
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())

    assert utf8_arr.equals(expected)

    non_utf8_values = [(u'mañana').encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1
"""Conversion between different types of arrays""" import numpy as np import pyarrow as pa import vaex.utils supported_arrow_array_types = (pa.Array, pa.ChunkedArray) supported_array_types = (np.ndarray, ) + supported_arrow_array_types string_types = [pa.string(), pa.large_string()] def full(n, value, dtype): from .datatype import DataType dtype = DataType(dtype) values = np.full(n, value, dtype=dtype.numpy) if dtype.is_arrow: return pa.array(values) else: return values def is_arrow_array(ar): return isinstance(ar, supported_arrow_array_types) def is_numpy_array(ar): return isinstance(ar, np.ndarray) def filter(ar, boolean_mask): if isinstance(ar, supported_arrow_array_types):
        tf.io.FixedLenFeature((1, ), tf.int64, default_value=[0]),
    'image/class/text':
        tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/format':
        tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/filename':
        tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/encoded':
        tf.io.FixedLenFeature((1, ), tf.string, default_value=[b''])
}

parquet_schema = {
    'image/height': pa.int64(),
    'image/width': pa.int64(),
    'image/channels': pa.int64(),
    'image/colorspace': pa.string(),
    'image/class/label': pa.int64(),
    'image/class/text': pa.string(),
    'image/format': pa.string(),
    'image/filename': pa.string(),
    'image/encoded': pa.binary()
}


def reformat_row(row):
    import pyarrow as pa
    out_row = {}
    for key, val in row.items():
        out_type = parquet_schema[key]
        np_val = val.numpy()
def test_try_incompatible_extension_type(self):
    arr = pa.array(TypedSequence(["foo", "bar"],
                                 try_type=Array2DExtensionType((1, 3), "int64")))
    self.assertEqual(arr.type, pa.string())
def lambda_handler(event, _):
    """Lambda entry point"""
    source_path = 's3://' + event['bucket'] + '/' + event['source_key']
    target_path = 's3://' + event['bucket'] + '/' + event['target_key']
    print('Source: ' + source_path)
    print('Target: ' + target_path)

    s3_client = boto3.client('s3')
    found = False
    # Ensure file exists before we actually run the conversion
    while not found:
        try:
            s3_client.head_object(Bucket=event['bucket'], Key=event['source_key'])
        except botocore.exceptions.ClientError:
            print('Waiting for: "' + source_path + '" to exist')
            time.sleep(10)
        else:
            print('Found: "' + source_path + '"')
            found = True

    s3fs_source = s3fs.S3FileSystem()
    s3fs_target = s3fs.S3FileSystem()

    with s3fs_source.open(source_path, 'rb') as source_file, \
            s3fs_target.open(target_path, 'wb') as target_file:
        # Open a stream reader for the csv file
        csv_stream = pd.read_csv(source_file, skiprows=0, compression='gzip',
                                 dtype=object, iterator=True, chunksize=100000)

        parquet_writer = None
        for i, chunk in enumerate(csv_stream):
            print('Reading chunk: ' + str(i))
            # First chunk: get schema and set up writer
            if not parquet_writer:
                # Fetch columns from header, hardcodes type to string
                columns = [
                    pa.field(column, pa.string())
                    for column in chunk.columns
                ]
                # Generate schema from columns
                parquet_schema = pa.schema(columns)
                # Open a writer to S3
                parquet_writer = pq.ParquetWriter(target_file, parquet_schema,
                                                  compression='snappy')

            # Convert the chunk and append it to the Parquet file
            table = pa.Table.from_pandas(chunk, preserve_index=False)
            print('Writing chunk: ' + str(i))
            parquet_writer.write_table(table)

        parquet_writer.close()

    print('Done processing "' + source_path + '"')
    return event['target_key']
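# --- Added illustration (not part of the original snippet) ---
# A hypothetical invocation event for the handler above; the bucket and key
# names are placeholders, and the source object is assumed to be a gzipped CSV.
#
#   lambda_handler({'bucket': 'my-bucket',
#                   'source_key': 'input/data.csv.gz',
#                   'target_key': 'output/data.parquet'}, None)
#
# Every column is written as pa.string(), so downstream readers must cast
# values to their real types themselves.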
def test_string(self):
    data = ['foo', b'bar', None, 'arrow']
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pyarrow.string()
def test_str_length(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]

    np.testing.assert_array_equal(
        str_length(NumbaStringArray.make(array)),
        np.asarray(expected[offset:], dtype=np.int32),
    )
def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.timestamp("ns", tz="UTC"))
    assert not is_datetime(pyarrow.string())
import pyarrow as pa

PQ_SCHEMAS = dict()

# site_visits
fields = [
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('site_url', pa.string(), nullable=False),
    pa.field('site_rank', pa.uint32())
]
PQ_SCHEMAS['site_visits'] = pa.schema(fields)

# flash_cookies
fields = [
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('domain', pa.string()),
    pa.field('filename', pa.string()),
    pa.field('local_path', pa.string()),
    pa.field('key', pa.string()),
    pa.field('content', pa.string())
]
PQ_SCHEMAS['flash_cookies'] = pa.schema(fields)

# crawl_history
fields = [
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
def test_sql(redshift_table, postgresql_table, mysql_table, databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}", echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}", con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine, table=table, schema=schema, index_col="index")
        assert df.shape == (3, 1)
def _text_replace_case_sensitive(data: pa.Array, pat: str, repl: str, max_repl: int) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in the Series/Index with some
    other string. For every row, only the first ``max_repl`` replacements will
    be performed. If ``max_repl = -1`` we consider that we have no limit for
    the number of replacements.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()
    repl_bytes: bytes = repl.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)
    if data.null_count == 0:
        valid_buffer = np.empty(0, dtype=np.uint8)
    else:
        valid_buffer = _buffer_to_view(data.buffers()[0])

    if len(pat) > 0:
        output_t = _text_replace_case_sensitive_numba(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            pat_bytes,
            repl_bytes,
            max_repl,
        )
    else:
        output_t = _text_replace_case_sensitive_empty_pattern(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            repl_bytes,
            max_repl,
        )
    output_offsets, output_buffer = output_t

    if data.null_count == 0:
        output_valid = None
    else:
        output_valid = data.buffers()[0].slice(data.offset // 8)
        if data.offset % 8 != 0:
            output_valid = shift_unaligned_bitmap(output_valid, data.offset % 8, len(data))

    buffers = [output_valid, pa.py_buffer(output_offsets), pa.py_buffer(output_buffer)]
    return pa.Array.from_buffers(pa.string(), len(data), buffers, data.null_count)
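# --- Added illustration (not part of the original snippet) ---
# A minimal pure-Python sketch of the replacement semantics the docstring above
# describes (non-empty pattern, byte-wise, with a per-row limit). The helper
# name here is mine; the library itself uses the buffer-level numba kernels.
import pyarrow as pa


def _replace_reference(data: pa.Array, pat: str, repl: str, max_repl: int) -> pa.Array:
    # Python's str.replace treats a negative count as "replace all",
    # which matches the max_repl = -1 convention above.
    return pa.array(
        [None if v is None else v.replace(pat, repl, max_repl)
         for v in data.to_pylist()],
        type=pa.string())


# Example: limit to one replacement per row.
# _replace_reference(pa.array(['aaa', None, 'abc']), 'a', 'X', 1)
# -> ['Xaa', None, 'Xbc']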
def test_type_string():
    t = pa.string()
    assert str(t) == 'string'
def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None
    """
    key = check_array_indexer(self, key)

    if is_integer(key):
        key = cast(int, key)

        if not is_scalar(value):
            raise ValueError("Must pass scalars with scalar indexer")
        elif isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")

        # Slice data and insert in-between
        new_data = [
            *self._data[0:key].chunks,
            pa.array([value], type=pa.string()),
            *self._data[(key + 1):].chunks,
        ]
        self._data = pa.chunked_array(new_data)
    else:
        # Convert to integer indices and iteratively assign.
        # TODO: Make a faster variant of this in Arrow upstream.
        #       This is probably extremely slow.

        # Convert all possible input key types to an array of integers
        if isinstance(key, slice):
            key_array = np.array(range(len(self))[key])
        elif is_bool_dtype(key):
            # TODO(ARROW-9430): Directly support setitem(booleans)
            key_array = np.argwhere(key).flatten()
        else:
            # TODO(ARROW-9431): Directly support setitem(integers)
            key_array = np.asanyarray(key)

        if is_scalar(value):
            value = np.broadcast_to(value, len(key_array))
        else:
            value = np.asarray(value)

        if len(key_array) != len(value):
            raise ValueError("Length of indexer and values mismatch")

        for k, v in zip(key_array, value):
            self[k] = v
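# --- Added illustration (not part of the original snippet) ---
# A small sketch of the two branches above, assuming this __setitem__ backs a
# pandas Arrow-backed string array (e.g. dtype="string[pyarrow]", which needs
# pyarrow installed); only public pandas API is used here.
import numpy as np
import pandas as pd

s = pd.Series(["foo", "bar", "baz"], dtype="string[pyarrow]")
# Scalar integer key: the chunked array is sliced and rebuilt around the new value.
s[1] = None
# Boolean mask: handled element by element via the integer-index fallback.
s[np.array([True, False, True])] = "qux"
# list(s) -> ['qux', <NA>, 'qux']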
def __init__(self, expressions, dtype=None, shape=None, fill_value=None):
    self.is_masked = any([e.is_masked for e in expressions])
    self.fill_value = fill_value
    if self.is_masked and fill_value is None:
        for expression in expressions:
            if expression.is_masked:
                try:
                    # fast path
                    self.fill_value = expression[0:1].fill_value
                    break
                except:  # noqa
                    # slower path (we have to evaluate everything)
                    self.fill_value = expression.values.fill_value
                    break
        else:
            raise ValueError('Concatenating expressions with masked values, but no fill value is found')
    if dtype is None:
        dtypes = [e.dtype for e in expressions]

        any_strings = any([is_string_type(dtype) for dtype in dtypes])
        if any_strings:
            self.dtype = pa.string()  # TODO: how do we know it should not be large_string?
        else:
            # np.datetime64/timedelta64 and find_common_type don't mix very well
            if all([dtype == 'datetime64' for dtype in dtypes]):
                self.dtype = dtypes[0]
            elif all([dtype == 'timedelta64' for dtype in dtypes]):
                self.dtype = dtypes[0]
            else:
                if all([dtype == dtypes[0] for dtype in dtypes]):  # find common types doesn't always behave well
                    self.dtype = dtypes[0]
                if any([dtype.kind in 'SU' for dtype in dtypes]):  # strings are also done manually
                    if all([dtype.kind in 'SU' for dtype in dtypes]):
                        index = np.argmax([dtype.itemsize for dtype in dtypes])
                        self.dtype = dtypes[index]
                    else:
                        index = np.argmax([df.columns[self.column_name].astype('O').astype('U').dtype.itemsize
                                           for df in dfs])
                        self.dtype = dfs[index].columns[self.column_name].astype('O').astype('U').dtype
                else:
                    self.dtype = np.find_common_type([k.numpy for k in dtypes], [])
                logger.debug("common type for %r is %r", dtypes, self.dtype)
        # make sure all expression are the same type
        self.expressions = [e if vaex.array_types.same_type(e.dtype, self.dtype)
                            else e.astype(self.dtype) for e in expressions]
    else:
        # if dtype is given, we assume every expression/column is the same dtype
        self.dtype = dtype
        self.expressions = expressions[:]
    if shape is not None:
        self.shape = (len(self), ) + shape
    else:
        self.shape = (len(self), ) + self.expressions[0].evaluate(0, 1, array_type='numpy', parallel=False).shape[1:]
        for i in range(1, len(self.expressions)):
            expression = self.expressions[i]
            shape_i = (len(self), ) + expressions[i].evaluate(0, 1, array_type='numpy', parallel=False).shape[1:]
            if self.shape != shape_i:
                raise ValueError("shape of expression %s, array index 0, is %r and is incompatible with the shape "
                                 "of the same column of array index %d, %r"
                                 % (self.expressions[0], self.shape, i, shape_i))
    assert result2.equals(arr)


def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], None),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize('narr', [
    np.arange(10, dtype=np.int64),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
def setUp(self):
    # Reducing the size of thread pools. Without this test execution may fail in
    # environments with limited amount of resources.
    filebasedsource.MAX_NUM_THREADS_FOR_SIZE_ESTIMATION = 2
    self.temp_dir = tempfile.mkdtemp()

    self.RECORDS = [{
        'name': 'Thomas',
        'favorite_number': 1,
        'favorite_color': 'blue'
    }, {
        'name': 'Henry',
        'favorite_number': 3,
        'favorite_color': 'green'
    }, {
        'name': 'Toby',
        'favorite_number': 7,
        'favorite_color': 'brown'
    }, {
        'name': 'Gordon',
        'favorite_number': 4,
        'favorite_color': 'blue'
    }, {
        'name': 'Emily',
        'favorite_number': -1,
        'favorite_color': 'Red'
    }, {
        'name': 'Percy',
        'favorite_number': 6,
        'favorite_color': 'Green'
    }, {
        'name': 'Peter',
        'favorite_number': 3,
        'favorite_color': None
    }]

    self.SCHEMA = pa.schema([('name', pa.string(), False),
                             ('favorite_number', pa.int64(), False),
                             ('favorite_color', pa.string())])

    self.SCHEMA96 = pa.schema([('name', pa.string(), False),
                               ('favorite_number', pa.timestamp('ns'), False),
                               ('favorite_color', pa.string())])

    self.RECORDS_NESTED = [{
        'items': [
            {
                'name': 'Thomas',
                'favorite_number': 1,
                'favorite_color': 'blue'
            },
            {
                'name': 'Henry',
                'favorite_number': 3,
                'favorite_color': 'green'
            },
        ]
    }, {
        'items': [
            {
                'name': 'Toby',
                'favorite_number': 7,
                'favorite_color': 'brown'
            },
        ]
    }]

    self.SCHEMA_NESTED = pa.schema([
        ('items',
         pa.list_(
             pa.struct([('name', pa.string(), False),
                        ('favorite_number', pa.int64(), False),
                        ('favorite_color', pa.string())])))
    ])
"FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances. } else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER def bq_to_arrow_struct_data_type(field): arrow_fields = [] for subfield in field.fields:
def infer_schema(self, data):
    """
    Infer a schema for a given data input. The schema can be used to test
    with schema validator. This function currently supports DataFrame,
    Numpy, Dictionary, List and basic python types.::

        data = pandas.DataFrame(...)
        schema = infer_schema(data)

    This function returns None if it can not infer the schema.
    """
    schema = None
    if data is None:
        schema = pa.null()
    elif isinstance(data, dict):
        schema = {'type': dict, 'fields': {}}
        for key, value in data.items():
            schema['fields'][key] = self.infer_schema(value)
    elif isinstance(data, pd.DataFrame):
        schema = {'type': pd.DataFrame, 'fields': {}}
        # sample the table to get the schema
        pa_schema = pa.Table.from_pandas(data[:_SAMPLE_SIZE],
                                         preserve_index=False).schema
        for i, name in enumerate(pa_schema.names):
            schema['fields'][name] = pa_schema.types[i]
    elif isinstance(data, pd.Series):
        schema = {
            'type': pd.Series,
            'item': pa.Array.from_pandas(data).type,
        }
    elif isinstance(data, np.ndarray):
        pa_type = pa.from_numpy_dtype(data.dtype) if data.dtype.num != 17 else pa.string()
        if len(data.shape) == 1:
            # 1d array
            schema = {
                'type': np.ndarray,
                'item': pa_type,
            }
        else:
            shape = [v if i != 0 else None for i, v in enumerate(data.shape)]
            schema = {
                'type': np.ndarray,
                'item': pa_type,
                'shape': tuple(shape),
            }
    elif isinstance(data, pa.Table):
        schema = data.schema
    elif isinstance(data, (list, tuple)) and len(data) > 0:
        # try to infer type of the data
        current_type = self.infer_schema(data[0])
        for i in range(1, min(len(data), _SAMPLE_SIZE)):
            new_type = self.infer_schema(data[i])
            if new_type != current_type:
                current_type = None
                break
        # does not support multiple type yet
        if current_type:
            if isinstance(current_type, pa.DataType):
                schema = pa.list_(current_type)
            else:
                schema = {'type': list, 'item': current_type}
    elif type(data) in _python_mapping:
        schema = _python_mapping[type(data)]()
    else:
        return {'type': type(data)}
    return schema
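# --- Added illustration (not part of the original snippet) ---
# The ndarray branch above keys off pa.from_numpy_dtype, falling back to
# pa.string() for object dtype (dtype.num == 17). A minimal standalone check of
# that mapping (the helper name is mine, not from the original module):
import numpy as np
import pyarrow as pa


def _ndarray_item_type(arr: np.ndarray) -> pa.DataType:
    # Object arrays (dtype.num == 17) are assumed to hold strings.
    return pa.from_numpy_dtype(arr.dtype) if arr.dtype.num != 17 else pa.string()


assert _ndarray_item_type(np.arange(3, dtype='int64')) == pa.int64()
assert _ndarray_item_type(np.array(['a', 'b'], dtype=object)) == pa.string()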
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
def test_try_incompatible_type(self):
    arr = pa.array(TypedSequence(["foo", "bar"], try_type=pa.int64()))
    self.assertEqual(arr.type, pa.string())
import pyarrow as pa

PQ_SCHEMAS = dict()

# site_visits
fields = [
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
    pa.field('site_url', pa.string(), nullable=False)
]
PQ_SCHEMAS['site_visits'] = pa.schema(fields)

# flash_cookies
fields = [
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
    pa.field('domain', pa.string()),
    pa.field('filename', pa.string()),
    pa.field('local_path', pa.string()),
    pa.field('key', pa.string()),
    pa.field('content', pa.string())
]
PQ_SCHEMAS['flash_cookies'] = pa.schema(fields)

# profile_cookies
fields = [
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]
def test_isnull(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]

    np.testing.assert_array_equal(
        isnull(NumbaStringArray.make(array)),
        np.asarray(expected[offset:], dtype=np.bool_),
    )
    ('uint32', range(0, 10)),
    ('int32', range(0, 10)),
    ('uint64', range(0, 10)),
    ('int64', range(0, 10)),
    ('float', [0.0, 0.1, 0.2]),
    ('double', [0.0, 0.1, 0.2]),
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], None),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([[4, 5], [6]], pa.large_list(pa.int16())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
def array_inhom_chunks():
    chunk1 = pa.array(list("abc"), pa.string())
    chunk2 = pa.array(list("12345"), pa.string())
    chunk3 = pa.array(list("Z"), pa.string())
    chunked_array = pa.chunked_array([chunk1, chunk2, chunk3])
    return fr.FletcherChunkedArray(chunked_array)
        with self.assertRaises(OverflowError):
            _ = pa.array(TypedSequence([["x" * 1024]] * ((2 << 20) + 1)))  # ListArray with a bit more than 2GB


def _check_output(output, expected_num_chunks: int):
    stream = pa.BufferReader(output) if isinstance(output, pa.Buffer) else pa.memory_map(output)
    f = pa.ipc.open_stream(stream)
    pa_table: pa.Table = f.read_all()
    assert len(pa_table.to_batches()) == expected_num_chunks
    assert pa_table.to_pydict() == {"col_1": ["foo", "bar"], "col_2": [1, 2]}
    del pa_table


@pytest.mark.parametrize("writer_batch_size", [None, 1, 10])
@pytest.mark.parametrize(
    "fields", [None, {"col_1": pa.string(), "col_2": pa.int64()}, {"col_1": pa.string(), "col_2": pa.int32()}]
)
def test_write(fields, writer_batch_size):
    output = pa.BufferOutputStream()
    schema = pa.schema(fields) if fields else None
    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:
        writer.write({"col_1": "foo", "col_2": 1})
        writer.write({"col_1": "bar", "col_2": 2})
        num_examples, num_bytes = writer.finalize()
    assert num_examples == 2
    assert num_bytes > 0
    if not fields:
        fields = {"col_1": pa.string(), "col_2": pa.int64()}
    assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)
    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)
def __call__(self):
    return pa.string()
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])
])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], None),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
         pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])