def time_write_binary_table_uncompressed(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out, compression='none')
def _serialize_pyarrow_table(table):
    output_stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(output_stream, schema=table.schema) as wr:
        wr.write_table(table)
    return output_stream.getvalue()  # This will also close the stream.
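# Hedged read-back sketch, not part of the original snippet: assumes the buffer
# returned by _serialize_pyarrow_table holds an Arrow IPC stream, which
# pa.ipc.open_stream can consume directly. The function name is hypothetical.
def _deserialize_pyarrow_table(buf):
    reader = pa.ipc.open_stream(buf)
    return reader.read_all()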
def _simple_table_write_read(table, use_legacy_dataset):
    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()
    return pq.read_table(
        pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset)
def _schema2bytes(schema: SchemaWrapper) -> bytes:
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
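# Hedged counterpart, not part of the original snippet: _schema2bytes writes
# Parquet metadata, so the Arrow schema can be recovered with pq.read_schema
# over a BufferReader. The name _bytes2schema is hypothetical.
def _bytes2schema(data: bytes) -> pa.Schema:
    return pq.read_schema(pa.BufferReader(data))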
def test_empty_file():
    buf = b''
    with pytest.raises(pa.ArrowInvalid):
        pa.open_file(pa.BufferReader(buf))


def test_file_simple_roundtrip(file_fixture):
    file_fixture._check_roundtrip(as_table=False)


def test_file_write_table(file_fixture):
    file_fixture._check_roundtrip(as_table=True)


@pytest.mark.parametrize(
    "sink_factory", [lambda: io.BytesIO(), lambda: pa.BufferOutputStream()])
def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)

    _, batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.open_file(file_contents)
    result = reader.read_all()

    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)


def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
def make_serialized(schema, batches):
    with pa.BufferOutputStream() as sink:
        with pa.ipc.new_stream(sink, schema) as out:
            for batch in batches:
                out.write(batch)
        return sink.getvalue()
def open_output_stream(self, path, metadata):
    if "notfound" in path:
        raise FileNotFoundError(path)
    return pa.BufferOutputStream()
def test_validation(self):
    print("Simulate production")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"

    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"

    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test")  # wrapper to the CronusStore message
        # Following puts the menu and config to the datastore
        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)

        # Multiple streams
        store.new_partition(dataset.uuid, "key1")
        store.new_partition(dataset.uuid, "key2")
        store.new_partition(dataset.uuid, "key3")

        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        ids_ = []
        parts = store.list_partitions(dataset.uuid)

        # reload menu and config
        newmenu = Menu_pb()
        store.get(menu_uuid, newmenu)
        newconfig = Configuration()
        store.get(config_uuid, newconfig)
        print(parts)

        for _ in range(10):
            job_id = store.new_job(dataset.uuid)
            for key in parts:
                ids_.append(
                    store.register_content(
                        buf,
                        fileinfo,
                        dataset_id=dataset.uuid,
                        job_id=job_id,
                        partition_key=key,
                    ).uuid)
                store.put(ids_[-1], buf)

        for id_ in ids_:
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        # Save the store, reload
        store.save_store()
        newstore = BaseObjectStore(
            str(_path), store._name, store_uuid=store.store_uuid)
        for id_ in ids_:
            print("Get object %s", id_)
            print(type(id_))
            buf = pa.py_buffer(newstore.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        print(newmenu)
        print(newconfig)
        print("Simulation Test Done ===========================")
def write_branches_to_arrow(self, transformer, topic_name, file_id,
                            request_id):
    from .scratch_file_writer import ScratchFileWriter

    tick = time.time()
    scratch_writer = None

    total_messages = 0
    for pa_table in transformer.arrow_table():
        if self.object_store:
            if not scratch_writer:
                scratch_writer = ScratchFileWriter(
                    file_format=self.file_format)
                scratch_writer.open_scratch_file(pa_table)
            scratch_writer.append_table_to_scratch(pa_table)

        if self.messaging:
            batches = pa_table.to_batches(
                max_chunksize=transformer.chunk_size)

            for batch in batches:
                messaging_tick = time.time()

                # Just need to make key unique to shard messages across brokers
                key = str.encode(transformer.file_path + "-" +
                                 str(total_messages))

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()

                self.messaging.publish_message(topic_name, key,
                                               sink.getvalue())

                self.avg_cell_size.append(
                    len(sink.getvalue().to_pybytes()) /
                    len(transformer.attr_name_list) / batch.num_rows)
                total_messages += 1
                self.messaging_timings.append(time.time() - messaging_tick)

    if self.object_store:
        object_store_tick = time.time()
        scratch_writer.close_scratch_file()

        print("Writing parquet to ", request_id, " as ",
              transformer.file_path.replace('/', ':'))

        self.object_store.upload_file(
            request_id, transformer.file_path.replace('/', ':'),
            scratch_writer.file_path)

        scratch_writer.remove_scratch_file()
        self.object_store_timing = time.time() - object_store_tick

    tock = time.time()

    if self.messaging:
        avg_avg_cell_size = sum(self.avg_cell_size) / len(self.avg_cell_size) \
            if len(self.avg_cell_size) else 0

        print("Wrote " + str(total_messages) + " events to " + topic_name,
              "Avg Cell Size = " + str(avg_avg_cell_size) + " bytes")

    # Parenthesize (tock - tick) so the elapsed time, not just tick,
    # is converted to minutes.
    print("Real time: " + str(round((tock - tick) / 60.0, 2)) + " minutes")
def test_dir_glob(self):
    print("Testing directory globbing")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"

    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"

    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top level store

    with tempfile.TemporaryDirectory() as dirpath:
        mystore.address = dirpath + "/test"
        _path = Path(mystore.address)
        _path.mkdir()
        store = BaseObjectStore(
            str(_path), "test")  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        path = dirpath + "/test/dummy2.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        objs_ = store.register_content(
            mystore.address,
            fileinfo,
            glob="*arrow",
            dataset_id=dataset.uuid,
            partition_key="key",
        )
        for obj_ in objs_:
            print(obj_.uuid, store[obj_.uuid].address)
            buf = pa.py_buffer(store.get(obj_.uuid))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        ds = store.list(suffix="dataset")
        for d in ds:
            p = d.uuid + ".part_key"
            f = store.list(prefix=p, suffix="arrow")
            print(f)

        print("Test Done ===========================")
def test_register_dataset(self):
    # Create a fake dataset
    # from a menu_id and menu msg
    # from a config_id and config msg
    # add files
    # add tables
    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"

    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"

    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top level store

    print("Testing directory globbing")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    # schema = batch.schema.to_pybytes()
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    fileinfo = FileObjectInfo()
    fileinfo.type = 5
    fileinfo.aux.num_columns = 3

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(str(_path), "test")
        store_id = store.store_uuid
        print(store.store_info.created.ToDatetime())

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        print(menu_uuid)
        print(config_uuid)
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")
        job_id = store.new_job(dataset.uuid)

        store.register_content(
            buf,
            fileinfo,
            dataset_id=dataset.uuid,
            partition_key="key",
            job_id=job_id,
        )
        ds = store.list(suffix="dataset")
        print(ds)
def test_identical_files(self):
    print("Testing add file from path")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"

    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"

    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test")  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        id_ = store.register_content(
            path, fileinfo, dataset_id=dataset.uuid,
            partition_key="key").uuid
        print(id_, store[id_].address)
        buf = pa.py_buffer(store._get_object(id_))
        reader = pa.ipc.open_file(buf)
        self.assertEqual(reader.num_record_batches, 10)

        path = dirpath + "/test/dummy2.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        id_ = store.register_content(
            path, fileinfo, dataset_id=dataset.uuid,
            partition_key="key").uuid
        print(id_, store[id_].address)
        buf = pa.py_buffer(store.get(id_))
        reader = pa.ipc.open_file(buf)
        self.assertEqual(reader.num_record_batches, 10)

        print("Test Done ===========================")
def time_convert_pandas_and_write_binary_table(self):
    out = pa.BufferOutputStream()
    pq.write_table(pa.table(self.table_df), out)
def time_write_binary_table_no_dictionary(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out, use_dictionary=False)
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle', pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic', pa.int32(),
                 metadata={b'other': b'abc', field_id: b'1'}),
        pa.field('list',
                 pa.list_(pa.field('list-inner', pa.int32(),
                                   metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id', pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id', pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None

    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {"files": {"core_data": "file.parquest"}},
            "part_2": {"files": {"core_data": "file2.parquest"}},
        },
        "indices": {
            "product_id": "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]

    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
buf = b''
with pytest.raises(pa.ArrowInvalid):
    pa.ipc.open_file(pa.BufferReader(buf))


def test_file_simple_roundtrip(file_fixture):
    file_fixture._check_roundtrip(as_table=False)


def test_file_write_table(file_fixture):
    file_fixture._check_roundtrip(as_table=True)


@pytest.mark.parametrize("sink_factory", [
    lambda: io.BytesIO(),
    lambda: pa.BufferOutputStream()
])
def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)

    batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.ipc.open_file(file_contents)
    result = reader.read_all()

    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)


def test_open_file_from_buffer(file_fixture):
def from_record_batches(
    cls,
    record_batches,
    output_types,
    output_shapes=None,
    columns=None,
    batch_size=None,
    batch_mode="keep_remainder",
):
    """Create an ArrowDataset directly from Arrow record batches.
    This constructor requires pyarrow to be installed.

    Args:
        record_batches: An Arrow record batch or sequence of record batches
        output_types: Tensor dtypes of the output tensors
        output_shapes: TensorShapes of the output tensors or None to
                       infer partial
        batch_size: Batch size of output tensors, setting a batch size here
                    will create batched tensors from Arrow memory and can be
                    more efficient than using tf.data.Dataset.batch().
                    NOTE: batch_size does not need to be set if
                    batch_mode='auto'
        batch_mode: Mode of batching, supported strings:
                    "keep_remainder" (default, keeps partial batch data),
                    "drop_remainder" (discard partial batch data),
                    "auto" (size to number of records in Arrow record batch)
        columns: A list of column indices to be used in the Dataset
    """
    import pyarrow as pa  # pylint: disable=import-outside-toplevel

    if isinstance(record_batches, pa.RecordBatch):
        record_batches = [record_batches]
    if columns is None:
        columns = tuple(range(record_batches[0].num_columns))
    assert record_batches
    if tf.executing_eagerly():
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, record_batches[0].schema)
        for batch in record_batches:
            writer.write_batch(batch)
        writer.close()
        serialized_batches = None
        arrow_buffer = sink.getvalue()
    else:
        buf = io.BytesIO()
        writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
        for batch in record_batches:
            writer.write_batch(batch)
        writer.close()
        serialized_batches = tf.convert_to_tensor(
            buf.getvalue(), dtype=dtypes.string, name="serialized_batches"
        )
        arrow_buffer = None
    return cls(
        serialized_batches,
        columns,
        output_types,
        output_shapes,
        batch_size=batch_size,
        batch_mode=batch_mode,
        arrow_buffer=arrow_buffer,
    )
def test_buffer_readwrite_with_bad_writeoptions():
    from pyarrow import orc
    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    table = pa.table({"int64": a})

    # batch_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, batch_size=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, batch_size=-100)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, batch_size=1024.23)

    # file_version must be 0.11 or 0.12
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, file_version=0.13)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, file_version='1.1')

    # stripe_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, stripe_size=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, stripe_size=-400)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, stripe_size=4096.73)

    # compression must be among the given options
    with pytest.raises(TypeError):
        orc.write_table(table, buffer_output_stream, compression=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, compression='none')
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, compression='zlid')

    # compression_block_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        compression_block_size=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        compression_block_size=-200)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        compression_block_size=1096.73)

    # compression_strategy must be among the given options
    with pytest.raises(TypeError):
        orc.write_table(table, buffer_output_stream, compression_strategy=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        compression_strategy='no')
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        compression_strategy='large')

    # row_index_stride must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, row_index_stride=0)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, row_index_stride=-800)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, row_index_stride=3096.29)

    # padding_tolerance must be possible to cast to float
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, padding_tolerance='cat')

    # dictionary_key_size_threshold must be possible to cast to
    # float between 0.0 and 1.0
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        dictionary_key_size_threshold='arrow')
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        dictionary_key_size_threshold=1.2)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        dictionary_key_size_threshold=-3.2)

    # bloom_filter_columns must be convertible to a list containing
    # nonnegative integers
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        bloom_filter_columns="string")
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        bloom_filter_columns=[0, 1.4])
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream,
                        bloom_filter_columns={0, 2, -1})

    # bloom_filter_fpp must be convertible to a float between 0.0 and 1.0
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, bloom_filter_fpp='arrow')
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, bloom_filter_fpp=1.1)
    with pytest.raises(ValueError):
        orc.write_table(table, buffer_output_stream, bloom_filter_fpp=-0.1)
def write_branches_to_arrow(messaging, topic_name, file_path, file_id,
                            attr_name_list, chunk_size, server_endpoint,
                            event_limit=None, object_store=None):
    scratch_writer = None

    event_iterator = XAODEvents(file_path, attr_name_list)
    transformer = XAODTransformer(event_iterator)

    batch_number = 0
    total_events = 0
    total_bytes = 0
    for pa_table in transformer.arrow_table(chunk_size, event_limit):
        if object_store:
            if not scratch_writer:
                scratch_writer = _open_scratch_file(args.result_format,
                                                    pa_table)
            _append_table_to_scratch(args.result_format, scratch_writer,
                                     pa_table)

        total_events = total_events + pa_table.num_rows
        batches = pa_table.to_batches(chunksize=chunk_size)

        for batch in batches:
            if messaging:
                key = file_path + "-" + str(batch_number)

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()
                messaging.publish_message(topic_name, key, sink.getvalue())

                total_bytes = total_bytes + len(sink.getvalue().to_pybytes())
                avg_cell_size = len(sink.getvalue().to_pybytes()) / len(
                    attr_name_list) / batch.num_rows
                print(
                    "Batch number " + str(batch_number) + ", " +
                    str(batch.num_rows) + " events published to " +
                    topic_name,
                    "Avg Cell Size = " + str(avg_cell_size) + " bytes")
                batch_number += 1

    if object_store:
        _close_scratch_file(args.result_format, scratch_writer)
        print("Writing parquet to ", args.request_id, " as ",
              file_path.replace('/', ':'))
        object_store.upload_file(args.request_id, file_path.replace('/', ':'),
                                 "/tmp/out")
        os.remove("/tmp/out")

    print("===> Total Events ", total_events)
    print("===> Total Bytes ", total_bytes)

    if server_endpoint:
        post_status_update(server_endpoint, "File " + file_path + " complete")

    put_file_complete(server_endpoint, file_path, file_id, "success",
                      num_messages=batch_number, total_time="??",
                      total_events=total_events, total_bytes=total_bytes)
def enqueue(self, uri, **data):
    sink = pa.BufferOutputStream()
    field_list = []
    data_list = []
    for key, value in data.items():
        if isinstance(value, str):
            # str value will be considered as image path
            field = pa.field(key, pa.string())
            data = self.encode_image(value)
            # b = bytes(data, "utf-8")
            data = pa.array([data])
            # ba = pa.array(b, type=pa.binary())
            field_list.append(field)
            data_list.append(data)
        elif isinstance(value, np.ndarray):
            # ndarray value will be considered as tensor
            indices_field = pa.field("indiceData", pa.list_(pa.int32()))
            indices_shape_field = pa.field("indiceShape",
                                           pa.list_(pa.int32()))
            data_field = pa.field("data", pa.list_(pa.float32()))
            shape_field = pa.field("shape", pa.list_(pa.int32()))
            tensor_type = pa.struct([indices_field, indices_shape_field,
                                     data_field, shape_field])
            field = pa.field(key, tensor_type)
            shape = np.array(value.shape)
            d = value.astype("float32").flatten()
            # data = pa.array([{'data': d}, {'shape': shape}, {}],
            #                 type=tensor_type)
            data = pa.array([{'indiceData': []},
                             {'indiceShape': []},
                             {'data': d},
                             {'shape': shape}], type=tensor_type)
            field_list.append(field)
            data_list.append(data)
        elif isinstance(value, list):
            # list will be considered as sparse tensor
            assert len(value) == 3, \
                "Sparse Tensor must have list of ndarray with length 3, " \
                "which represent indices, values, shape respectively"
            indices_field = pa.field("indiceData", pa.list_(pa.int32()))
            indices_shape_field = pa.field("indiceShape",
                                           pa.list_(pa.int32()))
            value_field = pa.field("data", pa.list_(pa.float32()))
            shape_field = pa.field("shape", pa.list_(pa.int32()))
            sparse_tensor_type = pa.struct([indices_field,
                                            indices_shape_field,
                                            value_field, shape_field])
            field = pa.field(key, sparse_tensor_type)
            shape = value[2]
            values = value[1]
            indices = value[0].astype("float32").flatten()
            indices_shape = value[0].shape
            data = pa.array([{'indiceData': indices},
                             {'indiceShape': indices_shape},
                             {'data': values},
                             {'shape': shape}], type=sparse_tensor_type)
            field_list.append(field)
            data_list.append(data)
        else:
            raise TypeError("Your request does not match any schema, "
                            "please check.")

    schema = pa.schema(field_list)
    batch = pa.RecordBatch.from_arrays(data_list, schema)
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()
    b = buf.to_pybytes()
    b64str = self.base64_encode_image(b)
    d = {"uri": uri, "data": b64str}
    self.__enqueue_data(d)
def _serialize(data: Any) -> Tuple[bytes, Serialization]:
    """Serializes an object to a ``pa.Buffer``.

    The way the object is serialized depends on the nature of the object:
    ``pa.RecordBatch`` and ``pa.Table`` are serialized using ``pyarrow``
    functions. All other cases are serialized through the ``pickle`` library.

    Args:
        data: The object/data to be serialized.

    Returns:
        Tuple of the serialized data (in ``pa.Buffer`` format) and the
        :class:`Serialization` that was used.

    Raises:
        SerializationError: If the data could not be serialized.

    Note:
        ``pickle`` does not include the code of custom functions or classes,
        it only pickles their names. Following the official `Python Docs
        <https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled>`_:
        "Thus the defining module must be importable in the unpickling
        environment, and the module must contain the named object, otherwise
        an exception will be raised."
    """
    if isinstance(data, (pa.RecordBatch, pa.Table)):
        # Use the intended pyarrow functionalities when possible.
        if isinstance(data, pa.Table):
            serialization = Serialization.ARROW_TABLE
        else:
            serialization = Serialization.ARROW_BATCH

        output_buffer = pa.BufferOutputStream()
        try:
            writer = pa.RecordBatchStreamWriter(output_buffer, data.schema)
            writer.write(data)
            writer.close()
        except pa.ArrowSerializationError:
            raise error.SerializationError(
                f"Could not serialize data of type {type(data)}.")

        serialized = output_buffer.getvalue()

    else:
        # All other cases use the pickle library.
        serialization = Serialization.PICKLE

        # Use the best protocol possible, for reference see:
        # https://docs.python.org/3/library/pickle.html#pickle-protocols
        try:
            serialized = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
        except pickle.PicklingError:
            raise error.SerializationError(
                f"Could not pickle data of type {type(data)}.")

        # NOTE: zero-copy view on the bytes.
        serialized = pa.py_buffer(serialized)

    return serialized, serialization
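# Hedged counterpart sketch, not part of the original module: inverts _serialize
# by dispatching on the Serialization value it returned. The enum members
# (ARROW_TABLE, ARROW_BATCH, PICKLE) appear in the snippet above; the function
# name and the rest of this sketch are assumptions.
def _deserialize(serialized: pa.Buffer, serialization: Serialization) -> Any:
    if serialization in (Serialization.ARROW_TABLE, Serialization.ARROW_BATCH):
        reader = pa.ipc.open_stream(serialized)
        if serialization == Serialization.ARROW_BATCH:
            # A single RecordBatch was written, so read the first batch back.
            return reader.read_next_batch()
        return reader.read_all()
    # PICKLE case: pa.Buffer supports the buffer protocol, which pickle.loads
    # accepts directly.
    return pickle.loads(serialized)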
def test_native_file_write_reject_unicode():
    # ARROW-3227
    nf = pa.BufferOutputStream()
    with pytest.raises(TypeError):
        nf.write(u'foo')
def map(
    self,
    function,
    with_indices: bool = False,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    remove_columns: Optional[List[str]] = None,
    keep_in_memory: bool = False,
    load_from_cache_file: bool = True,
    cache_file_name: Optional[str] = None,
    writer_batch_size: Optional[int] = 1000,
    arrow_schema: Optional[pa.Schema] = None,
    disable_nullable: bool = True,
):
    """Apply a function to all the elements in the table (individually or in
    batches) and update the table (if the function does update examples).

    Args:
        `function` (`callable`): with one of the following signatures:
            - `function(example: Dict) -> Union[Dict, Any]` if `batched=False` and `with_indices=False`
            - `function(example: Dict, indices: int) -> Union[Dict, Any]` if `batched=False` and `with_indices=True`
            - `function(batch: Dict[List]) -> Union[Dict, Any]` if `batched=True` and `with_indices=False`
            - `function(batch: Dict[List], indices: List[int]) -> Union[Dict, Any]` if `batched=True` and `with_indices=True`
        `with_indices` (`bool`, default: `False`): Provide example indices to `function`
        `batched` (`bool`, default: `False`): Provide batch of examples to `function`
        `batch_size` (`Optional[int]`, default: `1000`): Number of examples per batch provided to `function`
            if `batched=True`. `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a
            single batch to `function`
        `remove_columns` (`Optional[List[str]]`, default: `None`): Remove a selection of columns while doing
            the mapping. Columns will be removed before updating the examples with the output of `function`,
            i.e. if `function` is adding columns with names in `remove_columns`, these columns will be kept.
        `keep_in_memory` (`bool`, default: `False`): Keep the dataset in memory instead of writing it to a cache file.
        `load_from_cache_file` (`bool`, default: `True`): If a cache file storing the current computation from
            `function` can be identified, use it instead of recomputing.
        `cache_file_name` (`Optional[str]`, default: `None`): Provide the name of a cache file to use to store the
            results of the computation instead of the automatically generated cache file name.
        `writer_batch_size` (`int`, default: `1000`): Number of rows per write operation for the cache file writer.
            A higher value gives smaller cache files, a lower value consumes less temporary memory while running `.map()`.
        `arrow_schema` (`Optional[pa.Schema]`, default: `None`): Use a specific Apache Arrow Schema to store the
            cache file instead of the automatically generated one.
        `disable_nullable` (`bool`, default: `True`): Disallow null values in the table.
    """
    # If the array is empty we do nothing
    if len(self) == 0:
        return self

    # Select the columns (arrow columns) to process
    if remove_columns is not None and any(
            col not in self._data.column_names for col in remove_columns):
        raise ValueError(
            "Column to remove {} not in the dataset. Current columns in the dataset: {}"
            .format(
                list(
                    filter(lambda col: col not in self._data.column_names,
                           remove_columns)),
                self._data.column_names,
            ))

    # If we do batch computation but no batch size is provided, default to the full dataset
    if batched and (batch_size is None or batch_size <= 0):
        batch_size = self._data.num_rows

    # Check if the function returns updated examples
    def does_function_return_dict(inputs, indices):
        """Does the function return a dict."""
        processed_inputs = function(
            inputs, indices) if with_indices else function(inputs)
        does_return_dict = isinstance(processed_inputs, Mapping)

        if does_return_dict is False and processed_inputs is not None:
            raise TypeError(
                "Provided `function` which is applied to all elements of table returns a variable of type {}. "
                "Make sure provided `function` returns a variable of type `dict` to update the dataset or `None` "
                "if you are only interested in side effects.".format(
                    type(processed_inputs)))
        elif isinstance(test_indices, list) and does_return_dict is True:
            all_dict_values_are_lists = all(
                isinstance(value, list)
                for value in processed_inputs.values())
            if all_dict_values_are_lists is False:
                raise TypeError(
                    "Provided `function` which is applied to all elements of table returns a `dict` of types {}. "
                    "When using `batched=True`, make sure provided `function` returns a `dict` of types `list`."
                    .format([type(x) for x in processed_inputs.values()]))

        return does_return_dict

    # We only update the data table (and use the cache) if the function returns a dict.
    # Test it on the first element or a small batch (0, 1) for batched inputs
    test_inputs = self[:2] if batched else self[0]
    test_indices = [0, 1] if batched else 0
    update_data = does_function_return_dict(test_inputs, test_indices)

    def apply_function_on_filtered_inputs(inputs, indices):
        """Utility to apply the function on a selection of columns."""
        processed_inputs = function(
            inputs, indices) if with_indices else function(inputs)
        if not update_data:
            return None  # Nothing to update, let's move on
        if remove_columns is not None:
            for column in remove_columns:
                inputs.pop(column)
        if self._format_type is not None:
            inputs = self._getitem(
                key=(indices if isinstance(indices, int) else slice(
                    indices[0], indices[-1])),
                format_type=None,
                format_columns=None,
            )
        inputs.update(processed_inputs)
        return inputs

    # Find the output schema if none is given
    test_inputs = self[:2] if batched else self[0]
    test_indices = [0, 1] if batched else 0
    test_output = apply_function_on_filtered_inputs(test_inputs, test_indices)
    if arrow_schema is None and update_data:
        if not batched:
            test_output = self._nest(test_output)
        test_output = convert_tuples_in_lists(test_output)
        arrow_schema = pa.Table.from_pydict(test_output).schema
        if disable_nullable:
            arrow_schema = pa.schema(
                pa.field(field.name, field.type, nullable=False)
                for field in arrow_schema)

    # Check if we've already cached this computation (indexed by a hash)
    if self._data_files and update_data:
        if cache_file_name is None:
            # we create a unique hash from the function, current dataset file and the mapping args
            cache_kwargs = {
                "with_indices": with_indices,
                "batched": batched,
                "batch_size": batch_size,
                "remove_columns": remove_columns,
                "keep_in_memory": keep_in_memory,
                "load_from_cache_file": load_from_cache_file,
                "cache_file_name": cache_file_name,
                "writer_batch_size": writer_batch_size,
                "arrow_schema": arrow_schema,
                "disable_nullable": disable_nullable,
            }
            cache_file_name = self._get_cache_file_path(
                function, cache_kwargs)
        if os.path.exists(cache_file_name) and load_from_cache_file:
            logger.info("Loading cached processed dataset at %s",
                        cache_file_name)
            return Dataset.from_file(cache_file_name)

    # Prepare output buffer and batched writer in memory or on file if we update the table
    if update_data:
        if keep_in_memory or not self._data_files:
            buf_writer = pa.BufferOutputStream()
            writer = ArrowWriter(schema=arrow_schema,
                                 stream=buf_writer,
                                 writer_batch_size=writer_batch_size)
        else:
            buf_writer = None
            logger.info("Caching processed dataset at %s", cache_file_name)
            writer = ArrowWriter(schema=arrow_schema,
                                 path=cache_file_name,
                                 writer_batch_size=writer_batch_size)

    # Loop over single examples or batches and write to buffer/file if examples are to be updated
    if not batched:
        for i, example in tqdm(enumerate(self)):
            example = apply_function_on_filtered_inputs(example, i)
            if update_data:
                writer.write(example)
    else:
        for i in tqdm(range(0, len(self), batch_size)):
            batch = self[i:i + batch_size]
            indices = list(
                range(*(slice(i, i + batch_size).indices(
                    self._data.num_rows))))  # Something simpler?
            batch = apply_function_on_filtered_inputs(batch, indices)
            if update_data:
                writer.write_batch(batch)

    if update_data:
        # close_stream=bool(buf_writer is None): we only close if we are writing in a file
        writer.finalize()

        # Create new Dataset from buffer or file
        if buf_writer is None:
            return Dataset.from_file(cache_file_name)
        else:
            return Dataset.from_buffer(buf_writer.getvalue())
    else:
        return self
def open_append_stream(self, path):
    if "notfound" in path:
        raise FileNotFoundError(path)
    return pa.BufferOutputStream()
def __init__(self):
    self.buf = pa.BufferOutputStream()
def _get_sink(self):
    return pa.BufferOutputStream()
def ipc_write_batch(batch):
    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return stream.getvalue()
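# Hedged counterpart, not part of the original snippet: reads the single batch
# back from the IPC stream buffer produced by ipc_write_batch. The name
# ipc_read_batch is hypothetical.
def ipc_read_batch(buf):
    reader = pa.ipc.open_stream(buf)
    return reader.read_next_batch()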
def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs):
    stream = pa.BufferOutputStream()
    _write_table(table, stream, **write_kwargs)
    buf = stream.getvalue()
    return _read_table(buf, use_legacy_dataset=use_legacy_dataset)
def time_write_binary_table(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out)