def test_register_object(self):
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    mymenu = CronusObject()
    mymenu.name = "menu"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        id_ = store.register_content(
            path, fileinfo, dataset_id=dataset.uuid, partition_key="key"
        ).uuid
        print(store[id_].address)
        buf = pa.py_buffer(store.get(id_))
        reader = pa.ipc.open_file(buf)
        self.assertEqual(reader.num_record_batches, 10)
def test_config(self):
    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        config_uuid = store.register_content(myconfig, configinfo).uuid
        store.put(config_uuid, myconfig)

        aconfig = Configuration()
        store.get(config_uuid, aconfig)
        self.assertEqual(myconfig.name, aconfig.name)
        self.assertEqual(myconfig.uuid, aconfig.uuid)
def test_validation(self):
    print("Simulate production")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message

        # Register the menu and config in the datastore
        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)

        # Multiple streams
        store.new_partition(dataset.uuid, "key1")
        store.new_partition(dataset.uuid, "key2")
        store.new_partition(dataset.uuid, "key3")

        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        ids_ = []
        parts = store.list_partitions(dataset.uuid)

        # Reload the menu and config
        newmenu = Menu_pb()
        store.get(menu_uuid, newmenu)
        newconfig = Configuration()
        store.get(config_uuid, newconfig)
        print(parts)

        for _ in range(10):
            job_id = store.new_job(dataset.uuid)
            for key in parts:
                ids_.append(
                    store.register_content(
                        buf,
                        fileinfo,
                        dataset_id=dataset.uuid,
                        job_id=job_id,
                        partition_key=key,
                    ).uuid
                )
                store.put(ids_[-1], buf)

        for id_ in ids_:
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        # Save the store, then reload it into a new instance
        store.save_store()
        newstore = BaseObjectStore(
            str(_path), store._name, store_uuid=store.store_uuid
        )
        for id_ in ids_:
            print(f"Get object {id_}")
            print(type(id_))
            buf = pa.py_buffer(newstore.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
        print(newmenu)
        print(newconfig)
    print("Simulation Test Done ===========================")
def test_register_dataset(self):
    # Create a fake dataset
    #   from a menu_id and menu msg
    #   from a config_id and config msg
    #   add files
    #   add tables
    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top-level store

    print("Testing directory globbing")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    # schema = batch.schema.to_pybytes()
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    fileinfo = FileObjectInfo()
    fileinfo.type = 5
    fileinfo.aux.num_columns = 3

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(str(_path), "test")
        store_id = store.store_uuid
        print(store.store_info.created.ToDatetime())

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        print(menu_uuid)
        print(config_uuid)
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")
        job_id = store.new_job(dataset.uuid)
        store.register_content(
            buf,
            fileinfo,
            dataset_id=dataset.uuid,
            partition_key="key",
            job_id=job_id,
        )
        ds = store.list(suffix="dataset")
        print(ds)
def test_dir_glob(self):
    print("Testing directory globbing")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top-level store

    with tempfile.TemporaryDirectory() as dirpath:
        mystore.address = dirpath + "/test"
        _path = Path(mystore.address)
        _path.mkdir()
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())
        path = dirpath + "/test/dummy2.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        objs_ = store.register_content(
            mystore.address,
            fileinfo,
            glob="*arrow",
            dataset_id=dataset.uuid,
            partition_key="key",
        )
        for obj_ in objs_:
            print(obj_.uuid, store[obj_.uuid].address)
            buf = pa.py_buffer(store.get(obj_.uuid))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        ds = store.list(suffix="dataset")
        for d in ds:
            p = d.uuid + ".part_key"
            f = store.list(prefix=p, suffix="arrow")
            print(f)
    print("Test Done ===========================")
def example_configuration(table_id, seed=42):
    # First define a data generator using SimuTable
    max_malloc = 2147483648  # Maximum memory allowed in the Arrow memory pool
    max_buffer_size = 2147483648  # Maximum size of a serialized IPC message
    write_csv = True  # Output a csv file for each Arrow output file
    sample_ndatums = 1  # Preprocess job: number of files to sample from the dataset
    sample_nchunks = 10  # Preprocess job: number of chunks to sample from a file
    linesep = "\r\n"  # Line delimiter to scan for on csv input
    delimiter = ","  # Field delimiter
    blocksize = 2**16  # Size of chunked data in memory
    header = ""  # Predefined header
    footer = ""  # Predefined footer
    header_offset = 0  # Number of bytes to scan past the header
    footer_size = 0  # Size of the footer in bytes
    schema = []  # Predefined list of field names on input
    encoding = "utf8"  # Text encoding
    gen_nbatches = 5  # Number of batches to generate
    gen_nrows = 1000  # Number of rows per batch

    config = Configuration()  # Cronus Configuration message
    config.uuid = str(uuid.uuid4())
    config.name = f"{config.uuid}.config.pb"
    config.max_malloc_size_bytes = max_malloc

    generator = SimuTableGen(
        "generator",
        nbatches=gen_nbatches,
        num_rows=gen_nrows,
        file_type=1,  # Output type, cronus.proto filetype
        table_id=table_id,
        seed=seed,
    )
    # Set the generator configuration
    config.input.generator.config.CopyFrom(generator.to_msg())

    filehandler = FileHandlerTool(
        "filehandler",
        filetype="csv",  # TBD: use filetype metadata
        blocksize=blocksize,
        delimiter=delimiter,
        linesep=linesep,
        header=header,
        footer=footer,
        header_offset=header_offset,
        footer_size=footer_size,
        schema=schema,
        encoding=encoding,
        seed=seed,
    )
    # Add to the tools
    config.tools[filehandler.name].CopyFrom(filehandler.to_msg())

    csvtool = CsvTool("csvtool", block_size=(2 * blocksize))
    config.tools[csvtool.name].CopyFrom(csvtool.to_msg())

    # filtercoltool = FilterColTool(
    #     "filtercoltool", columns=["record_id", "Product", "Unit"]
    # )
    # filtercoltool = FilterColTool(
    #     "filtercoltool", columns=["record_id", "SIN", "DOB"]
    # )
    # config.tools[filtercoltool.name].CopyFrom(filtercoltool.to_msg())

    writer = BufferOutputWriter(
        "bufferwriter", BUFFER_MAX_SIZE=max_buffer_size, write_csv=write_csv
    )
    config.tools[writer.name].CopyFrom(writer.to_msg())

    tdigesttool = TDigestTool("tdigesttool")
    config.tools[tdigesttool.name].CopyFrom(tdigesttool.to_msg())

    sampler = config.sampler
    sampler.ndatums = sample_ndatums
    sampler.nchunks = sample_nchunks

    return config
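
# Illustrative sketch (not part of the test suite): one way the configuration built by
# example_configuration might be persisted to and retrieved from a BaseObjectStore,
# using only the calls exercised in the tests above (register_content, put, get).
# The function name, default table_id, and store path/name are placeholders.
def example_store_configuration(table_id="dummy_table"):
    config = example_configuration(table_id)
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()

    with tempfile.TemporaryDirectory() as dirpath:
        store = BaseObjectStore(dirpath + "/test", "test")

        # Register the configuration, then store its serialized payload
        config_uuid = store.register_content(config, configinfo).uuid
        store.put(config_uuid, config)

        # Retrieve it into a fresh message and confirm the round trip
        retrieved = Configuration()
        store.get(config_uuid, retrieved)
        assert retrieved.uuid == config.uuid

    return config_uuid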