Example #1
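Note: the snippets below omit their imports. The following is a minimal sketch of what they assume; the standard-library and pyarrow imports are certain, while the project-specific classes come from the cronus package and its generated protobufs, whose exact import paths depend on the project layout.

import uuid
import tempfile
from pathlib import Path

import pyarrow as pa

# BaseObjectStore, CronusObjectStore, CronusObject, Configuration, Menu_pb,
# MenuObjectInfo, ConfigObjectInfo, FileObjectInfo, DummyMessage, and the
# tool classes in Example #6 are assumed to be imported from the cronus
# package and its generated protobuf modules (paths not shown here).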
    def test_register_object(self):
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        mymenu = CronusObject()
        mymenu.name = "menu"
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(store[id_].address)
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
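For reference, the buffer written above can be read back batch by batch with plain pyarrow, independent of the store; a minimal sketch using the sink from this test:

# Read the Arrow IPC file buffer directly, without the object store.
buf = sink.getvalue()
reader = pa.ipc.open_file(buf)
for i in range(reader.num_record_batches):
    batch = reader.get_batch(i)  # each batch holds the 4 rows written above
    assert batch.num_rows == 4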
Example #2
    def test_config(self):
        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            config_uuid = store.register_content(myconfig, configinfo).uuid
            store.put(config_uuid, myconfig)
            aconfig = Configuration()
            store.get(config_uuid, aconfig)
            self.assertEqual(myconfig.name, aconfig.name)
            self.assertEqual(myconfig.uuid, aconfig.uuid)
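The created.GetCurrentTime() calls above use the standard google.protobuf Timestamp helpers; a standalone sketch:

from google.protobuf.timestamp_pb2 import Timestamp

ts = Timestamp()
ts.GetCurrentTime()     # stamp with the current system time
print(ts.ToDatetime())  # convert back to a datetime, as in Example #4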
Example #3
    def test_validation(self):
        print("Simulate production")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            # Following puts the menu and config to the datastore
            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)

            # Multiple streams
            store.new_partition(dataset.uuid, "key1")
            store.new_partition(dataset.uuid, "key2")
            store.new_partition(dataset.uuid, "key3")

            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            ids_ = []
            parts = store.list_partitions(dataset.uuid)
            # reload menu and config
            newmenu = Menu_pb()
            store.get(menu_uuid, newmenu)
            newconfig = Configuration()
            store.get(config_uuid, newconfig)
            print(parts)
            for _ in range(10):
                job_id = store.new_job(dataset.uuid)

                for key in parts:
                    ids_.append(
                        store.register_content(
                            buf,
                            fileinfo,
                            dataset_id=dataset.uuid,
                            job_id=job_id,
                            partition_key=key,
                        ).uuid)
                    store.put(ids_[-1], buf)

            for id_ in ids_:
                buf = pa.py_buffer(store.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            # Save the store, reload
            store.save_store()
            newstore = BaseObjectStore(str(_path),
                                       store._name,
                                       store_uuid=store.store_uuid)
            for id_ in ids_:
                print("Get object %s", id_)
                print(type(id_))
                buf = pa.py_buffer(newstore.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)
            print(newmenu)
            print(newconfig)
            print("Simulation Test Done ===========================")
Example #4
    def test_register_dataset(self):

        # Create a fake dataset
        # from a menu_id and menu msg
        # from a config_id and config msg
        # add files
        # add tables

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        # schema = batch.schema.to_pybytes()
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.num_columns = 3

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(str(_path), "test")
            store_id = store.store_uuid
            print(store.store_info.created.ToDatetime())

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            print(menu_uuid)
            print(config_uuid)
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            job_id = store.new_job(dataset.uuid)
            store.register_content(
                buf,
                fileinfo,
                dataset_id=dataset.uuid,
                partition_key="key",
                job_id=job_id,
            )

            ds = store.list(suffix="dataset")
            print(ds)
Example #5
    def test_dir_glob(self):
        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        with tempfile.TemporaryDirectory() as dirpath:
            mystore.address = dirpath + "/test"
            _path = Path(mystore.address)
            _path.mkdir()
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            path = dirpath + "/test/dummy2.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())

            objs_ = store.register_content(
                mystore.address,
                fileinfo,
                glob="*arrow",
                dataset_id=dataset.uuid,
                partition_key="key",
            )
            for obj_ in objs_:
                print(obj_.uuid, store[obj_.uuid].address)
                buf = pa.py_buffer(store.get(obj_.uuid))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            ds = store.list(suffix="dataset")
            for d in ds:
                p = d.uuid + ".part_key"
                f = store.list(prefix=p, suffix="arrow")
                print(f)
        print("Test Done ===========================")
Example #6
def example_configuration(table_id, seed=42):
    # First define a data generator using SimuTable

    max_malloc = 2147483648  # Maximum memory allowed in the Arrow memory pool (2 GiB)
    max_buffer_size = 2147483648  # Maximum size of a serialized ipc message (2 GiB)
    write_csv = True  # Output csv files for each arrow output file
    sample_ndatums = 1  # Number of files the preprocess job samples from a dataset
    sample_nchunks = 10  # Number of chunks the preprocess job samples from a file
    linesep = "\r\n"  # Line delimiter to scan for on csv input
    delimiter = ","  # Field delimiter
    blocksize = 2**16  # Size of in-memory data chunks, in bytes (64 KiB)
    header = ""  # Predefined header
    footer = ""  # Predefined footer
    header_offset = 0  # Number of bytes to scan past the header
    footer_size = 0  # Size of the footer, in bytes
    schema = []  # Predefined list of field names on input
    encoding = "utf8"  # Text encoding of csv input
    gen_nbatches = 5  # Number of batches to generate
    gen_nrows = 1000  # Number of rows per batch

    config = Configuration()  # Cronus Configuration message
    config.uuid = str(uuid.uuid4())
    config.name = f"{config.uuid}.config.pb"
    config.max_malloc_size_bytes = max_malloc

    generator = SimuTableGen(
        "generator",
        nbatches=gen_nbatches,
        num_rows=gen_nrows,
        file_type=1,  # Output file type, as defined in cronus.proto
        table_id=table_id,
        seed=seed,
    )

    # Set the generator configuration
    config.input.generator.config.CopyFrom(generator.to_msg())

    filehandler = FileHandlerTool(
        "filehandler",
        filetype="csv",  # TBD use filetype metadata
        blocksize=blocksize,
        delimiter=delimiter,
        linesep=linesep,
        header=header,
        footer=footer,
        header_offset=header_offset,
        footer_size=footer_size,
        schema=schema,
        encoding=encoding,
        seed=seed,
    )
    # Add to the tools
    config.tools[filehandler.name].CopyFrom(filehandler.to_msg())

    csvtool = CsvTool("csvtool", block_size=(2 * blocksize))
    config.tools[csvtool.name].CopyFrom(csvtool.to_msg())

    # Optional: filter columns before writing, e.g. on ["record_id", "Product",
    # "Unit"] or ["record_id", "SIN", "DOB"] (disabled by default):
    # filtercoltool = FilterColTool(
    #     "filtercoltool", columns=["record_id", "Product", "Unit"]
    # )
    # config.tools[filtercoltool.name].CopyFrom(filtercoltool.to_msg())

    writer = BufferOutputWriter("bufferwriter",
                                BUFFER_MAX_SIZE=max_buffer_size,
                                write_csv=write_csv)
    config.tools[writer.name].CopyFrom(writer.to_msg())

    tdigesttool = TDigestTool("tdigesttool")
    config.tools[tdigesttool.name].CopyFrom(tdigesttool.to_msg())

    sampler = config.sampler
    sampler.ndatums = sample_ndatums
    sampler.nchunks = sample_nchunks

    return config
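A minimal usage sketch; table_id here is a hypothetical identifier, and the serialization mirrors the tests above:

# Build and serialize a configuration (table_id is a hypothetical UUID).
table_id = str(uuid.uuid4())
config = example_configuration(table_id, seed=42)
bufconfig = pa.py_buffer(config.SerializeToString())
print(config.name, bufconfig.size)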