Example #1
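Writes ten Arrow record batches to an in-memory buffer, registers menu, configuration, and file content in a BaseObjectStore inside a temporary directory, then reads the registered Arrow file back and checks that it holds ten record batches.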
    def test_register_object(self):
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        mymenu = CronusObject()
        mymenu.name = "menu"
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(store[id_].address)
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
Example #2
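Constructor for a job wrapper: it connects to an existing BaseObjectStore by name and UUID, lists the partitions of the input dataset, and fetches the menu and configuration messages needed to run the job.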
    def __init__(self, root, store_name, store_id, menu_id, config_id,
                 dataset_id, job_id):

        self.dataset_id = dataset_id
        self.job_id = job_id

        # Connect to the metastore and set up the datastore
        self.store = BaseObjectStore(str(root),
                                     store_name,
                                     store_uuid=store_id)

        self.parts = self.store.list_partitions(dataset_id)

        # Get the menu and config needed to run the job
        self.menu = Menu_pb()
        self.store.get(menu_id, self.menu)
        self.config = Configuration()
        self.store.get(config_id, self.config)

        self.buf = None
Example #3
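Round-trips a Configuration protobuf through the store: register the content, put the message, get it back into a fresh Configuration, and check that the name and uuid match.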
    def test_config(self):
        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            config_uuid = store.register_content(myconfig, configinfo).uuid
            store.put(config_uuid, myconfig)
            aconfig = Configuration()
            store.get(config_uuid, aconfig)
            self.assertEqual(myconfig.name, aconfig.name)
            self.assertEqual(myconfig.uuid, aconfig.uuid)
Example #4
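Simulates a production run: ten jobs each register the same Arrow buffer into three partitions, every registered object is read back and its batch count verified, and the store is saved and reloaded to confirm the content is still retrievable.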
    def test_validation(self):
        print("Simulate production")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            # Following puts the menu and config to the datastore
            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)

            # Multiple streams
            store.new_partition(dataset.uuid, "key1")
            store.new_partition(dataset.uuid, "key2")
            store.new_partition(dataset.uuid, "key3")

            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            ids_ = []
            parts = store.list_partitions(dataset.uuid)
            # reload menu and config
            newmenu = Menu_pb()
            store.get(menu_uuid, newmenu)
            newconfig = Configuration()
            store.get(config_uuid, newconfig)
            print(parts)
            for _ in range(10):
                job_id = store.new_job(dataset.uuid)

                for key in parts:
                    ids_.append(
                        store.register_content(
                            buf,
                            fileinfo,
                            dataset_id=dataset.uuid,
                            job_id=job_id,
                            partition_key=key,
                        ).uuid)
                    store.put(ids_[-1], buf)

            for id_ in ids_:
                buf = pa.py_buffer(store.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            # Save the store, reload
            store.save_store()
            newstore = BaseObjectStore(str(_path),
                                       store._name,
                                       store_uuid=store.store_uuid)
            for id_ in ids_:
                print("Get object %s", id_)
                print(type(id_))
                buf = pa.py_buffer(newstore.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)
            print(newmenu)
            print(newconfig)
            print("Simulation Test Done ===========================")
Example #5
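Builds a fake dataset from registered menu and configuration messages, creates a partition and a job, registers an Arrow buffer as file content, and lists the datasets held by the store.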
    def test_register_dataset(self):

        # Create a fake dataset
        # from a menu_id and menu msg
        # from a config_id and config msg
        # add files
        # add tables

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        # schema = batch.schema.to_pybytes()
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.num_columns = 3

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(str(_path), "test")
            store_id = store.store_uuid
            print(store.store_info.created.ToDatetime())

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            print(menu_uuid)
            print(config_uuid)
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            job_id = store.new_job(dataset.uuid)
            store.register_content(
                buf,
                fileinfo,
                dataset_id=dataset.uuid,
                partition_key="key",
                job_id=job_id,
            )

            ds = store.list(suffix="dataset")
            print(ds)
Example #6
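Writes two Arrow files to disk and registers both in a single call using a glob pattern, verifies each registered object reads back with ten record batches, then lists the partition files for each dataset.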
    def test_dir_glob(self):
        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        with tempfile.TemporaryDirectory() as dirpath:
            mystore.address = dirpath + "/test"
            _path = Path(mystore.address)
            _path.mkdir()
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            path = dirpath + "/test/dummy2.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())

            objs_ = store.register_content(
                mystore.address,
                fileinfo,
                glob="*arrow",
                dataset_id=dataset.uuid,
                partition_key="key",
            )
            for obj_ in objs_:
                print(obj_.uuid, store[obj_.uuid].address)
                buf = pa.py_buffer(store.get(obj_.uuid))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            ds = store.list(suffix="dataset")
            for d in ds:
                p = d.uuid + ".part_key"
                f = store.list(prefix=p, suffix="arrow")
                print(f)
        print("Test Done ===========================")
Example #7
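Builds a Cronus Configuration message for a simulation job: a SimuTableGen generator plus FileHandlerTool, CsvTool, BufferOutputWriter, and TDigestTool, with sampler settings for the preprocessing step.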
def example_configuration(table_id, seed=42):
    # First define a data generator using SimuTable

    max_malloc = 2147483648  # Maximum memory allowed in Arrow memory pool
    max_buffer_size = 2147483648  # Maximum size serialized ipc message
    write_csv = True  # Output csv files for each arrow output file
    sample_ndatums = 1  # Preprocess job to sample files from dataset
    sample_nchunks = 10  # Preprocess job to sample chunks from a file
    linesep = "\r\n"  # Line delimiter to scan for on csv input
    delimiter = ","  # Field delimiter
    blocksize = 2**16  # Size of chunked data in-memory
    header = ""  # Predefined header
    footer = ""  # Predefined footer
    header_offset = 0  # N bytes to scan past header
    footer_size = 0  # N bytes size of footer
    schema = []  # Predefined list of field names on input
    encoding = "utf8"  # encoding
    gen_nbatches = 5  # Number of batches to generate
    gen_nrows = 1000  # Number of rows per batch

    config = Configuration()  # Cronus Configuration message
    config.uuid = str(uuid.uuid4())
    config.name = f"{config.uuid}.config.pb"
    config.max_malloc_size_bytes = max_malloc

    generator = SimuTableGen(
        "generator",
        nbatches=gen_nbatches,
        num_rows=gen_nrows,
        file_type=1,  # Output type cronus.proto filetype
        table_id=table_id,
        seed=seed,
    )

    # Set the generator configuration
    config.input.generator.config.CopyFrom(generator.to_msg())

    filehandler = FileHandlerTool(
        "filehandler",
        filetype="csv",  # TBD use filetype metadata
        blocksize=blocksize,
        delimiter=delimiter,
        linesep=linesep,
        header=header,
        footer=footer,
        header_offset=header_offset,
        footer_size=footer_size,
        schema=schema,
        encoding=encoding,
        seed=seed,
    )
    # Add to the tools
    config.tools[filehandler.name].CopyFrom(filehandler.to_msg())

    csvtool = CsvTool("csvtool", block_size=(2 * blocksize))
    config.tools[csvtool.name].CopyFrom(csvtool.to_msg())

    # filtercoltool = FilterColTool(
    #     "filtercoltool", columns=["record_id", "Product", "Unit"]
    # )
    # filtercoltool = FilterColTool("filtercoltool",
    #                               columns=["record_id", "SIN", "DOB"])
    # config.tools[filtercoltool.name].CopyFrom(filtercoltool.to_msg())

    writer = BufferOutputWriter("bufferwriter",
                                BUFFER_MAX_SIZE=max_buffer_size,
                                write_csv=write_csv)
    config.tools[writer.name].CopyFrom(writer.to_msg())

    tdigesttool = TDigestTool("tdigesttool")
    config.tools[tdigesttool.name].CopyFrom(tdigesttool.to_msg())

    sampler = config.sampler
    sampler.ndatums = sample_ndatums
    sampler.nchunks = sample_nchunks

    return config
Example #8
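End-to-end example job: build a menu, read the table schema with XlsTool, register the generator and output datasets in an object store, configure the tools and algorithms, run two jobs through Dask, fold the resulting dataset metadata back into the store, and visualize the output with PlotlyTool.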
def example_job(location):
    # Artemis Job requirements
    # BaseObjectStore - name, path and id
    # Menu
    # Configuration
    # Input Dataset
    # Dataset partitions
    # Table schemas for each dataset partition

    # Build the Menu
    mb = ExampleMenu()
    msgmenu = mb.build()
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()

    # Read schema and generator names
    xlstool = XlsTool("xlstool", location=location)
    ds_schema = xlstool.execute(location)
    # The example job only has one table
    table = ds_schema.tables[0]

    # Build the Configuration

    # Build the partition Table schemas

    # Register all inputs in the Cronus object store

    # Build the job
    # To use the local directory:
    # dirpath = os.getcwd()
    with tempfile.TemporaryDirectory() as dirpath:
        # All jobs now require an object store
        # All outputs are persisted in the object store path
        # See github.com/mbr/simplekv
        # Factory class for simplekv provided by blueyonder/storefact
        store = BaseObjectStore(dirpath, "artemis")

        # Requires registering a parent dataset
        # Generator data is written to disk with the parent dataset uuid
        # Register the 'generator' partition -- required

        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # The table schema, which defines the model for the generator,
        # is persisted first to the object store as a protobuf file
        tinfo = TableObjectInfo()
        table_id = store.register_content(
            table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        ).uuid

        store.save_store()

        # Now configure all tools and algorithms
        # Includes IO tools
        config = example_configuration(table_id)

        # Algorithms need to be added from the menu to the configuration
        for key in mb._algos:
            msg = config.algos.add()
            msg.CopyFrom(mb._algos[key].to_msg())

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        # Store the menu and configuration protobufs
        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_uuid = store.register_content(config, configinfo).uuid

        # Register an output dataset
        dataset = store.register_dataset(menu_id=menu_uuid,
                                         config_id=config_uuid)
        # Copy metadata from xlstool
        store[dataset.uuid].dataset.aux.CopyFrom(ds_schema.dataset.aux)
        store.save_store()

        # Now define the actual Artemis job
        # Again the input is a protobuf
        # All other information read in from the
        # object store
        # inputs = store.list(prefix=g_dataset.uuid)

        ds_results = []
        for _ in range(2):
            job_id = store.new_job(dataset.uuid)
            config = Configuration()
            store.get(config_uuid, config)
            for p in config.input.generator.config.properties.property:
                if p.name == "glob":
                    p.value = dirpath.split(".")[-2] + "csv"
            store._put_message(config_uuid, config)
            store.get(config_uuid, config)

            ds_results.append(
                runjob(
                    dirpath,
                    store.store_name,
                    store.store_uuid,
                    menu_uuid,
                    config_uuid,
                    dataset.uuid,
                    g_dataset.uuid,
                    str(job_id),
                ))

        results = dask.compute(*ds_results, scheduler="single-threaded")
        store.new_partition(dataset.uuid, "seqA")
        store.new_partition(dataset.uuid, "seqB")
        store.save_store()
        for buf in results:
            ds = DatasetObjectInfo()
            ds.ParseFromString(buf)
            store.update_dataset(dataset.uuid, buf)

        store.save_store()

        dqtool = PlotlyTool(store=store, uuid=dataset.uuid)
        dqtool.visualize(output="{}/test".format(os.getcwd()),
                         show=True,
                         check=False)
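Example #9
Distributed-style test: builds a menu and a CSV job configuration with JobConfigFactory, generates CSV-like input files with GenCsvLikeArrow under a generator dataset, runs one job per input file through Dask, then updates the output dataset with the results and saves the store.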
    def test_distributed(self):
        with tempfile.TemporaryDirectory() as dirpath:
            mb = MenuFactory("csvgen")
            msgmenu = mb.build()
            menuinfo = MenuObjectInfo()
            menuinfo.created.GetCurrentTime()

            store = BaseObjectStore(dirpath, "artemis")

            config = JobConfigFactory(
                "csvio",
                msgmenu,
                jobname="arrowproto",
                generator_type="file",
                filehandler_type="csv",
                nbatches=1,
                num_rows=10000,
                max_file_size=1073741824,
                write_csv=True,
                input_glob=".csv",
            )

            config.configure()
            config.add_algos(mb.algos)
            configinfo = ConfigObjectInfo()
            configinfo.created.GetCurrentTime()

            menu_uuid = store.register_content(msgmenu, menuinfo).uuid
            config_obj = store.register_content(config._msg, configinfo)
            config_uuid = config_obj.uuid

            g_dataset = store.register_dataset()
            store.new_partition(g_dataset.uuid, "generator")
            job_id = store.new_job(g_dataset.uuid)

            # define the schema for the data
            g_table = Table()
            g_table.name = "generator"
            g_table.uuid = str(uuid.uuid4())
            g_table.info.schema.name = "csv"
            g_table.info.schema.uuid = str(uuid.uuid4())

            fields = list(
                itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
            for f in fields:
                field = g_table.info.schema.info.fields.add()
                field.name = f

            tinfo = TableObjectInfo()
            tinfo.fields.extend(fields)
            store.register_content(
                g_table,
                tinfo,
                dataset_id=g_dataset.uuid,
                job_id=job_id,
                partition_key="generator",
            )

            generator = GenCsvLikeArrow(
                "generator",
                nbatches=10,
                num_cols=20,
                num_rows=1000,
                suffix=".csv",
                prefix="testio",
                path=dirpath,
                table_id=g_table.uuid,
            )

            generator.gate.meta.parentset_id = g_dataset.uuid
            generator.gate.meta.job_id = str(job_id)
            generator.gate.store = store
            generator.initialize()
            generator.write()

            dataset = store.register_dataset(menu_uuid, config_uuid)
            job_id = store.new_job(dataset.uuid)
            store.save_store()

            #######################################
            inputs = store.list(prefix=g_dataset.uuid, suffix="csv")

            store_name = store._name
            store_uuid = store.store_uuid
            dataset_uuid = dataset.uuid
            ds_results = []
            for datum in inputs:
                job_id = store.new_job(dataset.uuid)
                url_data = urllib.parse.urlparse(datum.address)
                dpath = urllib.parse.unquote(url_data.path)
                print(datum)
                config = Configuration()
                store.get(config_uuid, config)
                for p in config.input.generator.config.properties.property:
                    if p.name == "glob":
                        p.value = dpath.split(".")[-2] + ".csv"
                store._put_message(config_uuid, config)
                store.get(config_uuid, config)
                print(config)
                ds_results.append(
                    runjob(
                        dirpath,
                        store_name,
                        store_uuid,
                        menu_uuid,
                        config_uuid,
                        dataset_uuid,
                        g_dataset.uuid,
                        job_id,
                    ))

            results = dask.compute(*ds_results, scheduler="single-threaded")
            # Workaround to fix error in dataset merging
            store.new_partition(dataset.uuid, "seqY")
            # Update the dataset
            for buf in results:
                ds = DatasetObjectInfo()
                ds.ParseFromString(buf)
                store.update_dataset(dataset.uuid, buf)
            # Save the store, reload
            store.save_store()

            print(store[dataset.uuid].dataset)