Exemplo n.º 1
0
    def join(self, other, func, output=None, options: dict = None):
        """Run a ``RollPair.JOIN`` job over this store and ``other``.

        Args:
            other: Passed to ``__repartition_with`` to obtain the job inputs.
            func: Callable that is cloudpickled into the job's single functor.
            output: Passed to ``_maybe_set_output`` to obtain the job outputs.
            options: Optional per-call options; they are layered on top of
                the store's own ``_options`` (per-call values win).

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results, sharing this instance's context.
        """
        call_options = {} if options is None else options

        job_inputs = self.__repartition_with(other)
        job_outputs = self._maybe_set_output(output)

        join_functor = ErFunctor(name=RollPair.JOIN,
                                 serdes=SerdesTypes.CLOUD_PICKLE,
                                 body=cloudpickle.dumps(func))

        # Store-level options go in first so per-call options override them.
        merged_options = {}
        for layer in (self.__store._options, call_options):
            merged_options.update(layer)

        join_job = ErJob(id=generate_job_id(self.__session_id, RollPair.JOIN),
                         name=RollPair.JOIN,
                         inputs=job_inputs,
                         outputs=job_outputs,
                         functors=[join_functor],
                         options=merged_options)

        joined_store = self.__get_output_from_result(
            self._run_job(job=join_job))
        return RollPair(joined_store, self.ctx)
Exemplo n.º 2
0
    def sample(self, fraction, seed=None, output=None, options: dict = None):
        """Submit a ``RollPair.SAMPLE`` job for this store to the roll endpoint.

        Args:
            fraction: Cloudpickled into the first functor of the job.
            seed: Cloudpickled into the second functor of the job.
            output: If truthy, used as the job's only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` over the first output of the returned job.
        """
        # Both functors carry the name RollPair.REDUCE, exactly as before.
        sampling_functors = [
            ErFunctor(name=RollPair.REDUCE,
                      serdes=SerdesTypes.CLOUD_PICKLE,
                      body=cloudpickle.dumps(argument))
            for argument in (fraction, seed)
        ]
        declared_outputs = [output] if output else []

        sample_job = ErJob(id=generate_job_id(self.__session_id,
                                              RollPair.SAMPLE),
                           name=RollPair.SAMPLE,
                           inputs=[self.__store],
                           outputs=declared_outputs,
                           functors=sampling_functors)

        finished_job = self.__command_client.simple_sync_send(
            input=sample_job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        return RollPair(finished_job._outputs[0], self.ctx)
Exemplo n.º 3
0
    def map_values(self, func, output=None, options: dict = None):
        """Submit a ``RollPair.MAP_VALUES`` job for this store to the roll endpoint.

        Args:
            func: Callable that is cloudpickled into the job's single functor.
            output: If truthy, used as the job's only declared output.
            options: Optional per-call options, applied on top of the store's
                own ``_options``.

        Returns:
            A new ``RollPair`` over the first output of the returned job.
        """
        per_call_options = options if options is not None else {}

        value_functor = ErFunctor(name=RollPair.MAP_VALUES,
                                  serdes=SerdesTypes.CLOUD_PICKLE,
                                  body=cloudpickle.dumps(func))
        declared_outputs = [output] if output else []

        # TODO: the original author flagged option handling here as an open
        # issue; the merge below is unchanged (store options, then per-call).
        effective_options = dict(self.__store._options)
        effective_options.update(per_call_options)

        map_values_job = ErJob(id=generate_job_id(self.__session_id,
                                                  RollPair.MAP_VALUES),
                               name=RollPair.MAP_VALUES,
                               inputs=[self.__store],
                               outputs=declared_outputs,
                               functors=[value_functor],
                               options=effective_options)

        finished_job = self.__command_client.simple_sync_send(
            input=map_values_job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        return RollPair(finished_job._outputs[0], self.ctx)
Exemplo n.º 4
0
    def count(self):
        """Run a ``RollPair.COUNT`` job and sum the per-partition results.

        One task per partition is sent to that partition's processor
        endpoint; the value of the first pair returned by each task is
        deserialized with ``functor_serdes`` and added to the total.

        Returns:
            The sum of all deserialized per-partition values (``0`` when no
            task results come back).
        """
        partition_total = self.__store._store_locator._total_partitions
        count_job_id = generate_job_id(self.__session_id, tag=RollPair.COUNT)
        count_job = ErJob(id=count_job_id,
                          name=RollPair.COUNT,
                          inputs=[self.ctx.populate_processor(self.__store)])

        def build_call(index):
            # Pair the partition's task with the endpoint that must run it.
            partition = count_job._inputs[0]._partitions[index]
            partition_task = ErTask(id=generate_task_id(count_job_id, index),
                                    name=count_job._name,
                                    inputs=[partition],
                                    job=count_job)
            return [partition_task], partition._processor._command_endpoint

        pending = self.__command_client.async_call(
            args=[build_call(index) for index in range(partition_total)],
            output_types=[ErPair],
            command_uri=CommandURI(
                f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))

        finished, _unfinished = wait(pending, return_when=FIRST_EXCEPTION)

        total = 0
        for completed in finished:
            first_pair = completed.result()[0]
            total += self.functor_serdes.deserialize(first_pair._value)
        return total
Exemplo n.º 5
0
    def union(self,
              other,
              func=lambda v1, v2: v1,
              output=None,
              options: dict = None):
        """Submit a ``RollPair.UNION`` job over this store and ``other``.

        Args:
            other: Passed to ``__repartition_with`` to obtain the job inputs.
            func: Callable cloudpickled into the job's single functor; the
                default returns its first argument.
            output: If truthy, used as the job's only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` over the first output of the returned job.
        """
        union_functor = ErFunctor(name=RollPair.UNION,
                                  serdes=SerdesTypes.CLOUD_PICKLE,
                                  body=cloudpickle.dumps(func))
        declared_outputs = [output] if output else []

        union_job = ErJob(id=generate_job_id(self.__session_id, RollPair.UNION),
                          name=RollPair.UNION,
                          inputs=self.__repartition_with(other),
                          outputs=declared_outputs,
                          functors=[union_functor])

        finished_job = self.__command_client.simple_sync_send(
            input=union_job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        return RollPair(finished_job._outputs[0], self.ctx)
Exemplo n.º 6
0
    def destroy(self):
        """Destroy this store through the roll endpoint and the cluster manager.

        The store is first looked up via the cluster manager client; if the
        returned store has no partitions it is treated as already destroyed.
        Otherwise a ``RollPair.DESTROY`` job is sent to the roll endpoint, the
        store is deleted through the cluster manager client, and
        ``self.destroyed`` is set to ``True``.

        Raises:
            ValueError: If the looked-up store has no partitions.
        """
        store = self.get_store()
        if len(self.ctx.get_session()._cluster_manager_client.get_store(
                store)._partitions) == 0:
            message = f"store:{store} has been destroyed before"
            L.info(message)
            raise ValueError(message)

        job = ErJob(id=generate_job_id(self.__session_id, RollPair.DESTROY),
                    name=RollPair.DESTROY,
                    inputs=[self.__store],
                    outputs=[self.__store],
                    functors=[])

        # The response is not inspected; a failure surfaces as an exception
        # raised by the call itself.
        self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        self.ctx.get_session()._cluster_manager_client.delete_store(
            self.__store)
        L.info(f'{RollPair.DESTROY}: {self.__store}')
        self.destroyed = True
Exemplo n.º 7
0
    def filter(self, func, output=None, options: dict = None):
        """Submit a ``RollPair.FILTER`` job for this store to the roll endpoint.

        Args:
            func: Callable that is cloudpickled into the job's single functor.
            output: If truthy, used as the job's only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` over the first output of the returned job.
        """
        filter_functor = ErFunctor(name=RollPair.FILTER,
                                   serdes=SerdesTypes.CLOUD_PICKLE,
                                   body=cloudpickle.dumps(func))
        declared_outputs = [output] if output else []

        filter_job = ErJob(id=generate_job_id(self.__session_id,
                                              RollPair.FILTER),
                           name=RollPair.FILTER,
                           inputs=[self.__store],
                           outputs=declared_outputs,
                           functors=[filter_functor])

        finished_job = self.__command_client.simple_sync_send(
            input=filter_job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        filtered_store = finished_job._outputs[0]
        # The resulting store is logged at info level before being wrapped.
        L.info(filtered_store)
        return RollPair(filtered_store, self.ctx)
Exemplo n.º 8
0
    def map_partitions(self,
                       func,
                       reduce_op=None,
                       output=None,
                       options: dict = None):
        """Run a ``RollPair.MAP_PARTITIONS`` job over this store.

        Three functors are sent, in this order: the pickled ``func``, the
        pickled ``reduce_op`` and the pickled ``shuffle`` flag.

        Args:
            func: Callable cloudpickled into the first functor.
            reduce_op: Optional reducer cloudpickled into the second functor.
            output: Passed to ``_maybe_set_output`` to obtain the job outputs.
            options: Optional dict; only the ``'shuffle'`` key is read here
                (defaults to ``True``).

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.

        Raises:
            ValueError: If ``shuffle`` is falsy while ``reduce_op`` is truthy.
        """
        if options is None:
            options = {}

        # Output resolution happens before the shuffle validation, as before.
        job_outputs = self._maybe_set_output(output)

        shuffle = options.get('shuffle', True)
        if not shuffle and reduce_op:
            raise ValueError("shuffle cannot be False when reduce is needed!")

        pickled_functors = [
            ErFunctor(name=RollPair.MAP_PARTITIONS,
                      serdes=SerdesTypes.CLOUD_PICKLE,
                      body=cloudpickle.dumps(payload))
            for payload in (func, reduce_op, shuffle)
        ]

        partitions_job = ErJob(id=generate_job_id(self.__session_id,
                                                  RollPair.MAP_PARTITIONS),
                               name=RollPair.MAP_PARTITIONS,
                               inputs=[self.__store],
                               outputs=job_outputs,
                               functors=pickled_functors)

        result_store = self.__get_output_from_result(
            self._run_job(job=partitions_job))
        return RollPair(result_store, self.ctx)
Exemplo n.º 9
0
    def map_partitions_with_index(self,
                                  func,
                                  output=None,
                                  options: dict = None):
        """Run a ``RollPair.MAP_PARTITIONS_WITH_INDEX`` job over this store.

        Two functors are sent, in this order: the pickled ``func`` and the
        pickled ``shuffle`` flag.

        Args:
            func: Callable cloudpickled into the first functor.
            output: Passed to ``_maybe_set_output`` to obtain the job outputs.
            options: Optional dict; only the ``'shuffle'`` key is read here
                (defaults to ``True``).

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        if options is None:
            options = {}

        job_outputs = self._maybe_set_output(output)
        shuffle = options.get('shuffle', True)

        pickled_functors = [
            ErFunctor(name=RollPair.MAP_PARTITIONS_WITH_INDEX,
                      serdes=SerdesTypes.CLOUD_PICKLE,
                      body=cloudpickle.dumps(payload))
            for payload in (func, shuffle)
        ]

        indexed_job = ErJob(
            id=generate_job_id(self.__session_id,
                               RollPair.MAP_PARTITIONS_WITH_INDEX),
            name=RollPair.MAP_PARTITIONS_WITH_INDEX,
            inputs=[self.__store],
            outputs=job_outputs,
            functors=pickled_functors)

        result_store = self.__get_output_from_result(
            self._run_job(job=indexed_job))
        return RollPair(result_store, self.ctx)
Exemplo n.º 10
0
    def subtract_by_key(self, other, output=None, options: dict = None):
        """Submit a ``RollPair.SUBTRACT_BY_KEY`` job to the roll endpoint.

        The job's single functor carries no body, only a name and serdes type.

        Args:
            other: Passed to ``__repartition_with`` to obtain the job inputs.
            output: If truthy, used as the job's only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` over the first output of the returned job.
        """
        subtract_functor = ErFunctor(name=RollPair.SUBTRACT_BY_KEY,
                                     serdes=SerdesTypes.CLOUD_PICKLE)
        declared_outputs = [output] if output else []

        subtract_job = ErJob(id=generate_job_id(self.__session_id,
                                                RollPair.SUBTRACT_BY_KEY),
                             name=RollPair.SUBTRACT_BY_KEY,
                             inputs=self.__repartition_with(other),
                             outputs=declared_outputs,
                             functors=[subtract_functor])

        finished_job = self.__command_client.simple_sync_send(
            input=subtract_job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)

        return RollPair(finished_job._outputs[0], self.ctx)
Exemplo n.º 11
0
        def send_command():
            """Run a ``RollPair.PUT_ALL`` job whose input and output are this store.

            Uses ``job_id`` from the enclosing scope and returns whatever
            ``__get_output_from_result`` extracts from the task results.
            """
            put_all_job = ErJob(id=job_id,
                                name=RollPair.PUT_ALL,
                                inputs=[self.__store],
                                outputs=[self.__store],
                                functors=[])
            return self.__get_output_from_result(self._run_job(put_all_job))
Exemplo n.º 12
0
        def send_command():
            """Run a ``RollPair.GET_ALL`` job whose input and output are this store.

            ``job_id`` and ``er_pair`` come from the enclosing scope; the pair
            is cloudpickled into the job's single functor. Returns whatever
            ``__get_output_from_result`` extracts from the task results.
            """
            get_all_functor = ErFunctor(name=RollPair.GET_ALL,
                                        body=cloudpickle.dumps(er_pair))
            get_all_job = ErJob(id=job_id,
                                name=RollPair.GET_ALL,
                                inputs=[self.__store],
                                outputs=[self.__store],
                                functors=[get_all_functor])

            return self.__get_output_from_result(
                self._run_job(job=get_all_job))
Exemplo n.º 13
0
    def destroy(self, options: dict = None):
        """Send a single ``RollPair.DESTROY`` task for this adapter's partition.

        The task id is built from ``_replicate_job_id`` and ``_partition_id``;
        the task takes ``_er_partition`` as its only input and declares no
        outputs.

        Args:
            options: Accepted for signature compatibility; not used here.

        Returns:
            The result of ``_cm_client.sync_send`` for the task.
        """
        destroy_job = ErJob(id=self._replicate_job_id, name=RollPair.DESTROY)
        destroy_task = ErTask(
            id=f"{self._replicate_job_id}-partition-{self._partition_id}",
            name=RollPair.DESTROY,
            inputs=[self._er_partition],
            outputs=[],
            job=destroy_job)

        return self._cm_client.sync_send(inputs=[destroy_task],
                                         output_types=[ErTask],
                                         endpoint=self.remote_cmd_endpoint,
                                         command_uri=RollPair.RUN_TASK_URI)
Exemplo n.º 14
0
    def count(self):
        """Run a ``RollPair.COUNT`` job via ``_run_job`` and sum the results.

        The job is run with ``ErPair`` output type and without creating a
        missing output. The value of the first element of every task result
        is deserialized with ``functor_serdes`` and added to the total.

        Returns:
            The sum of the deserialized values (``0`` for no task results).
        """
        count_job = ErJob(id=generate_job_id(self.__session_id,
                                             tag=RollPair.COUNT),
                          name=RollPair.COUNT,
                          inputs=[self.__store])

        per_task_results = self._run_job(job=count_job,
                                         output_types=[ErPair],
                                         create_output_if_missing=False)

        total = 0
        for single_result in per_task_results:
            first_pair = single_result[0]
            total += self.functor_serdes.deserialize(first_pair._value)
        return total
Exemplo n.º 15
0
    def aggregate(self,
                  zero_value,
                  seq_op,
                  comb_op,
                  output=None,
                  options: dict = None):
        """Aggregate each partition remotely and combine the partials locally.

        One ``RollPair.AGGREGATE`` task per partition is sent to that
        partition's processor endpoint, carrying ``zero_value`` and ``seq_op``
        as cloudpickled functors. The value of the first pair returned by each
        task is deserialized with ``functor_serdes`` and folded with
        ``comb_op`` on the caller side.

        Partials are folded in task submission order. Folding in the
        iteration order of the ``done`` set (as was done previously) makes
        the result nondeterministic for a non-commutative ``comb_op``.

        Args:
            zero_value: Cloudpickled into the first functor.
            seq_op: Cloudpickled into the second functor.
            comb_op: Called locally as ``comb_op(accumulated, partial)``.
            output: Accepted for signature compatibility; not used here.
            options: Accepted for signature compatibility; not used here.

        Returns:
            The folded result, or ``None`` if no task results were produced.
        """
        total_partitions = self.__store._store_locator._total_partitions
        job_id = generate_job_id(self.__session_id, tag=RollPair.AGGREGATE)

        serialized_zero_value = ErFunctor(name=RollPair.AGGREGATE,
                                          serdes=SerdesTypes.CLOUD_PICKLE,
                                          body=cloudpickle.dumps(zero_value))
        serialized_seq_op = ErFunctor(name=RollPair.AGGREGATE,
                                      serdes=SerdesTypes.CLOUD_PICKLE,
                                      body=cloudpickle.dumps(seq_op))
        job = ErJob(id=job_id,
                    name=RollPair.AGGREGATE,
                    inputs=[self.ctx.populate_processor(self.__store)],
                    functors=[serialized_zero_value, serialized_seq_op])
        args = []
        for i in range(total_partitions):
            partition_input = job._inputs[0]._partitions[i]
            task = ErTask(id=generate_task_id(job_id, i),
                          name=job._name,
                          inputs=[partition_input],
                          job=job)
            args.append(([task], partition_input._processor._command_endpoint))

        # Materialize so the futures can be both waited on and re-iterated
        # in their original order.
        futures = list(self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(
                f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}')))

        done = wait(futures, return_when=FIRST_EXCEPTION).done

        result = None
        first = True
        for future in futures:
            if future not in done:
                # Only possible when another task failed; that failed future
                # is in `done` and its result() call below re-raises.
                continue
            pair = future.result()[0]
            seq_op_result = self.functor_serdes.deserialize(pair._value)
            if first:
                result = seq_op_result
                first = False
            else:
                result = comb_op(result, seq_op_result)

        return result
Exemplo n.º 16
0
        def send_command():
            """Send a ``RollPair.PUT_ALL`` job for this store to the roll endpoint.

            Uses ``job_id`` from the enclosing scope, serializes the command
            as protobuf, and returns the job returned by the endpoint.
            """
            put_all_job = ErJob(id=job_id,
                                name=RollPair.PUT_ALL,
                                inputs=[self.__store],
                                outputs=[self.__store],
                                functors=[])

            return self.__command_client.simple_sync_send(
                input=put_all_job,
                output_type=ErJob,
                endpoint=self.ctx.get_roll()._command_endpoint,
                command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
                serdes_type=SerdesTypes.PROTOBUF)
Exemplo n.º 17
0
    def map(self, func, output=None, options: dict = None):
        """Run a ``RollPair.MAP`` job over this store.

        Args:
            func: Callable that is cloudpickled into the job's single functor.
            output: Optional output store; when omitted the job declares no
                outputs.
            options: Optional per-call options forwarded to the job.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        if options is None:
            options = {}
        functor = ErFunctor(name=RollPair.MAP,
                            serdes=SerdesTypes.CLOUD_PICKLE,
                            body=cloudpickle.dumps(func))

        # Declare an output only when the caller supplied one. Wrapping a
        # missing output as `[output]` would put a None entry in the job's
        # outputs, unlike the other transformation methods in this file.
        outputs = []
        if output:
            outputs.append(output)

        job = ErJob(id=generate_job_id(self.__session_id, RollPair.MAP),
                    name=RollPair.MAP,
                    inputs=[self.__store],
                    outputs=outputs,
                    functors=[functor],
                    options=options)

        task_results = self._run_job(job=job)
        er_store = self.__get_output_from_result(task_results)

        return RollPair(er_store, self.ctx)
Exemplo n.º 18
0
    def with_stores(self, func, others=None, options: dict = None):
        """Dispatch one "withStores" task per partition across several stores.

        Task ``i`` receives partition ``i`` of this store and of every store
        in ``others`` as inputs, and is sent to the processor endpoint of
        this store's partition ``i``. ``func`` is cloudpickled into the job's
        single functor.

        Args:
            func: Callable shipped with the job.
            others: Optional list of additional RollPairs; defaults to none.
            options: Optional per-call options forwarded to the job.

        Returns:
            A list of ``(key, value)`` tuples, one per task, deserialized
            with ``functor_serdes`` from the first pair each task returns.

        Raises:
            ValueError: If a store in ``others`` reports a partition count
                different from this store's.
        """
        options = {} if options is None else options
        others = [] if others is None else others
        tag = "withStores"

        expected_partitions = self.get_partitions()
        for candidate in others:
            if candidate.get_partitions() == expected_partitions:
                continue
            raise ValueError(
                f"diff partitions: expected:{expected_partitions}, actual:{candidate.get_partitions()}"
            )

        stores_job_id = generate_job_id(self.__session_id, tag=tag)
        populated_inputs = [
            self.ctx.populate_processor(roll_pair.get_store())
            for roll_pair in [self] + others
        ]
        stores_functor = ErFunctor(name=tag,
                                   serdes=SerdesTypes.CLOUD_PICKLE,
                                   body=cloudpickle.dumps(func))
        stores_job = ErJob(id=stores_job_id,
                           name=tag,
                           inputs=populated_inputs,
                           functors=[stores_functor],
                           options=options)

        call_args = []
        for index in range(expected_partitions):
            # The task runs where this store's own partition lives.
            own_partition = stores_job._inputs[0]._partitions[index]
            partition_task = ErTask(
                id=generate_task_id(stores_job_id, index),
                name=stores_job._name,
                inputs=[each._partitions[index] for each in stores_job._inputs],
                job=stores_job)
            call_args.append(
                ([partition_task], own_partition._processor._command_endpoint))

        futures = self.__command_client.async_call(
            args=call_args,
            output_types=[ErPair],
            command_uri=CommandURI(
                f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))

        returned_pairs = (future.result()[0] for future in futures)
        return [(self.functor_serdes.deserialize(returned._key),
                 self.functor_serdes.deserialize(returned._value))
                for returned in returned_pairs]
Exemplo n.º 19
0
    def delete(self, k, options: dict = None):
        """Run a ``RollPair.DELETE`` job for key ``k`` via ``_run_job``.

        ``k`` is serialized with the serdes named by the store locator,
        wrapped in an ``ErPair`` with no value, and cloudpickled into the
        job's single functor. The job declares no outputs and is run without
        creating a missing output.

        Args:
            k: Key to delete.
            options: Accepted for signature compatibility; not used here.
        """
        store_serdes = create_serdes(self.__store._store_locator._serdes)
        serialized_key = store_serdes.serialize(k)
        target_pair = ErPair(key=serialized_key, value=None)

        # Partition and egg are still resolved as in the original code,
        # although their results are not used for the job below.
        target_partition = self.partitioner(serialized_key)
        self.ctx.route_to_egg(self.__store._partitions[target_partition])

        delete_job_id = generate_job_id(self.__session_id, RollPair.DELETE)
        delete_functor = ErFunctor(name=RollPair.DELETE,
                                   body=cloudpickle.dumps(target_pair))
        delete_job = ErJob(id=delete_job_id,
                           name=RollPair.DELETE,
                           inputs=[self.__store],
                           outputs=[],
                           functors=[delete_functor])

        self._run_job(job=delete_job, create_output_if_missing=False)
Exemplo n.º 20
0
    def write(self):
        """Flush the buffered pairs in ``manual_merger`` to the remote partition.

        Does nothing (beyond logging) when ``manual_merger`` is empty.
        Otherwise the buffered items are sorted by key and converted to
        binary batches, a ``RollPair.PUT_BATCH`` task is sent to the remote
        command endpoint on a separate thread, and the batches are sent to
        the remote transfer endpoint under the same task id as tag. The
        method waits for both the transfer and the command thread, then
        clears ``manual_merger``.
        """
        L.info("RemoteRollPairWriteBatch write calling")
        if len(self.manual_merger) == 0:
            L.info(f"self.manual_merger={self.manual_merger}")
            return

        self.has_write_op = True
        key_ordered_items = sorted(self.manual_merger.items(),
                                   key=lambda item: item[0])
        batches = TransferPair.pair_to_bin_batch(key_ordered_items)

        adapter = self.adapter
        task_id = f"{adapter._replicate_job_id}-partition-{adapter._partition_id}"
        L.info(f"task_id={task_id}")

        put_batch_job = ErJob(id=self.adapter._replicate_job_id,
                              name=RollPair.PUT_BATCH)
        put_batch_tasks = [
            ErTask(id=task_id,
                   name=RollPair.PUT_BATCH,
                   inputs=[self.adapter._er_partition],
                   outputs=[self.adapter._er_partition],
                   job=put_batch_job)
        ]

        def dispatch_command(tasks, remote_cmd_endpoint):
            # Runs on the worker thread with its own command client.
            return CommandClient().sync_send(
                inputs=tasks,
                output_types=[ErTask],
                endpoint=remote_cmd_endpoint,
                command_uri=CommandURI('v1/egg-pair/runTask'))

        L.info("start to send cmd")
        # The command thread is started before the data transfer begins.
        command_thread = Thread(
            target=dispatch_command,
            name=task_id,
            args=(put_batch_tasks, self.adapter.remote_cmd_endpoint))
        command_thread.start()

        transfer_future = TransferClient().send(
            batches,
            endpoint=self.adapter.remote_transfer_endpoint,
            tag=task_id)

        transfer_future.result()
        command_thread.join()

        self.manual_merger.clear()
        L.info("RemoteRollPairWriteBatch write called")
Exemplo n.º 21
0
    def glom(self, output=None, options: dict = None):
        """Run a ``RollPair.GLOM`` job over this store.

        The job's single functor carries no body, only a name and serdes type.

        Args:
            output: Passed to ``_maybe_set_output`` to obtain the job outputs.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        job_outputs = self._maybe_set_output(output)
        glom_job = ErJob(id=generate_job_id(self.__session_id, RollPair.GLOM),
                         name=RollPair.GLOM,
                         inputs=[self.__store],
                         outputs=job_outputs,
                         functors=[ErFunctor(name=RollPair.GLOM,
                                             serdes=SerdesTypes.CLOUD_PICKLE)])

        glommed_store = self.__get_output_from_result(
            self._run_job(job=glom_job))
        return RollPair(glommed_store, self.ctx)
Exemplo n.º 22
0
    def get(self, k, options: dict = None):
        """Fetch the value stored under ``k`` from the partition that owns it.

        ``k`` is serialized with the serdes named by the store locator, the
        partitioner picks the partition, and a single ``RollPair.GET`` task is
        sent to the egg routed for that partition.

        Args:
            k: Key to look up.
            options: Accepted for signature compatibility; not used here.

        Returns:
            The value deserialized with ``value_serdes``, or ``None`` when
            the returned pair's value is ``b''``.
        """
        L.debug(f"get k: {k}")
        k = create_serdes(self.__store._store_locator._serdes).serialize(k)
        lookup_pair = ErPair(key=k, value=None)

        partition_id = self.partitioner(k)
        egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
        L.info(
            f"partitions count: {self.__store._store_locator._total_partitions}, target partition: {partition_id}, endpoint: {egg._command_endpoint}"
        )

        # Input and output are two distinct descriptors of the same partition.
        task_inputs = [ErPartition(id=partition_id,
                                   store_locator=self.__store._store_locator)]
        task_outputs = [ErPartition(id=partition_id,
                                    store_locator=self.__store._store_locator)]

        get_job_id = generate_job_id(self.__session_id, RollPair.GET)
        get_job = ErJob(id=get_job_id,
                        name=RollPair.GET,
                        inputs=[self.__store],
                        outputs=[],
                        functors=[ErFunctor(body=cloudpickle.dumps(lookup_pair))])

        get_task = ErTask(id=generate_task_id(get_job_id, partition_id),
                          name=RollPair.GET,
                          inputs=task_inputs,
                          outputs=task_outputs,
                          job=get_job)
        returned_pair = self.__command_client.simple_sync_send(
            input=get_task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)

        if returned_pair._value != b'':
            return self.value_serdes.deserialize(returned_pair._value)
        return None
0
    def destroy(self, options: dict = None):
        """Destroy this store via ``_run_job`` and the cluster manager.

        The store is first looked up via the cluster manager client; if the
        returned store has no partitions it is treated as already destroyed.
        Otherwise a ``RollPair.DESTROY`` job is run (without creating a
        missing output), the store is deleted through the cluster manager
        client, and ``self.destroyed`` is set to ``True``.

        Args:
            options: Optional per-call options forwarded to the job.

        Raises:
            ValueError: If the looked-up store has no partitions.
        """
        store = self.get_store()
        if len(self.ctx.get_session()._cluster_manager_client.get_store(store)._partitions) == 0:
            message = f"store:{store} has been destroyed before"
            # L.error, not L.exception: no exception is being handled at this
            # point, so L.exception would append a meaningless
            # "NoneType: None" traceback to the log record.
            L.error(message)
            raise ValueError(message)

        if options is None:
            options = {}

        job = ErJob(id=generate_job_id(self.__session_id, RollPair.DESTROY),
                    name=RollPair.DESTROY,
                    inputs=[self.__store],
                    outputs=[self.__store],
                    functors=[],
                    options=options)

        # Task results are not inspected; failures surface as exceptions.
        self._run_job(job=job, create_output_if_missing=False)
        self.ctx.get_session()._cluster_manager_client.delete_store(self.__store)
        L.debug(f'{RollPair.DESTROY}={self.__store}')
        self.destroyed = True
Exemplo n.º 24
0
    def subtract_by_key(self, other, output=None, options: dict = None):
        """Run a ``RollPair.SUBTRACT_BY_KEY`` job over this store and ``other``.

        The job's single functor carries no body, only a name and serdes type.

        Args:
            other: Passed to ``__repartition_with`` to obtain the job inputs.
            output: Passed to ``_maybe_set_output`` to obtain the job outputs.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        # Repartitioning happens before the output is resolved, as before.
        job_inputs = self.__repartition_with(other)
        job_outputs = self._maybe_set_output(output)

        subtract_job = ErJob(
            id=generate_job_id(self.__session_id, RollPair.SUBTRACT_BY_KEY),
            name=RollPair.SUBTRACT_BY_KEY,
            inputs=job_inputs,
            outputs=job_outputs,
            functors=[ErFunctor(name=RollPair.SUBTRACT_BY_KEY,
                                serdes=SerdesTypes.CLOUD_PICKLE)])

        result_store = self.__get_output_from_result(
            self._run_job(job=subtract_job))
        return RollPair(result_store, self.ctx)
Exemplo n.º 25
0
    def collapse_partitions(self, func, output=None, options: dict = None):
        """Run a ``RollPair.COLLAPSE_PARTITIONS`` job over this store.

        Args:
            func: Callable that is cloudpickled into the job's single functor.
            output: If truthy, its partition count is checked against this
                store's via ``__check_partition`` and it becomes the job's
                only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        declared_outputs = []
        if output:
            RollPair.__check_partition(self.get_partitions(),
                                       output._store_locator._total_partitions)
            declared_outputs = [output]

        collapse_functor = ErFunctor(name=RollPair.COLLAPSE_PARTITIONS,
                                     serdes=SerdesTypes.CLOUD_PICKLE,
                                     body=cloudpickle.dumps(func))
        collapse_job = ErJob(
            id=generate_job_id(self.__session_id, RollPair.COLLAPSE_PARTITIONS),
            name=RollPair.COLLAPSE_PARTITIONS,
            inputs=[self.__store],
            outputs=declared_outputs,
            functors=[collapse_functor])

        collapsed_store = self.__get_output_from_result(
            self._run_job(job=collapse_job))
        return RollPair(collapsed_store, self.ctx)
Exemplo n.º 26
0
    def glom(self, output=None, options: dict = None):
        """Run a ``RollPair.GLOM`` job over this store.

        The job's single functor carries no body, only a name and serdes type.

        Args:
            output: If truthy, its partition count is checked against this
                store's via ``__check_partition`` and it becomes the job's
                only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        declared_outputs = []
        if output:
            RollPair.__check_partition(self.get_partitions(),
                                       output._store_locator._total_partitions)
            declared_outputs = [output]

        glom_job = ErJob(id=generate_job_id(self.__session_id, RollPair.GLOM),
                         name=RollPair.GLOM,
                         inputs=[self.__store],
                         outputs=declared_outputs,
                         functors=[ErFunctor(name=RollPair.GLOM,
                                             serdes=SerdesTypes.CLOUD_PICKLE)])

        glommed_store = self.__get_output_from_result(
            self._run_job(job=glom_job))
        return RollPair(glommed_store, self.ctx)
Exemplo n.º 27
0
    def get(self, k, options: dict = None):
        """Fetch the value stored under ``k`` from the partition that owns it.

        ``k`` is serialized with the serdes named by the store locator, the
        partitioner picks the partition, and a single ``RollPair.GET`` task is
        sent to the egg routed for that partition using ``RUN_TASK_URI``.

        Args:
            k: Key to look up.
            options: Accepted for signature compatibility; not used here.

        Returns:
            The value deserialized with ``value_serdes``, or ``None`` when
            the returned pair's value is ``b''``.
        """
        k = create_serdes(self.__store._store_locator._serdes).serialize(k)
        lookup_pair = ErPair(key=k, value=None)
        partition_id = self.partitioner(k)
        egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])

        # Input and output are two distinct descriptors of the same partition.
        task_inputs = [ErPartition(id=partition_id,
                                   store_locator=self.__store._store_locator)]
        task_outputs = [ErPartition(id=partition_id,
                                    store_locator=self.__store._store_locator)]

        get_job_id = generate_job_id(self.__session_id, RollPair.GET)
        get_functor = ErFunctor(name=RollPair.GET,
                                body=cloudpickle.dumps(lookup_pair))
        get_job = ErJob(id=get_job_id,
                        name=RollPair.GET,
                        inputs=[self.__store],
                        outputs=[self.__store],
                        functors=[get_functor])

        get_task = ErTask(id=generate_task_id(get_job_id, partition_id),
                          name=RollPair.GET,
                          inputs=task_inputs,
                          outputs=task_outputs,
                          job=get_job)
        returned_pair = self.__command_client.simple_sync_send(
            input=get_task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=self.RUN_TASK_URI,
            serdes_type=self.__command_serdes)

        if returned_pair._value != b'':
            return self.value_serdes.deserialize(returned_pair._value)
        return None
Exemplo n.º 28
0
    def sample(self, fraction, seed=None, output=None, options: dict = None):
        """Run a ``RollPair.SAMPLE`` job over this store.

        Args:
            fraction: Cloudpickled into the first functor of the job.
            seed: Cloudpickled into the second functor of the job.
            output: If truthy, its partition count is checked against this
                store's via ``__check_partition`` and it becomes the job's
                only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        declared_outputs = []
        if output:
            RollPair.__check_partition(self.get_partitions(),
                                       output._store_locator._total_partitions)
            declared_outputs = [output]

        # Both functors carry the name RollPair.REDUCE, exactly as before.
        sampling_functors = [
            ErFunctor(name=RollPair.REDUCE,
                      serdes=SerdesTypes.CLOUD_PICKLE,
                      body=cloudpickle.dumps(argument))
            for argument in (fraction, seed)
        ]

        sample_job = ErJob(id=generate_job_id(self.__session_id,
                                              RollPair.SAMPLE),
                           name=RollPair.SAMPLE,
                           inputs=[self.__store],
                           outputs=declared_outputs,
                           functors=sampling_functors)

        sampled_store = self.__get_output_from_result(
            self._run_job(job=sample_job))
        return RollPair(sampled_store, self.ctx)
Exemplo n.º 29
0
    def delete(self, k, options: dict = None):
        """Send a ``RollPair.DELETE`` task for ``k`` to the egg owning its partition.

        ``k`` is serialized with the serdes named by the store locator, the
        partitioner picks the partition, and a single task is sent to the
        egg routed for that partition. The response is not inspected.

        Args:
            k: Key to delete.
            options: Accepted for signature compatibility; not used here.
        """
        store_serdes = create_serdes(self.__store._store_locator._serdes)
        serialized_key = store_serdes.serialize(k)
        target_pair = ErPair(key=serialized_key, value=None)

        partition_id = self.partitioner(serialized_key)
        egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
        L.info(egg._command_endpoint)
        L.info(f"count: {self.__store._store_locator._total_partitions}")

        # Input and output are two distinct descriptors of the same partition.
        task_inputs = [ErPartition(id=partition_id,
                                   store_locator=self.__store._store_locator)]
        task_outputs = [ErPartition(id=partition_id,
                                    store_locator=self.__store._store_locator)]

        delete_job_id = generate_job_id(self.__session_id, RollPair.DELETE)
        delete_job = ErJob(
            id=delete_job_id,
            name=RollPair.DELETE,
            inputs=[self.__store],
            outputs=[],
            functors=[ErFunctor(body=cloudpickle.dumps(target_pair))])
        delete_task = ErTask(id=generate_task_id(delete_job_id, partition_id),
                             name=RollPair.DELETE,
                             inputs=task_inputs,
                             outputs=task_outputs,
                             job=delete_job)

        L.info("start send req")
        self.__command_client.simple_sync_send(
            input=delete_task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(
                f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)
Exemplo n.º 30
0
    def union(self, other, func=lambda v1, v2: v1, output=None, options: dict = None):
        """Run a ``RollPair.UNION`` job over this store and ``other``.

        Args:
            other: Passed to ``__repartition_with`` to obtain the job inputs.
            func: Callable cloudpickled into the job's single functor; the
                default returns its first argument.
            output: If truthy, its partition count is checked against the
                first job input's via ``__check_partition`` and it becomes
                the job's only declared output.
            options: Accepted for signature compatibility; not used here.

        Returns:
            A new ``RollPair`` built from the store extracted from the task
            results.
        """
        job_inputs = self.__repartition_with(other)

        declared_outputs = []
        if output:
            RollPair.__check_partition(
                job_inputs[0]._store_locator._total_partitions,
                output._store_locator._total_partitions)
            declared_outputs = [output]

        union_functor = ErFunctor(name=RollPair.UNION,
                                  serdes=SerdesTypes.CLOUD_PICKLE,
                                  body=cloudpickle.dumps(func))
        union_job = ErJob(id=generate_job_id(self.__session_id, RollPair.UNION),
                          name=RollPair.UNION,
                          inputs=job_inputs,
                          outputs=declared_outputs,
                          functors=[union_functor])

        united_store = self.__get_output_from_result(
            self._run_job(job=union_job))
        return RollPair(united_store, self.ctx)