def put(self, k, v, options: dict = None):
    if options is None:
        options = {}
    # serialize key and value with the store's configured serdes
    k, v = create_serdes(self.__store._store_locator._serdes).serialize(k), \
           create_serdes(self.__store._store_locator._serdes).serialize(v)
    er_pair = ErPair(key=k, value=v)
    outputs = []
    # route the serialized key to its owning partition and that partition's egg
    partition_id = self.partitioner(k)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    inputs = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    output = [ErPartition(id=0, store_locator=self.__store._store_locator)]
    job_id = generate_job_id(self.__session_id, RollPair.PUT)
    job = ErJob(id=job_id,
                name=RollPair.PUT,
                inputs=[self.__store],
                outputs=outputs,
                functors=[ErFunctor(name=RollPair.PUT, body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.PUT,
                  inputs=inputs,
                  outputs=output,
                  job=job)
    # run the PUT task synchronously on the target egg
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)
    value = job_resp._value
    return value
def count(self):
    total_partitions = self.__store._store_locator._total_partitions
    job_id = generate_job_id(self.__session_id, tag=RollPair.COUNT)
    job = ErJob(id=job_id,
                name=RollPair.COUNT,
                inputs=[self.ctx.populate_processor(self.__store)])
    # one COUNT task per partition, dispatched to that partition's processor
    args = list()
    for i in range(total_partitions):
        partition_input = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[partition_input],
                      job=job)
        args.append(([task], partition_input._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    done = wait(futures, return_when=FIRST_EXCEPTION).done
    # sum the per-partition counts returned by the eggs
    result = 0
    for future in done:
        pair = future.result()[0]
        result += self.functor_serdes.deserialize(pair._value)
    return result
def get(self, k, options: dict = None):
    if options is None:
        options = {}
    k = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=k, value=None)
    partition_id = self.partitioner(k)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    inputs = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    outputs = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    job_id = generate_job_id(self.__session_id, RollPair.GET)
    job = ErJob(id=job_id,
                name=RollPair.GET,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[ErFunctor(name=RollPair.GET, body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.GET,
                  inputs=inputs,
                  outputs=outputs,
                  job=job)
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=self.RUN_TASK_URI,
            serdes_type=self.__command_serdes)
    # an empty value means the key is absent
    return self.value_serdes.deserialize(job_resp._value) if job_resp._value != b'' else None
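# --- Illustrative usage sketch (added; not from the original source) ---
# Shows how the put()/get() methods above are typically driven from a client
# context. The context/handle construction is an assumption about the
# surrounding eggroll API; `ctx` and the namespace/table names are hypothetical.
def _demo_put_get(ctx):
    # `ctx` is assumed to be an already-initialised RollPairContext
    rp = ctx.load(namespace='demo_ns', name='demo_table')
    rp.put('hello', 'world')             # routed via self.partitioner(serialized key)
    value = rp.get('hello')              # returns None when the key is absent
    assert value == 'world'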
def destroy(self, options: dict = None):
    tasks = [
        ErTask(id=f"{self._replicate_job_id}-partition-{self._partition_id}",
               name=RollPair.DESTROY,
               inputs=[self._er_partition],
               outputs=[],
               job=ErJob(id=self._replicate_job_id, name=RollPair.DESTROY))
    ]
    return self._cm_client.sync_send(inputs=tasks,
                                     output_types=[ErTask],
                                     endpoint=self.remote_cmd_endpoint,
                                     command_uri=RollPair.RUN_TASK_URI)
def aggregate(self, zero_value, seq_op, comb_op, output=None, options: dict = None):
    total_partitions = self.__store._store_locator._total_partitions
    job_id = generate_job_id(self.__session_id, tag=RollPair.AGGREGATE)
    # ship the zero value and the per-partition seq_op to the eggs
    serialized_zero_value = ErFunctor(name=RollPair.AGGREGATE,
                                      serdes=SerdesTypes.CLOUD_PICKLE,
                                      body=cloudpickle.dumps(zero_value))
    serialized_seq_op = ErFunctor(name=RollPair.AGGREGATE,
                                  serdes=SerdesTypes.CLOUD_PICKLE,
                                  body=cloudpickle.dumps(seq_op))
    job = ErJob(id=job_id,
                name=RollPair.AGGREGATE,
                inputs=[self.ctx.populate_processor(self.__store)],
                functors=[serialized_zero_value, serialized_seq_op])
    args = list()
    for i in range(total_partitions):
        partition_input = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[partition_input],
                      job=job)
        args.append(([task], partition_input._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    done = wait(futures, return_when=FIRST_EXCEPTION).done
    # fold the per-partition seq_op results locally with comb_op
    result = None
    first = True
    for future in done:
        pair = future.result()[0]
        seq_op_result = self.functor_serdes.deserialize(pair._value)
        if not first:
            result = comb_op(result, seq_op_result)
        else:
            result = seq_op_result
            first = False
    return result
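# --- Illustrative usage sketch (added; not from the original source) ---
# aggregate() above sends the zero value and seq_op to every partition and
# folds the per-partition results locally with comb_op. A sum over all values
# could look like this; `rp` is assumed to be an existing RollPair handle, and
# seq_op is assumed to fold each value into the accumulator on the egg side.
def _demo_aggregate(rp):
    zero = 0
    seq_op = lambda acc, v: acc + v       # assumed to run remotely inside each partition
    comb_op = lambda a, b: a + b          # runs locally over the partition results
    return rp.aggregate(zero, seq_op, comb_op)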
def with_stores(self, func, others=None, options: dict = None):
    if options is None:
        options = {}
    tag = "withStores"
    if others is None:
        others = []
    total_partitions = self.get_partitions()
    # all participating stores must be partitioned identically
    for other in others:
        if other.get_partitions() != total_partitions:
            raise ValueError(
                    f"diff partitions: expected:{total_partitions}, actual:{other.get_partitions()}")
    job_id = generate_job_id(self.__session_id, tag=tag)
    job = ErJob(id=job_id,
                name=tag,
                inputs=[self.ctx.populate_processor(rp.get_store()) for rp in [self] + others],
                functors=[ErFunctor(name=tag,
                                    serdes=SerdesTypes.CLOUD_PICKLE,
                                    body=cloudpickle.dumps(func))],
                options=options)
    args = list()
    for i in range(total_partitions):
        partition_self = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[store._partitions[i] for store in job._inputs],
                      job=job)
        args.append(([task], partition_self._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    # gather one (key, value) result pair per partition
    result = list()
    for future in futures:
        ret_pair = future.result()[0]
        result.append((self.functor_serdes.deserialize(ret_pair._key),
                       self.functor_serdes.deserialize(ret_pair._value)))
    return result
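# --- Illustrative usage sketch (added; not from the original source) ---
# with_stores() cloudpickles `func` and runs it once per partition against the
# aligned partitions of this store and of `others`, gathering one (key, value)
# pair per partition as in the loop above. The exact arguments the egg passes
# to `func` depend on the egg-pair task handler, so the signature below is an
# assumption for illustration only; `rp` and `other_rp` are hypothetical handles.
def _demo_with_stores(rp, other_rp):
    def per_partition(task):
        # hypothetical: inspect the co-located partitions handed to this task
        return len(task._inputs)
    return rp.with_stores(per_partition, others=[other_rp])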
def write(self):
    L.info("RemoteRollPairWriteBatch write calling")
    if len(self.manual_merger) == 0:
        L.info(f"self.manual_merger={self.manual_merger}")
        return
    self.has_write_op = True
    # sort buffered pairs by key and pack them into binary transfer batches
    batches = TransferPair.pair_to_bin_batch(
            sorted(self.manual_merger.items(), key=lambda kv: kv[0]))
    task_id = f"{self.adapter._replicate_job_id}-partition-{self.adapter._partition_id}"
    L.info(f"task_id={task_id}")
    tasks = [
        ErTask(id=task_id,
               name=RollPair.PUT_BATCH,
               inputs=[self.adapter._er_partition],
               outputs=[self.adapter._er_partition],
               job=ErJob(id=self.adapter._replicate_job_id, name=RollPair.PUT_BATCH))
    ]

    def send_command(tasks, remote_cmd_endpoint):
        cmd_client = CommandClient()
        return cmd_client.sync_send(inputs=tasks,
                                    output_types=[ErTask],
                                    endpoint=remote_cmd_endpoint,
                                    command_uri=CommandURI('v1/egg-pair/runTask'))

    L.info("start to send cmd")
    # run the PUT_BATCH command and the batch transfer concurrently
    t = Thread(target=send_command,
               name=task_id,
               args=[tasks, self.adapter.remote_cmd_endpoint])
    t.start()
    transfer_client = TransferClient()
    f = transfer_client.send(batches,
                             endpoint=self.adapter.remote_transfer_endpoint,
                             tag=task_id)
    f.result()
    t.join()
    self.manual_merger.clear()
    L.info("RemoteRollPairWriteBatch write called")
def get(self, k, options: dict = None):
    if options is None:
        options = {}
    L.debug(f"get k: {k}")
    k = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=k, value=None)
    outputs = []
    value = None
    partition_id = self.partitioner(k)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    L.info(f"partitions count: {self.__store._store_locator._total_partitions}, "
           f"target partition: {partition_id}, endpoint: {egg._command_endpoint}")
    inputs = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    output = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    job_id = generate_job_id(self.__session_id, RollPair.GET)
    job = ErJob(id=job_id,
                name=RollPair.GET,
                inputs=[self.__store],
                outputs=outputs,
                functors=[ErFunctor(body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.GET,
                  inputs=inputs,
                  outputs=output,
                  job=job)
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)
    # an empty value means the key is absent
    return self.value_serdes.deserialize(job_resp._value) if job_resp._value != b'' else None
def dispatch(self, service_name: str, args, kwargs):
    if service_name not in self._service_route_table:
        raise ValueError(f'{service_name} has not been registered yet')

    _instance, _class, _method = self._service_route_table[service_name]
    if not _instance:
        _instance = _class()

    # deserialize each protobuf-encoded arg into an ErTask
    task_name = ''
    deserialized_args = list()
    for arg in args:
        task = meta_pb2.Task()
        msg_len = task.ParseFromString(arg)
        deserialized_task = ErTask.from_proto(task)
        if not task_name:
            task_name = deserialized_task._name
        deserialized_args.append(deserialized_task)

    L.debug(f"[CS] calling: [{service_name}], task_name={task_name}, "
            f"request={deserialized_args}, len={len(args)}")
    start = time.time()
    try:
        call_result = _method(_instance, *deserialized_args)
    except Exception as e:
        L.exception(f'Failed to dispatch to [{service_name}], '
                    f'task_name: {task_name}, request: {deserialized_args}')
        raise e
    elapsed = time.time() - start

    if L.isEnabledFor(logging.TRACE):
        L.trace(f"[CS] called (elapsed={elapsed}): [{service_name}]: task_name={task_name}, "
                f"request={deserialized_args}, result={call_result}")
    else:
        L.debug(f"[CS] called (elapsed={elapsed}): [{service_name}], "
                f"task_name={task_name}, request={deserialized_args}")

    # todo:2: defaults to a pb message; needs changes when other result types are present
    return [call_result.to_proto().SerializeToString()]
def delete(self, k, options: dict = None):
    if options is None:
        options = {}
    key = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=key, value=None)
    outputs = []
    value = None
    partition_id = self.partitioner(key)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    L.info(egg._command_endpoint)
    L.info(f"count: {self.__store._store_locator._total_partitions}")
    inputs = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    output = [ErPartition(id=partition_id, store_locator=self.__store._store_locator)]
    job_id = generate_job_id(self.__session_id, RollPair.DELETE)
    job = ErJob(id=job_id,
                name=RollPair.DELETE,
                inputs=[self.__store],
                outputs=outputs,
                functors=[ErFunctor(body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.DELETE,
                  inputs=inputs,
                  outputs=output,
                  job=job)
    L.info("start send req")
    # fire the DELETE task at the egg owning the key's partition
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)
def cleanup(self, name, namespace, options: dict = None):
    if not namespace:
        raise ValueError('namespace cannot be blank')
    L.debug(f'cleaning up namespace={namespace}, name={name}')
    if options is None:
        options = {}
    total_partitions = options.get('total_partitions', 1)
    partitioner = options.get('partitioner', PartitionerTypes.BYTESTRING_HASH)
    store_serdes = options.get('serdes', self.default_store_serdes)

    if name == '*':
        # wildcard: broadcast a DESTROY task to one egg per server node
        store_type = options.get('store_type', '*')
        L.debug(f'cleaning up whole store_type={store_type}, namespace={namespace}, name={name}')
        er_store = ErStore(store_locator=ErStoreLocator(namespace=namespace,
                                                        name=name,
                                                        store_type=store_type))
        job_id = generate_job_id(namespace, tag=RollPair.CLEANUP)
        job = ErJob(id=job_id,
                    name=RollPair.DESTROY,
                    inputs=[er_store],
                    options=options)

        args = list()
        cleanup_partitions = [ErPartition(id=-1, store_locator=er_store._store_locator)]
        for server_node, eggs in self.__session._eggs.items():
            egg = eggs[0]
            task = ErTask(id=generate_task_id(job_id, egg._command_endpoint._host),
                          name=job._name,
                          inputs=cleanup_partitions,
                          job=job)
            args.append(([task], egg._command_endpoint))

        futures = self.__command_client.async_call(
                args=args,
                output_types=[ErTask],
                command_uri=CommandURI(f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
        # wait for every node to finish before dropping the store metadata
        for future in futures:
            result = future.result()
        self.get_session()._cluster_manager_client.delete_store(er_store)
    else:
        # todo:1: add combine options to pass it through
        store_options = self.__session.get_all_options()
        store_options.update(options)
        final_options = store_options.copy()

        store = ErStore(
                store_locator=ErStoreLocator(store_type=StoreTypes.ROLLPAIR_LMDB,
                                             namespace=namespace,
                                             name=name,
                                             total_partitions=total_partitions,
                                             partitioner=partitioner,
                                             serdes=store_serdes),
                options=final_options)
        # look up matching stores in the namespace and destroy each of them
        task_results = self.__session._cluster_manager_client.get_store_from_namespace(store)
        L.trace('res={}'.format(task_results._stores))
        if task_results._stores is not None:
            L.trace("item count={}".format(len(task_results._stores)))
            for item in task_results._stores:
                L.trace("item namespace={} name={}".format(item._store_locator._namespace,
                                                           item._store_locator._name))
                rp = RollPair(er_store=item, rp_ctx=self)
                rp.destroy()
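# --- Illustrative usage sketch (added; not from the original source) ---
# cleanup() either broadcasts a wildcard DESTROY to one egg per server node
# (name == '*') or looks up stores via the cluster manager and destroys them
# one by one. `ctx` is assumed to be a RollPairContext; the namespace and
# table names below are hypothetical.
def _demo_cleanup(ctx):
    ctx.cleanup(name='*', namespace='demo_ns')            # wipe the whole namespace / store_type
    ctx.cleanup(name='demo_table', namespace='demo_ns')   # destroy stores resolved in the namespace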
def _decompose_job(self, job: ErJob):
    input_total_partitions = job._inputs[0]._store_locator._total_partitions
    output_total_partitions = 0 \
        if not job._outputs \
        else job._outputs[0]._store_locator._total_partitions
    larger_total_partitions = max(input_total_partitions, output_total_partitions)

    populated_input_partitions = self.populate_processor(job._inputs[0])._partitions
    if output_total_partitions > 0:
        populated_output_partitions = self.populate_processor(job._outputs[0])._partitions
    else:
        populated_output_partitions = list()

    result = list()
    for i in range(larger_total_partitions):
        input_partitions = list()
        output_partitions = list()

        if i < input_total_partitions:
            input_processor = populated_input_partitions[i]._processor
            input_server_node_id = input_processor._server_node_id
            for input_store in job._inputs:
                input_partitions.append(ErPartition(
                        id=i,
                        store_locator=input_store._store_locator,
                        processor=input_processor))
        else:
            input_processor = None
            input_server_node_id = None

        if i < output_total_partitions:
            output_processor = populated_output_partitions[i]._processor
            output_server_node_id = output_processor._server_node_id
            for output_store in job._outputs:
                output_partitions.append(ErPartition(
                        id=i,
                        store_locator=output_store._store_locator,
                        processor=output_processor))
        else:
            output_processor = None
            output_server_node_id = None

        tasks = [ErTask(id=generate_task_id(job._id, i),
                        name=f'{job._name}',
                        inputs=input_partitions,
                        outputs=output_partitions,
                        job=job)]
        # dispatch once if input and output live on the same node, otherwise to both
        if input_server_node_id == output_server_node_id:
            result.append((tasks, input_processor._command_endpoint))
        else:
            if input_server_node_id is not None:
                result.append((tasks, input_processor._command_endpoint))
            if output_server_node_id is not None:
                result.append((tasks, output_processor._command_endpoint))

    return result