def cancel(
        self,
        kafka_service: KafkaService,
        event: CommandRequestEvent,
        client_id: str,
        target_action_id: str,
        cascade: bool = False,
):
    """Cancel `target_action_id` on every distinct execution engine.

    :param kafka_service: service used to emit status events
    :param event: the originating cancel command request
    :param client_id: id of the client that owns the action
    :param target_action_id: id of the action whose resources are cancelled
    :param cascade: passed through to each engine's cancel call
    :return: the result of the last engine's cancel call, or None when no
        engines are configured
    :raises ExecutionEngineException: re-raised when an engine fails for any
        reason other than having no resources for the action
    """
    # Fix: `result` was unbound (NameError on return) when execution_engines was empty.
    result = None
    # De-duplicate: several engine ids may map to the same engine object.
    for engine in set(self.execution_engines.values()):
        try:
            result = engine.cancel(client_id, target_action_id, cascade)
        except ExecutionEngineException as err:
            if isinstance(err, NoResourcesFoundException):
                # Nothing to delete on this engine - not an error.
                logger.info(f"No resources found for {target_action_id}")
            else:
                kafka_service.send_status_message_event(
                    event,
                    Status.FAILED,
                    f"An error occurred when attempting to delete resources related to {target_action_id}. "
                    # Fix: closing parenthesis was missing after the action id.
                    f"Please check the status of the deletion command ({event.action_id}) "
                    f"for additional information.",
                    target_action_id,
                )
                # Bare raise preserves the original traceback.
                raise
    # issue event under the target action_id
    kafka_service.send_status_message_event(event, Status.SUCCEEDED, "Execution successfully cancelled...",
                                            target_action_id)
    return result
def create_fetcher_dispatcher(common_kafka_cfg: KafkaServiceConfig,
                              fetcher_cfg: FetcherServiceConfig) -> KafkaService:
    """Assemble the fetcher-dispatcher KafkaService and start its download manager."""
    manager = create_download_manager(fetcher_cfg.zookeeper_ensemble_hosts,
                                      fetcher_cfg.kubeconfig,
                                      fetcher_cfg.fetcher_job)
    manager.start()

    fetch_handler = FetcherEventHandler(common_kafka_cfg.producer_topic,
                                        manager,
                                        fetcher_cfg.s3_download_bucket)
    cmd_callback = KafkaCommandCallback(DownloadCmdObject(manager),
                                        common_kafka_cfg.cmd_return_topic)
    topic_callbacks = {
        common_kafka_cfg.consumer_topic: [fetch_handler],
        common_kafka_cfg.cmd_submit_topic: [cmd_callback],
    }

    consumer, producer = create_kafka_consumer_producer(common_kafka_cfg, SERVICE_NAME)

    return KafkaService(
        name=SERVICE_NAME,
        version=__version__,
        callbacks=topic_callbacks,
        kafka_consumer=consumer,
        kafka_producer=producer,
        pod_name=get_pod_name(),
        status_topic=common_kafka_cfg.status_topic,
    )
def simple_kafka_service(kafka_consumer: KafkaConsumer, kafka_producer: KafkaProducer):
    """Return a bare KafkaService with no callbacks registered."""
    return KafkaService(
        name=SERVICE_NAME,
        version=VERSION,
        callbacks={},
        kafka_consumer=kafka_consumer,
        kafka_producer=kafka_producer,
        pod_name=POD_NAME,
    )
def cancel(
        self,
        kafka_service: KafkaService,
        event: CommandRequestEvent,
        client_id: str,
        target_action_id: str,
        cascade: bool = False,
) -> Optional[Any]:
    """Cancel any pending downloads belonging to `target_action_id`.

    :param kafka_service: service used to emit status events
    :param event: the originating cancel command request
    :param client_id: id of the client that owns the downloads
    :param target_action_id: id of the action whose downloads are cancelled
    :param cascade: accepted for interface compatibility; not used here
    :return: dict with the kubernetes deletion results and the number of
        zookeeper nodes that were updated
    :raises Exception: re-raised from the download manager after a FAILED
        status has been reported
    """
    kafka_service.send_status_message_event(event, Status.PENDING, "Canceling downloads...", target_action_id)
    try:
        k8s_delete_results, num_zk_nodes_updated = self.download_mgr.cancel(
            client_id, target_action_id)
    except Exception:
        kafka_service.send_status_message_event(
            event,
            Status.FAILED,
            f"An error occurred when attempting to delete resources related to {target_action_id}. "
            # Fix: closing parenthesis was missing after the action id.
            f"Please check the status of the deletion command ({event.action_id}) "
            f"for additional information.",
            target_action_id,
        )
        # Bare raise preserves the original traceback.
        raise
    if num_zk_nodes_updated == 0:
        # Nothing was pending - still a successful (no-op) cancellation.
        kafka_service.send_status_message_event(
            event, Status.SUCCEEDED, "No downloads to cancel...", target_action_id)
    return {
        "k8s_deletion_results": k8s_delete_results,
        "num_zookeeper_nodes_updated": num_zk_nodes_updated
    }
def _create_kafka_service(callbacks: Dict[str, List[KafkaServiceCallback]], kafka_consumer, kafka_producer):
    """Build a KafkaService wired with `callbacks` plus a stop-loop callback on the consumer topic."""

    class StopKafkaServiceCallback(KafkaServiceCallback):
        """Stops the service's event loop as soon as any event arrives."""

        def handle_event(self, event: BenchmarkEvent, kafka_service: KafkaService):
            kafka_service.stop_loop()

        def cleanup(self):
            pass

    service = KafkaService(
        name=SERVICE_NAME,
        version=VERSION,
        callbacks=callbacks,
        kafka_consumer=kafka_consumer,
        kafka_producer=kafka_producer,
        pod_name=POD_NAME,
        status_topic=STATUS_TOPIC,
    )
    service.add_callback(StopKafkaServiceCallback(), CONSUMER_TOPIC)
    return service
def kafka_service(mocker) -> KafkaService:
    """Fixture: KafkaService with autospecced consumer/producer and spied send methods."""
    from kafka import KafkaConsumer, KafkaProducer

    service = KafkaService(
        name="kafka-service",
        version="1.0",
        callbacks={},
        kafka_consumer=mocker.create_autospec(KafkaConsumer),
        kafka_producer=mocker.create_autospec(KafkaProducer),
        pod_name=POD_NAME,
        status_topic=STATUS_TOPIC,
    )
    for spied_method in ("send_status_message_event", "send_event"):
        mocker.spy(service, spied_method)
    return service
def create_service(common_kafka_cfg: KafkaServiceConfig, service_cfg: WatcherServiceConfig) -> KafkaService:
    """Assemble the job-watcher KafkaService from the given configurations."""
    consumer, producer = create_kafka_consumer_producer(common_kafka_cfg, SERVICE_NAME)
    watch_callbacks = {common_kafka_cfg.consumer_topic: [WatchJobsEventHandler(service_cfg)]}
    return KafkaService(name=SERVICE_NAME,
                        version=__version__,
                        callbacks=watch_callbacks,
                        kafka_consumer=consumer,
                        kafka_producer=producer,
                        pod_name=get_pod_name(),
                        status_topic=common_kafka_cfg.status_topic)
def handle_event(self, event: FetcherBenchmarkEvent, kafka_service: KafkaService):
    """Submit a single-run benchmark to the execution engine named in the event.

    Non single-run events and events for unknown engines are ignored. On a
    successful submission an ExecutorBenchmarkEvent is published to
    `self.producer_topic`.

    :param event: fetcher benchmark event describing the submission
    :param kafka_service: service used to emit status and response events
    :raises KafkaServiceCallbackException: when the engine fails to run the job
    """
    # Only handle single run benchmarks
    if not ExecutorEventHandler.is_single_run(event):
        # Consistency fix: this block mixed module-level `logging.*` calls with
        # the file's `logger` - use `logger` throughout.
        logger.debug(f"Ignoring non single-run event: {event}")
        return
    engine_id = ExecutorEventHandler.get_engine_id(event)
    engine = self.execution_engines.get(engine_id)
    if not engine:
        # Ok. I've failed, but may be another service can have this engine
        if engine_id in self.valid_execution_engines:
            logger.info(f"{engine_id} is whitelisted, but not present here. Nothing to do")
        else:
            # It's really something weird
            logger.warning(f"Unknown engine {engine_id}")
        return
    try:
        kafka_service.send_status_message_event(
            event, Status.PENDING, "Processing benchmark submission request...")
        job = engine.run(event)
    except ExecutionEngineException as e:
        logger.exception("Engine throws exception")
        kafka_service.send_status_message_event(event, Status.ERROR, str(e))
        raise KafkaServiceCallbackException from e
    payload = ExecutorPayload.create_from_fetcher_payload(event.payload, job)
    response_event = create_from_object(ExecutorBenchmarkEvent, event, payload=payload)
    kafka_service.send_status_message_event(
        response_event, Status.SUCCEEDED, "Benchmark successfully created...")
    kafka_service.send_event(response_event, topic=self.producer_topic)
def create_executor_service(
        service_name: str, version: str, common_kafka_cfg: KafkaServiceConfig,
        engines: Dict[str, ExecutionEngine]) -> KafkaService:
    """Assemble an executor KafkaService for the given engines and Kafka config."""
    consumer, producer = create_kafka_consumer_producer(common_kafka_cfg, service_name)
    return KafkaService(
        name=service_name,
        version=version,
        callbacks=_create_callbacks(common_kafka_cfg, engines),
        kafka_consumer=consumer,
        kafka_producer=producer,
        pod_name=get_pod_name(),
        status_topic=common_kafka_cfg.status_topic,
    )
def handle_event(self, event: FetcherBenchmarkEvent, kafka_service: KafkaService):
    """Submit a scheduled (non single-run) benchmark to the Kubernetes engine.

    Single-run events are ignored - they are handled by the single-run executor.

    :param event: fetcher benchmark event describing the scheduled benchmark
    :param kafka_service: service used to emit status events
    :raises KafkaServiceCallbackException: when scheduling fails
    """
    # Only handle scheduled benchmarks
    if ScheduledBenchmarkExecutorEventHandler.is_single_run(event):
        # Fix: message was garbled ("Ignoring event non scheduled benchmark event");
        # also use the file's `logger` instead of the bare `logging` module.
        logger.debug(f"Ignoring non-scheduled benchmark event: {event}")
        return
    try:
        kafka_service.send_status_message_event(
            event, Status.PENDING, "Processing scheduled benchmark submission request...")
        job = self.k8s_execution_engine.schedule(event)
    except ExecutionEngineException as e:
        logger.exception("Engine throws exception")
        kafka_service.send_status_message_event(event, Status.ERROR, str(e))
        raise KafkaServiceCallbackException from e
    kafka_service.send_status_message_event(
        event, Status.SUCCEEDED,
        f"Scheduled benchmark successfully submitted with job id {job.id}")
def handle_event(self, event: ExecutorBenchmarkEvent, kafka_service: KafkaService):
    """Start a watcher for the job described by `event`, unless one is already registered.

    SageMaker jobs get a SageMakerTrainingJobWatcher; everything else a
    KubernetesJobWatcher. The started watcher is recorded in self.watchers
    keyed by job id.
    """
    job_id = event.payload.job.id
    if job_id in self.watchers:
        # This shouldn't happen, so it is here more as a protection mechanism
        logger.warning("There is already a watcher for job '%s'", job_id)
        return
    descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents)
    # Only single-node and inference strategies are watched; others are reported and skipped.
    if descriptor.hardware.strategy not in [
            DistributedStrategy.SINGLE_NODE, DistributedStrategy.INFERENCE
    ]:
        logger.info(f"Unsupported strategy {descriptor.hardware.strategy}")
        # NOTE(review): status is PENDING even though the job will never be watched - confirm intended.
        kafka_service.send_status_message_event(
            event, Status.PENDING,
            f"'{descriptor.hardware.strategy.value}' strategy is not currently supported.")
        return
    logger.info("Starting to watch the job '%s'", job_id)
    # Third argument presumably toggles k8s-specific callback behavior
    # (True for non-SageMaker jobs) - TODO confirm against _make_status_callback.
    watcher_callback = self._make_status_callback(
        event, kafka_service, not self._is_sage_maker_job(event))
    if self._is_sage_maker_job(event):
        watcher = SageMakerTrainingJobWatcher(
            job_id=job_id,
            callback=watcher_callback,
            sagemaker_client=boto3.client("sagemaker"))
        kafka_service.send_status_message_event(
            event, Status.PENDING, "Watching SageMaker benchmark")
    else:
        watcher = KubernetesJobWatcher(
            job_id,
            watcher_callback,
            kubernetes_client_jobs=kubernetes.client.BatchV1Api(),
            kubernetes_client_pods=kubernetes.client.CoreV1Api(),
            kubernetes_namespace=self.config.
            kubernetes_namespace_of_running_jobs,
        )
        kafka_service.send_status_message_event(
            event, Status.PENDING, "Watching Kubernetes benchmark")
    self.watchers[job_id] = watcher
    watcher.start()
def handle_event(self, event: FetcherBenchmarkEvent, kafka_service: KafkaService):
    """Kick off downloads for every dataset/model in the event that has no destination yet.

    Each download reports progress through Kafka status events; once all
    downloads finish, the event is forwarded down the pipeline only if the
    combined status is SUCCEEDED.
    """

    def extract_downloads(event) -> List[DownloadableContent]:
        # Datasets and models are fetched through the same mechanism.
        return event.payload.datasets + event.payload.models

    def execute(task: DownloadableContent, callback) -> None:
        # Assign the S3 destination before dispatching the fetch job.
        task.dst = get_content_dst(task, self.s3_download_bucket)
        kafka_service.send_status_message_event(
            event, Status.PENDING, f"Preparing {task.src} for download...")
        self.download_mgr.fetch(task, event, callback)

    def execute_all(tasks: List[DownloadableContent], callback: Callable) -> None:
        # Fires `callback` once every task has reached a terminal (or reported) state.
        kafka_service.send_status_message_event(event, Status.PENDING, "Initiating downloads...")
        pending = list(tasks)

        def on_done(content: DownloadableContent):
            # Map the fetcher's status onto a pipeline status + user message.
            if content.status == FetcherStatus.DONE:
                msg, status = f"{content.src} downloaded...", Status.PENDING
            elif content.status == FetcherStatus.CANCELED:
                msg, status = f"{content.src} download canceled...", Status.CANCELED
            elif content.status == FetcherStatus.FAILED:
                msg, status = f"{content.src} download failed: '{content.message}'...", Status.FAILED
            elif content.status in {
                    FetcherStatus.RUNNING, FetcherStatus.PENDING
            }:
                msg, status = f"Downloading {content.src}...", Status.PENDING
            else:
                msg, status = f"Unknown status {content.status} issued for {content.src}", Status.ERROR
            if msg and status:
                kafka_service.send_status_message_event(event, status, msg)
            # NOTE(review): remove() is called even for non-terminal RUNNING/PENDING
            # statuses - confirm on_done is only invoked on terminal transitions.
            pending.remove(content)
            if not pending:
                callback()

        for tsk in tasks:
            execute(tsk, on_done)

    tasks = extract_downloads(event)
    # Skip content that already has a destination assigned (already fetched).
    tasks = list(filter(lambda t: not t.dst, tasks))
    if not tasks:
        kafka_service.send_status_message_event(event, Status.SUCCEEDED, "Nothing to fetch")
        kafka_service.send_event(event, self.producer_topic)
        return

    def on_all_done():
        # NOTE(review): only datasets are inspected for the combined status;
        # models are ignored here - confirm intended.
        total_status = FetcherEventHandler._collect_status(
            event.payload.datasets)
        # Any failed/canceled fetching is not actionable - so we don't send it down the pipeline
        if total_status == Status.SUCCEEDED:
            kafka_service.send_event(event, self.producer_topic)
            kafka_service.send_status_message_event(
                event, total_status, "All downloads processed")
        elif total_status in [Status.CANCELED, Status.FAILED]:
            kafka_service.send_status_message_event(
                event, total_status, "Aborting execution")
        else:
            logging.warning(
                f"Fetching ended with unexpected status: {total_status}")

    execute_all(tasks, on_all_done)
def test_dont_remove_from_running(simple_kafka_service: KafkaService):
    """Removing a callback while the loop is running must raise LoopAlreadyRunningException."""
    simple_kafka_service._running = True
    expected_msg = re.escape(KafkaService._CANNOT_UPDATE_CALLBACKS)
    with pytest.raises(KafkaService.LoopAlreadyRunningException, match=expected_msg):
        simple_kafka_service.remove_callback(MagicMock(spec=KafkaServiceCallback), CONSUMER_TOPIC)
def handle_event(self, event: BenchmarkEvent, kafka_service: KafkaService): kafka_service.stop_loop() return None
def handle_event(self, event: CommandRequestEvent, kafka_service: KafkaService):
    """Dispatch a command request to the matching method on self.cmd_object.

    Arguments are bound from the payload (list -> positional, dict -> keyword),
    then from the special names `event`/`kafka_service`, then from matching
    attributes of the event itself. A CommandResponseEvent carrying the result
    (or an error code/message) is always sent to self.cmd_return_topic when a
    callable command was found.
    """
    # Safety check
    payload = event.payload
    if not payload or not isinstance(payload, CommandRequestPayload):
        logger.info("Wrong payload passed")
        return
    command = event.payload.command
    args = event.payload.args
    logger.info(f"Command {command} called with {args}")
    if not hasattr(self.cmd_object, command):
        # May be another callback in the chain will be able to handle it
        logger.info("No command method found")
        return
    method = getattr(self.cmd_object, command)
    if callable(method):
        sig = signature(method)
        result = None
        code = KafkaCommandCallback.CODE_SUCCESS
        msg = self._get_message(KafkaCommandCallback.OK, command)
        pos_args = []
        kw_args = {}
        # Payload args may be a list (positional) or a dict (keyword).
        if isinstance(args, list):
            pos_args = tuple(args)
        elif isinstance(args, dict):
            kw_args = dict(args)
        # Inject the framework objects if the command method asks for them.
        if "event" in sig.parameters:
            kw_args["event"] = event
        if "kafka_service" in sig.parameters:
            kw_args["kafka_service"] = kafka_service
        params = OrderedDict(sig.parameters)
        # Remove positionals
        orderparams = list(params)
        for i in range(0, min(len(pos_args), len(orderparams))):
            del params[orderparams[i]]
        # Remove named
        for k in kw_args:
            if k in params:
                del params[k]
        # Fill any still-unbound parameters from same-named event attributes.
        from_event = vars(event)
        for key in from_event.keys() & params.keys():
            kw_args[key] = from_event[key]
        try:
            logger.info(f"executing {command}({pos_args}, {kw_args})")
            result = method(*pos_args, **kw_args)
        except TypeError as e:
            # Signature mismatch - report as a client error.
            logger.exception("Method invocation failed")
            code = KafkaCommandCallback.CODE_CLIENT_ERROR
            msg = self._get_message(KafkaCommandCallback.INVALID_ARGS, command, str(e))
        except Exception as e:
            # Command itself failed - report as a command error.
            logger.exception("Command failed")
            code = KafkaCommandCallback.CODE_COMMAND_ERROR
            msg = self._get_message(KafkaCommandCallback.UNKNOWN_ERROR, command, str(e))
        response_payload = CommandResponsePayload(code, msg, event, result)
        response_event = create_from_object(CommandResponseEvent, event, payload=response_payload)
        kafka_service.send_event(response_event, self.cmd_return_topic)
    else:
        logger.info(f"Uncallable {command} member requested")
def handle_event(self, event: BenchmarkEvent, kafka_service: KafkaService): kafka_service.send_event(event, topic=self.topic)
def test_kafka_service_stop_before_run(simple_kafka_service: KafkaService):
    """Stopping the loop before it ever ran raises LoopNotRunningException."""
    assert not simple_kafka_service.running
    expected_msg = re.escape(KafkaService._IS_NOT_RUNNING)
    with pytest.raises(KafkaService.LoopNotRunningException, match=expected_msg):
        simple_kafka_service.stop_loop()
def test_kafka_service_started_twice(simple_kafka_service: KafkaService):
    """Starting the loop while it is already running raises LoopAlreadyRunningException."""
    simple_kafka_service._running = True
    expected_msg = re.escape(KafkaService._LOOP_IS_ALREADY_RUNNING)
    with pytest.raises(KafkaService.LoopAlreadyRunningException, match=expected_msg):
        simple_kafka_service.run_loop()