Example #1
0
    def cancel(
        self,
        kafka_service: KafkaService,
        event: CommandRequestEvent,
        client_id: str,
        target_action_id: str,
        cascade: bool = False,
    ):
        """Cancel the action *target_action_id* on the configured engines.

        On an engine error a FAILED status is emitted (unless no resources
        were found, which is only logged) and the error is re-raised; on
        success a SUCCEEDED status is emitted under the target action id.

        :param kafka_service: service used to publish status events
        :param event: the command request event that triggered the cancel
        :param client_id: id of the client owning the action
        :param target_action_id: id of the action to cancel
        :param cascade: whether to also cancel dependent resources
        :return: the engine's cancel result
        """
        for engine in set(self.execution_engines.values()):
            try:
                result = engine.cancel(client_id, target_action_id, cascade)
            except ExecutionEngineException as err:
                if isinstance(err, NoResourcesFoundException):
                    # Nothing to delete is not a service failure - just log it.
                    logger.info(f"No resources found for {target_action_id}")
                else:
                    kafka_service.send_status_message_event(
                        event,
                        Status.FAILED,
                        f"An error occurred when attempting to delete resources related to {target_action_id}. "
                        # Fixed: the action id reference was missing its closing parenthesis.
                        f"Please check the status of the deletion command ({event.action_id}) "
                        f"for additional information.",
                        target_action_id,
                    )
                # Both branches re-raise so the caller can report the command failure.
                raise err

            # issue event under the target action_id
            kafka_service.send_status_message_event(
                event, Status.SUCCEEDED, "Execution successfully cancelled...",
                target_action_id)

            # NOTE(review): returning here means only the FIRST engine in the
            # set is asked to cancel - confirm that a single engine owning the
            # action is the intended contract before iterating further.
            return result
def create_fetcher_dispatcher(
        common_kafka_cfg: KafkaServiceConfig,
        fetcher_cfg: FetcherServiceConfig) -> KafkaService:
    """Wire up and return the fetcher dispatcher KafkaService.

    Starts the download manager, registers the fetcher event handler on the
    consumer topic and the download command callback on the command topic.

    :param common_kafka_cfg: shared Kafka topics/connection configuration
    :param fetcher_cfg: fetcher-specific configuration (zookeeper, k8s, s3)
    :return: a fully configured, not-yet-running KafkaService
    """
    manager = create_download_manager(
        fetcher_cfg.zookeeper_ensemble_hosts, fetcher_cfg.kubeconfig,
        fetcher_cfg.fetcher_job)
    manager.start()

    fetcher_handler = FetcherEventHandler(common_kafka_cfg.producer_topic,
                                          manager,
                                          fetcher_cfg.s3_download_bucket)
    command_callback = KafkaCommandCallback(DownloadCmdObject(manager),
                                            common_kafka_cfg.cmd_return_topic)
    topic_callbacks = {
        common_kafka_cfg.consumer_topic: [fetcher_handler],
        common_kafka_cfg.cmd_submit_topic: [command_callback],
    }

    consumer, producer = create_kafka_consumer_producer(
        common_kafka_cfg, SERVICE_NAME)

    return KafkaService(
        name=SERVICE_NAME,
        version=__version__,
        callbacks=topic_callbacks,
        kafka_consumer=consumer,
        kafka_producer=producer,
        pod_name=get_pod_name(),
        status_topic=common_kafka_cfg.status_topic,
    )
def simple_kafka_service(kafka_consumer: KafkaConsumer,
                         kafka_producer: KafkaProducer):
    """Build a minimal KafkaService with no callbacks registered.

    :param kafka_consumer: consumer the service will poll
    :param kafka_producer: producer the service will publish with
    :return: the constructed KafkaService
    """
    return KafkaService(
        name=SERVICE_NAME,
        version=VERSION,
        callbacks={},
        kafka_consumer=kafka_consumer,
        kafka_producer=kafka_producer,
        pod_name=POD_NAME,
    )
    def cancel(
        self,
        kafka_service: KafkaService,
        event: CommandRequestEvent,
        client_id: str,
        target_action_id: str,
        cascade: bool = False,
    ) -> Optional[Any]:
        """Cancel the downloads associated with *target_action_id*.

        Emits a PENDING status before delegating to the download manager,
        a FAILED status (then re-raises) if the manager throws, and a
        SUCCEEDED status when nothing was left to cancel.

        :param kafka_service: service used to publish status events
        :param event: the command request event that triggered the cancel
        :param client_id: id of the client owning the action
        :param target_action_id: id of the action whose downloads to cancel
        :param cascade: accepted for interface compatibility; unused here
        :return: dict with the k8s deletion results and the number of
            zookeeper nodes updated
        """
        kafka_service.send_status_message_event(event, Status.PENDING,
                                                "Canceling downloads...",
                                                target_action_id)
        try:
            k8s_delete_results, num_zk_nodes_updated = self.download_mgr.cancel(
                client_id, target_action_id)
        except Exception as err:
            kafka_service.send_status_message_event(
                event,
                Status.FAILED,
                f"An error occurred when attempting to delete resources related to {target_action_id}. "
                # Fixed: the action id reference was missing its closing parenthesis.
                f"Please check the status of the deletion command ({event.action_id}) "
                f"for additional information.",
                target_action_id,
            )
            raise err

        if num_zk_nodes_updated == 0:
            # Nothing was pending, so the cancel is trivially complete.
            kafka_service.send_status_message_event(
                event, Status.SUCCEEDED, "No downloads to cancel...",
                target_action_id)

        return {
            "k8s_deletion_results": k8s_delete_results,
            "num_zookeeper_nodes_updated": num_zk_nodes_updated
        }
def _create_kafka_service(callbacks: Dict[str, List[KafkaServiceCallback]],
                          kafka_consumer, kafka_producer):
    """Create a KafkaService that stops its loop after the first consumed event.

    :param callbacks: per-topic callbacks to register at construction time
    :param kafka_consumer: consumer the service will poll
    :param kafka_producer: producer the service will publish with
    :return: the constructed KafkaService
    """
    class StopKafkaServiceCallback(KafkaServiceCallback):
        # Terminates the consume loop as soon as any event arrives.
        def handle_event(self, event: BenchmarkEvent,
                         kafka_service: KafkaService):
            kafka_service.stop_loop()
            return None

        def cleanup(self):
            pass

    service = KafkaService(
        name=SERVICE_NAME,
        version=VERSION,
        callbacks=callbacks,
        kafka_consumer=kafka_consumer,
        kafka_producer=kafka_producer,
        pod_name=POD_NAME,
        status_topic=STATUS_TOPIC,
    )
    service.add_callback(StopKafkaServiceCallback(), CONSUMER_TOPIC)
    return service
Example #6
0
def kafka_service(mocker) -> KafkaService:
    """Build a KafkaService backed by autospec'd Kafka clients for tests.

    The send_status_message_event and send_event methods are spied so tests
    can assert on the emitted events.

    :param mocker: pytest-mock fixture
    :return: the spied KafkaService instance
    """
    from kafka import KafkaConsumer, KafkaProducer

    consumer_mock = mocker.create_autospec(KafkaConsumer)
    producer_mock = mocker.create_autospec(KafkaProducer)
    service = KafkaService(
        name="kafka-service",
        version="1.0",
        callbacks={},
        kafka_consumer=consumer_mock,
        kafka_producer=producer_mock,
        pod_name=POD_NAME,
        status_topic=STATUS_TOPIC,
    )
    for spied_method in ("send_status_message_event", "send_event"):
        mocker.spy(service, spied_method)
    return service
def create_service(common_kafka_cfg: KafkaServiceConfig,
                   service_cfg: WatcherServiceConfig) -> KafkaService:
    """Wire up and return the watcher KafkaService.

    Registers a WatchJobsEventHandler on the consumer topic.

    :param common_kafka_cfg: shared Kafka topics/connection configuration
    :param service_cfg: watcher-specific configuration
    :return: a fully configured, not-yet-running KafkaService
    """
    watch_handler = WatchJobsEventHandler(service_cfg)
    topic_callbacks = {common_kafka_cfg.consumer_topic: [watch_handler]}

    consumer, producer = create_kafka_consumer_producer(
        common_kafka_cfg, SERVICE_NAME)

    return KafkaService(
        name=SERVICE_NAME,
        version=__version__,
        callbacks=topic_callbacks,
        kafka_consumer=consumer,
        kafka_producer=producer,
        pod_name=get_pod_name(),
        status_topic=common_kafka_cfg.status_topic,
    )
    def handle_event(self, event: FetcherBenchmarkEvent,
                     kafka_service: KafkaService):
        """Submit a single-run benchmark to the engine named in the event.

        Non single-run events and events for engines not hosted here are
        ignored. Engine failures are reported as ERROR status and re-raised
        as KafkaServiceCallbackException; on success an executor event is
        published to the producer topic.

        :param event: the fetcher benchmark event to execute
        :param kafka_service: service used to publish statuses and results
        """
        # Only handle single run benchmarks
        if not ExecutorEventHandler.is_single_run(event):
            logging.debug(f"Ignoring non single-run event: {event}")
            return

        requested_engine_id = ExecutorEventHandler.get_engine_id(event)
        engine = self.execution_engines.get(requested_engine_id)

        if not engine:
            # Another deployment of this service may host the engine.
            if requested_engine_id in self.valid_execution_engines:
                logging.info(
                    f"{requested_engine_id} is whitelisted, but not present here. Nothing to do"
                )
            else:
                # It's really something weird
                logging.warning(f"Unknown engine {requested_engine_id}")
            return

        try:
            kafka_service.send_status_message_event(
                event, Status.PENDING,
                "Processing benchmark submission request...")
            job = engine.run(event)
        except ExecutionEngineException as e:
            logger.exception("Engine throws exception")
            kafka_service.send_status_message_event(event, Status.ERROR,
                                                    str(e))
            raise KafkaServiceCallbackException from e

        executor_payload = ExecutorPayload.create_from_fetcher_payload(
            event.payload, job)
        outgoing_event = create_from_object(ExecutorBenchmarkEvent,
                                            event,
                                            payload=executor_payload)

        kafka_service.send_status_message_event(
            outgoing_event, Status.SUCCEEDED,
            "Benchmark successfully created...")
        kafka_service.send_event(outgoing_event, topic=self.producer_topic)
Example #9
0
def create_executor_service(
        service_name: str, version: str, common_kafka_cfg: KafkaServiceConfig,
        engines: Dict[str, ExecutionEngine]) -> KafkaService:
    """Wire up and return an executor KafkaService.

    :param service_name: name used for the consumer group and the service
    :param version: service version reported in events
    :param common_kafka_cfg: shared Kafka topics/connection configuration
    :param engines: execution engines keyed by engine id
    :return: a fully configured, not-yet-running KafkaService
    """
    topic_callbacks = _create_callbacks(common_kafka_cfg, engines)

    consumer, producer = create_kafka_consumer_producer(
        common_kafka_cfg, service_name)

    return KafkaService(
        name=service_name,
        version=version,
        callbacks=topic_callbacks,
        kafka_consumer=consumer,
        kafka_producer=producer,
        pod_name=get_pod_name(),
        status_topic=common_kafka_cfg.status_topic,
    )
Example #10
0
    def handle_event(self, event: FetcherBenchmarkEvent, kafka_service: KafkaService):
        """Schedule a benchmark on the Kubernetes execution engine.

        Single-run (non-scheduled) events are ignored. Engine failures are
        reported as ERROR status and re-raised as
        KafkaServiceCallbackException.

        :param event: the fetcher benchmark event to schedule
        :param kafka_service: service used to publish status events
        """
        # Only handle scheduled benchmarks
        if ScheduledBenchmarkExecutorEventHandler.is_single_run(event):
            logging.debug(f"Ignoring event non scheduled benchmark event: {event}")
            return

        try:
            kafka_service.send_status_message_event(
                event, Status.PENDING, "Processing scheduled benchmark submission request..."
            )
            scheduled_job = self.k8s_execution_engine.schedule(event)
        except ExecutionEngineException as e:
            logger.exception("Engine throws exception")
            kafka_service.send_status_message_event(event, Status.ERROR, str(e))
            raise KafkaServiceCallbackException from e

        success_message = f"Scheduled benchmark successfully submitted with job id {scheduled_job.id}"
        kafka_service.send_status_message_event(event, Status.SUCCEEDED, success_message)
    def handle_event(self, event: ExecutorBenchmarkEvent,
                     kafka_service: KafkaService):
        """Start watching the job described by *event*.

        Skips jobs that already have a watcher and benchmarks whose
        distributed strategy is not SINGLE_NODE or INFERENCE. Depending on
        whether the event describes a SageMaker job, a SageMaker or a
        Kubernetes watcher is created, registered in self.watchers, and
        started.

        :param event: executor event carrying the job id and descriptor TOML
        :param kafka_service: service used to publish status events
        """
        job_id = event.payload.job.id
        if job_id in self.watchers:
            # This shouldn't happen, so it is here more as a protection mechanism
            logger.warning("There is already a watcher for job '%s'", job_id)
            return

        # Rebuild the benchmark descriptor from the TOML embedded in the event.
        descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents)
        if descriptor.hardware.strategy not in [
                DistributedStrategy.SINGLE_NODE, DistributedStrategy.INFERENCE
        ]:
            logger.info(f"Unsupported strategy {descriptor.hardware.strategy}")
            kafka_service.send_status_message_event(
                event, Status.PENDING,
                f"'{descriptor.hardware.strategy.value}' strategy is not currently supported."
            )
            return

        logger.info("Starting to watch the job '%s'", job_id)

        # The third argument toggles callback behavior for non-SageMaker jobs;
        # its exact semantics live in _make_status_callback.
        watcher_callback = self._make_status_callback(
            event, kafka_service, not self._is_sage_maker_job(event))
        if self._is_sage_maker_job(event):
            watcher = SageMakerTrainingJobWatcher(
                job_id=job_id,
                callback=watcher_callback,
                sagemaker_client=boto3.client("sagemaker"))
            kafka_service.send_status_message_event(
                event, Status.PENDING, "Watching SageMaker benchmark")
        else:
            watcher = KubernetesJobWatcher(
                job_id,
                watcher_callback,
                kubernetes_client_jobs=kubernetes.client.BatchV1Api(),
                kubernetes_client_pods=kubernetes.client.CoreV1Api(),
                kubernetes_namespace=self.config.
                kubernetes_namespace_of_running_jobs,
            )
            kafka_service.send_status_message_event(
                event, Status.PENDING, "Watching Kubernetes benchmark")
        # Register before starting so duplicate events are rejected above.
        self.watchers[job_id] = watcher
        watcher.start()
    def handle_event(self, event: FetcherBenchmarkEvent,
                     kafka_service: KafkaService):
        """Kick off the downloads requested by *event* and report progress.

        Datasets and models without a destination are handed to the download
        manager; per-content status is streamed as Kafka status events. When
        everything finishes successfully the event is forwarded to the
        producer topic; failed or canceled fetches abort the pipeline.

        :param event: the fetcher benchmark event listing datasets/models
        :param kafka_service: service used to publish statuses and results
        """
        def extract_downloads(event) -> List[DownloadableContent]:
            # Both datasets and models are fetched through the same pipeline.
            return event.payload.datasets + event.payload.models

        def execute(task: DownloadableContent, callback) -> None:
            # Resolve the S3 destination before scheduling the fetch.
            task.dst = get_content_dst(task, self.s3_download_bucket)

            kafka_service.send_status_message_event(
                event, Status.PENDING, f"Preparing {task.src} for download...")

            self.download_mgr.fetch(task, event, callback)

        def execute_all(tasks: List[DownloadableContent],
                        callback: Callable) -> None:
            # Fires `callback` once every task has reported a final status.
            kafka_service.send_status_message_event(event, Status.PENDING,
                                                    "Initiating downloads...")

            pending = list(tasks)

            def on_done(content: DownloadableContent):
                # Map fetcher statuses onto pipeline statuses + user messages.
                if content.status == FetcherStatus.DONE:
                    msg, status = f"{content.src} downloaded...", Status.PENDING
                elif content.status == FetcherStatus.CANCELED:
                    msg, status = f"{content.src} download canceled...", Status.CANCELED
                elif content.status == FetcherStatus.FAILED:
                    msg, status = f"{content.src} download failed: '{content.message}'...", Status.FAILED
                elif content.status in {
                        FetcherStatus.RUNNING, FetcherStatus.PENDING
                }:
                    msg, status = f"Downloading {content.src}...", Status.PENDING
                else:
                    msg, status = f"Unknown status {content.status} issued for {content.src}", Status.ERROR

                if msg and status:
                    kafka_service.send_status_message_event(event, status, msg)

                pending.remove(content)
                if not pending:
                    callback()

            for tsk in tasks:
                execute(tsk, on_done)

        # Only fetch content that has no destination assigned yet.
        tasks = extract_downloads(event)
        tasks = list(filter(lambda t: not t.dst, tasks))

        if not tasks:
            kafka_service.send_status_message_event(event, Status.SUCCEEDED,
                                                    "Nothing to fetch")
            kafka_service.send_event(event, self.producer_topic)
            return

        def on_all_done():
            # NOTE(review): only dataset statuses are aggregated here although
            # models are fetched too - confirm whether models should be
            # included in the final status collection.
            total_status = FetcherEventHandler._collect_status(
                event.payload.datasets)
            # Any failed/canceled fetching is not actionable - so we don't send it down the pipeline
            if total_status == Status.SUCCEEDED:
                kafka_service.send_event(event, self.producer_topic)
                kafka_service.send_status_message_event(
                    event, total_status, "All downloads processed")
            elif total_status in [Status.CANCELED, Status.FAILED]:
                kafka_service.send_status_message_event(
                    event, total_status, "Aborting execution")
            else:
                logging.warning(
                    f"Fetching ended with unexpected status: {total_status}")

        execute_all(tasks, on_all_done)
def test_dont_remove_from_running(simple_kafka_service: KafkaService):
    """Removing a callback while the loop runs raises LoopAlreadyRunningException."""
    simple_kafka_service._running = True
    dummy_callback = MagicMock(spec=KafkaServiceCallback)
    expected_msg = re.escape(KafkaService._CANNOT_UPDATE_CALLBACKS)
    with pytest.raises(KafkaService.LoopAlreadyRunningException,
                       match=expected_msg):
        simple_kafka_service.remove_callback(dummy_callback, CONSUMER_TOPIC)
 def handle_event(self, event: BenchmarkEvent,
                  kafka_service: KafkaService):
     """Stop the service's consume loop as soon as any event is received."""
     kafka_service.stop_loop()
     return None
Example #15
0
    def handle_event(self, event: CommandRequestEvent,
                     kafka_service: KafkaService):
        """Dispatch a command request to the matching method on cmd_object.

        Looks up the method named by the event's command, binds the event's
        args to the method's signature (positionally for a list, by keyword
        for a dict, plus `event`/`kafka_service` and matching event fields),
        invokes it, and publishes a CommandResponseEvent on the return topic.

        :param event: the command request event to execute
        :param kafka_service: service used to publish the response event
        """
        # Safety check
        payload = event.payload
        if not payload or not isinstance(payload, CommandRequestPayload):
            logger.info("Wrong payload passed")
            return

        command = event.payload.command
        args = event.payload.args

        logger.info(f"Command {command} called with {args}")

        if not hasattr(self.cmd_object, command):
            # May be another callback in the chain will be able to handle it
            logger.info("No command method found")
            return
        method = getattr(self.cmd_object, command)

        if callable(method):
            sig = signature(method)

            result = None
            code = KafkaCommandCallback.CODE_SUCCESS
            msg = self._get_message(KafkaCommandCallback.OK, command)

            pos_args = []
            kw_args = {}

            # A list payload is passed positionally, a dict by keyword.
            if isinstance(args, list):
                pos_args = tuple(args)
            elif isinstance(args, dict):
                kw_args = dict(args)

            # Inject the event and the service if the method asks for them.
            if "event" in sig.parameters:
                kw_args["event"] = event
            if "kafka_service" in sig.parameters:
                kw_args["kafka_service"] = kafka_service

            params = OrderedDict(sig.parameters)

            # Remove positionals
            orderparams = list(params)
            for i in range(0, min(len(pos_args), len(orderparams))):
                del params[orderparams[i]]

            # Remove named
            for k in kw_args:
                if k in params:
                    del params[k]

            # Fill any still-unbound parameters from same-named event fields.
            from_event = vars(event)
            for key in from_event.keys() & params.keys():
                kw_args[key] = from_event[key]
            try:
                logger.info(f"executing {command}({pos_args}, {kw_args})")
                result = method(*pos_args, **kw_args)
            except TypeError as e:
                # Signature mismatch - report it as a client error.
                logger.exception("Method invocation failed")
                code = KafkaCommandCallback.CODE_CLIENT_ERROR
                msg = self._get_message(KafkaCommandCallback.INVALID_ARGS,
                                        command, str(e))
            except Exception as e:
                # The command itself failed - report it as a command error.
                logger.exception("Command failed")
                code = KafkaCommandCallback.CODE_COMMAND_ERROR
                msg = self._get_message(KafkaCommandCallback.UNKNOWN_ERROR,
                                        command, str(e))

            # Always respond, even on failure, so the caller gets a verdict.
            response_payload = CommandResponsePayload(code, msg, event, result)
            response_event = create_from_object(CommandResponseEvent,
                                                event,
                                                payload=response_payload)
            kafka_service.send_event(response_event, self.cmd_return_topic)
        else:
            logger.info(f"Uncallable {command} member requested")
 def handle_event(self, event: BenchmarkEvent,
                  kafka_service: KafkaService):
     """Forward the received event unchanged to self.topic."""
     kafka_service.send_event(event, topic=self.topic)
def test_kafka_service_stop_before_run(simple_kafka_service: KafkaService):
    """Stopping a never-started service raises LoopNotRunningException."""
    assert not simple_kafka_service.running
    expected_msg = re.escape(KafkaService._IS_NOT_RUNNING)
    with pytest.raises(KafkaService.LoopNotRunningException,
                       match=expected_msg):
        simple_kafka_service.stop_loop()
def test_kafka_service_started_twice(simple_kafka_service: KafkaService):
    """Starting the loop while it is already running raises LoopAlreadyRunningException."""
    simple_kafka_service._running = True
    expected_msg = re.escape(KafkaService._LOOP_IS_ALREADY_RUNNING)
    with pytest.raises(KafkaService.LoopAlreadyRunningException,
                       match=expected_msg):
        simple_kafka_service.run_loop()