def test_framework_invalid(descriptor_as_adict, descriptor_config):
    descriptor_config.valid_frameworks = ["foo"]
    descriptor_as_adict.ml.framework = "bar"
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(), descriptor_config)

def test_framework_version(descriptor_as_adict):
    descriptor_as_adict.ml.framework = "mxnet"
    descriptor_as_adict.ml.framework_version = "1.0"
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.ml.framework_version == "1.0"

def create_job_yaml_spec(
    descriptor_contents: Dict[str, str],
    executor_config: ExecutorConfig,
    fetched_data_sources: List[DownloadableContent],
    fetched_models: List[DownloadableContent],
    scripts: List[FileSystemObject],
    job_id: str,
    *,
    event: BenchmarkEvent,
    extra_bai_config_args=None,
) -> str:
    """
    Creates the YAML spec corresponding to the descriptor passed as parameter

    :param event: event that triggered this execution
    :param descriptor_contents: dict containing the parsed descriptor
    :param executor_config: configuration for the transpiler
    :param fetched_data_sources: list of fetched data sources, as generated by the fetcher
    :param fetched_models: list of fetched models, as generated by the fetcher
    :param scripts: list of supplied scripts
    :param job_id: id of the benchmark job
    :param extra_bai_config_args: an optional dict which will be forwarded to the `BaiConfig` object created
    :return: YAML string for the given descriptor
    """
    descriptor = BenchmarkDescriptor.from_dict(descriptor_contents, executor_config.descriptor_config)

    bai_k8s_benchmark_job_builder = create_single_run_benchmark_bai_k8s_builder(
        descriptor,
        executor_config.bai_config,
        fetched_data_sources,
        scripts,
        job_id,
        event=event,
        environment_info=executor_config.environment_info,
        extra_bai_config_args=extra_bai_config_args,
    )

    if descriptor.hardware.strategy != DistributedStrategy.INFERENCE:
        return bai_k8s_benchmark_job_builder.dump_yaml_string()

    bai_k8s_inference_server_job_builder = create_inference_server_bai_k8s_builder(
        descriptor,
        executor_config.bai_config,
        job_id,
        fetched_models,
        event=event,
        environment_info=executor_config.environment_info,
        extra_bai_config_args=extra_bai_config_args,
    )

    return (
        f"{bai_k8s_benchmark_job_builder.dump_yaml_string()}"
        f"---\n"
        f"{bai_k8s_inference_server_job_builder.dump_yaml_string()}"
    )

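# Hedged usage sketch (not part of the source): when the strategy is INFERENCE,
# create_job_yaml_spec returns a multi-document YAML string, with the benchmark job
# and the inference server manifests separated by "---". Assuming PyYAML is
# available, a caller could split such a combined spec back into individual
# manifests as shown below; `combined_spec` is an illustrative stand-in for the
# function's return value.
import yaml

combined_spec = (
    "kind: Job\n"
    "metadata:\n"
    "  name: benchmark\n"
    "---\n"
    "kind: Deployment\n"
    "metadata:\n"
    "  name: inference-server\n"
)
manifests = list(yaml.safe_load_all(combined_spec))  # one dict per Kubernetes resource
assert [m["kind"] for m in manifests] == ["Job", "Deployment"]
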
def create_scheduled_job_yaml_spec(
    descriptor_contents: Dict, executor_config: ExecutorConfig, job_id: str, event: BenchmarkEvent
) -> str:
    """
    Creates the YAML spec corresponding to the descriptor passed as parameter

    :param event: event that triggered this execution
    :param descriptor_contents: dict containing the parsed descriptor
    :param executor_config: configuration for the transpiler
    :param job_id: id of the benchmark job
    :return: YAML string for the given descriptor
    """
    descriptor = BenchmarkDescriptor.from_dict(descriptor_contents, executor_config.descriptor_config)
    bai_k8s_builder = create_scheduled_benchmark_bai_k8s_builder(
        descriptor, executor_config.bai_config, job_id, event=event
    )
    return bai_k8s_builder.dump_yaml_string()

def run(self, event: FetcherBenchmarkEvent) -> BenchmarkJob:
    logger.info(f"Processing SageMaker benchmark {event.action_id}")
    try:
        descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents, CONFIG)
    except DescriptorError as e:
        logger.exception("Could not parse descriptor %s", e)
        raise ExecutionEngineException("Cannot process the request") from e

    with tempfile.TemporaryDirectory(prefix=self.config.tmp_sources_dir) as tmpdirname:
        ScriptSourceDirectory.create(descriptor, tmpdirname, event.payload.scripts)

        session = self.session_factory()
        try:
            estimator = self.estimator_factory(session, descriptor, tmpdirname, self.config)
        except Exception as e:
            logger.exception("Could not create estimator %s", e)
            raise ExecutionEngineException("Cannot create estimator") from e

        # Estimate the total size of the fetched data and grow the training volume if needed
        total_size_gb = self._estimate_total_gb(event)
        estimator.train_volume_size = max(estimator.train_volume_size, total_size_gb)

        data = self._get_estimator_data(event)

        try:
            job_name = SageMakerExecutionEngine._get_job_name(event.action_id)
            merge = False
            if descriptor.custom_params and descriptor.custom_params.sagemaker_job_name:
                job_name = descriptor.custom_params.sagemaker_job_name
            if descriptor.custom_params and descriptor.custom_params.merge:
                merge = descriptor.custom_params.merge
            logger.info(f"Attempting to start training job {job_name}")

            if merge:
                estimator.fit(data, wait=True, logs=False, job_name=job_name)
                self.merge_metrics(descriptor)
            else:
                estimator.fit(data, wait=False, logs=False, job_name=job_name)
        except botocore.exceptions.ClientError as err:
            error_message = get_client_error_message(err, default="Unknown")
            raise ExecutionEngineException(
                f"Benchmark creation failed. SageMaker returned error: {error_message}"
            ) from err
        except Exception as err:
            logger.exception("Caught unexpected exception: %s", err)
            raise

        return BenchmarkJob(id=estimator.latest_training_job.name)

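# Hedged sketch (assumption, not the project's actual code): get_client_error_message
# is referenced above but not shown here. It presumably extracts the human-readable
# message from a botocore ClientError response, falling back to a default when the
# message is missing. One possible implementation:
from typing import Optional

import botocore.exceptions


def get_client_error_message(error: botocore.exceptions.ClientError, default: Optional[str] = None) -> Optional[str]:
    # ClientError.response is the parsed error payload, e.g. {"Error": {"Code": ..., "Message": ...}}
    return error.response.get("Error", {}).get("Message", default)
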
def handle_event(self, event: ExecutorBenchmarkEvent, kafka_service: KafkaService):
    job_id = event.payload.job.id
    if job_id in self.watchers:
        # This shouldn't happen, so it is here more as a protection mechanism
        logger.warning("There is already a watcher for job '%s'", job_id)
        return

    descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents)
    if descriptor.hardware.strategy not in [DistributedStrategy.SINGLE_NODE, DistributedStrategy.INFERENCE]:
        logger.info(f"Unsupported strategy {descriptor.hardware.strategy}")
        kafka_service.send_status_message_event(
            event, Status.PENDING, f"'{descriptor.hardware.strategy.value}' strategy is not currently supported."
        )
        return

    logger.info("Starting to watch the job '%s'", job_id)
    watcher_callback = self._make_status_callback(event, kafka_service, not self._is_sage_maker_job(event))

    if self._is_sage_maker_job(event):
        watcher = SageMakerTrainingJobWatcher(
            job_id=job_id, callback=watcher_callback, sagemaker_client=boto3.client("sagemaker")
        )
        kafka_service.send_status_message_event(event, Status.PENDING, "Watching SageMaker benchmark")
    else:
        watcher = KubernetesJobWatcher(
            job_id,
            watcher_callback,
            kubernetes_client_jobs=kubernetes.client.BatchV1Api(),
            kubernetes_client_pods=kubernetes.client.CoreV1Api(),
            kubernetes_namespace=self.config.kubernetes_namespace_of_running_jobs,
        )
        kafka_service.send_status_message_event(event, Status.PENDING, "Watching Kubernetes benchmark")

    self.watchers[job_id] = watcher
    watcher.start()

def descriptor(descriptor_as_dict):
    return BenchmarkDescriptor.from_dict(descriptor_as_dict)

def test_distributed_default(descriptor_as_adict):
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.processes_per_instance == 1

def test_invalid_args_type(descriptor_as_adict):
    descriptor_as_adict.ml.args = 4
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_distributed_gpus(descriptor_as_adict):
    # p3.8xlarge has 4 GPUs, so ONE_PER_GPU should resolve to 4 processes per instance
    descriptor_as_adict.hardware.instance_type = "p3.8xlarge"
    descriptor_as_adict.hardware.distributed.processes_per_instance = ONE_PER_GPU
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.processes_per_instance == 4

def test_invalid_scheduling(descriptor_as_adict, scheduling):
    descriptor_as_adict.info.scheduling = scheduling
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_invalid_custom_labels(descriptor_as_adict, labels):
    descriptor_as_adict.info.labels = labels
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_invalid_strategy(descriptor_as_adict):
    descriptor_as_adict.hardware.strategy = "foo"
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_framework_explicit(descriptor_as_adict):
    descriptor_as_adict.ml.framework = "mxnet"
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.ml.framework == MLFramework.MXNET

def customparams_descriptor(descriptor_config, descriptor_customparams_as_adict):
    return BenchmarkDescriptor.from_dict(descriptor_customparams_as_adict.to_dict(), descriptor_config)

def test_framework_version_no_framework(descriptor_as_adict):
    descriptor_as_adict.ml.framework = ""
    descriptor_as_adict.ml.framework_version = "1.0"
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_distributed_gpus_on_cpu(descriptor_as_adict):
    # t2.small has no GPUs, so ONE_PER_GPU cannot be resolved and must fail validation
    descriptor_as_adict.hardware.instance_type = "t2.small"
    descriptor_as_adict.hardware.distributed.processes_per_instance = ONE_PER_GPU
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_script_file_required(descriptor_as_adict, script_value):
    descriptor_as_adict.ml.script.script = script_value
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

def test_framework_required(descriptor_as_adict, descriptor_config):
    descriptor_config.valid_frameworks = ["foo"]
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(), descriptor_config)

def test_distributed_explicit(descriptor_as_adict):
    descriptor_as_adict.hardware.distributed.processes_per_instance = "4"
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.processes_per_instance == 4

def test_valid_strategy(descriptor_as_adict):
    descriptor_as_adict.hardware.strategy = "horovod"
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.strategy == DistributedStrategy.HOROVOD

def descriptor(descriptor_config, descriptor_as_adict):
    return BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(), descriptor_config)

def test_distributed_num_instances_default(descriptor_as_adict):
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.distributed.num_instances == 2