def test_terminate_trainingjob(kfp_client, experiment_id, region, sagemaker_client):
    """Launch a training pipeline, terminate the KFP run mid-flight, and
    verify the underlying SageMaker training job is being stopped.

    Fixtures: kfp_client (KFP API client), experiment_id (target experiment),
    region (unused here, kept for fixture-signature parity with sibling
    tests), sagemaker_client (boto3 SageMaker client).
    """
    test_file_dir = "resources/config/simple-mnist-training"
    # Join path components properly rather than concatenating with "/".
    download_dir = utils.mkdir(
        os.path.join(test_file_dir, "generated_test_terminate"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))
    # Randomized job name so repeated test runs don't collide in SageMaker.
    input_job_name = test_params["Arguments"]["job_name"] = (
        utils.generate_random_string(4) + "-terminate-job")
    # Wait only until the run reaches "running" (60s budget); the returned
    # workflow JSON is not needed for this test.
    run_id, _, _ = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        60,
        "running",
    )
    print(
        f"Terminating run: {run_id} where Training job_name: {input_job_name}")
    kfp_client_utils.terminate_run(kfp_client, run_id)
    # Terminating the run should propagate a stop request to SageMaker.
    response = sagemaker_utils.describe_training_job(sagemaker_client,
                                                     input_job_name)
    assert response["TrainingJobStatus"] in ["Stopping", "Stopped"]
    utils.remove_dir(download_dir)
def test_trainingjob(kfp_client, experiment_id, region, sagemaker_client,
                     test_file_dir):
    """Run a training pipeline to completion and verify its outputs.

    Checks that (1) the SageMaker training job completed, (2) the reported
    model artifact URL matches SageMaker's record and embeds the job name,
    (3) the training image is the expected one (or at least an ECR image in
    this region), and (4) no error message appeared in the CloudWatch logs.
    """
    # Join path components properly rather than concatenating with "/".
    download_dir = utils.mkdir(os.path.join(test_file_dir, "generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))
    # Run the pipeline to completion; only the workflow JSON is needed to
    # locate the run's artifacts afterwards.
    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )
    outputs = {
        "sagemaker-training-job":
        ["job_name", "model_artifact_url", "training_image"]
    }
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify Training job was successful on SageMaker
    training_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["job_name"])
    print(f"training job name: {training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, training_job_name)
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify model artifacts output was generated from this run
    model_artifact_url = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["model_artifact_url"])
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"][
        "S3ModelArtifacts"]
    assert training_job_name in model_artifact_url

    # Verify training image output is an ECR image
    training_image = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["training_image"])
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    assert not argo_utils.error_in_cw_logs(
        workflow_json["metadata"]["name"]
    ), "Found the CloudWatch error message in the log output. Check SageMaker to see if the job has failed."

    utils.remove_dir(download_dir)
def test_trainingjob(kfp_client, experiment_id, sagemaker_client, test_file_dir):
    """Run a training pipeline to completion and verify its outputs.

    This variant serializes dict-valued pipeline arguments to JSON strings
    before compilation, and reads outputs via ``utils.extract_information``,
    which returns ``bytes`` (hence the ``.decode()`` calls below).
    """
    # Join path components properly rather than concatenating with "/".
    download_dir = utils.mkdir(os.path.join(test_file_dir, "generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))
    # The pipeline expects these structured arguments as JSON strings.
    test_params["Arguments"]["hyperparameters"] = json.dumps(
        test_params["Arguments"]["hyperparameters"])
    test_params["Arguments"]["channels"] = json.dumps(
        test_params["Arguments"]["channels"])
    # Only the workflow JSON is needed to locate the run's artifacts.
    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )
    outputs = {"sagemaker-training-job": ["job_name", "model_artifact_url"]}
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify Training job was successful on SageMaker
    training_job_name = utils.extract_information(
        output_files["sagemaker-training-job"]["job_name"], "job_name.txt")
    print(f"training job name: {training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, training_job_name.decode())
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify model artifacts output was generated from this run
    model_artifact_url = utils.extract_information(
        output_files["sagemaker-training-job"]["model_artifact_url"],
        "model_artifact_url.txt",
    )
    print(f"model_artifact_url: {model_artifact_url}")
    assert (model_artifact_url.decode() == train_response["ModelArtifacts"]
            ["S3ModelArtifacts"])
    # The artifact URL must have been produced by this run's training job.
    # (The original second assertion re-checked the equality above; the
    # sibling test asserts the job name is embedded in the URL instead.)
    assert training_job_name.decode() in model_artifact_url.decode()

    # Clean up the generated directory, consistent with the sibling tests.
    utils.remove_dir(download_dir)
def test_hyperparameter_tuning(kfp_client, experiment_id, region,
                               sagemaker_client, test_file_dir):
    """Run an HPO pipeline to completion and verify its outputs.

    Checks that the HPO job completed, the training image is as expected
    (or an ECR image in this region), the reported best training job really
    belongs to this HPO job and completed, the model artifact URL matches
    SageMaker's record, and the best-hyperparameters output is non-empty.
    """
    # Join path components properly rather than concatenating with "/".
    download_dir = utils.mkdir(os.path.join(test_file_dir, "generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))
    # The pipeline expects these structured arguments as JSON strings.
    test_params["Arguments"]["channels"] = json.dumps(
        test_params["Arguments"]["channels"])
    test_params["Arguments"]["static_parameters"] = json.dumps(
        test_params["Arguments"]["static_parameters"])
    test_params["Arguments"]["integer_parameters"] = json.dumps(
        test_params["Arguments"]["integer_parameters"])
    test_params["Arguments"]["categorical_parameters"] = json.dumps(
        test_params["Arguments"]["categorical_parameters"])
    # Only the workflow JSON is needed to locate the run's artifacts.
    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )
    outputs = {
        "sagemaker-hyperparameter-tuning": [
            "best_hyperparameters",
            "best_job_name",
            "hpo_job_name",
            "model_artifact_url",
            "training_image",
        ]
    }
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify HPO job was successful on SageMaker
    hpo_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["hpo_job_name"],
        "hpo_job_name.txt",
    )
    print(f"HPO job name: {hpo_job_name}")
    hpo_response = sagemaker_utils.describe_hpo_job(sagemaker_client,
                                                    hpo_job_name)
    assert hpo_response["HyperParameterTuningJobStatus"] == "Completed"

    # Verify training image output is an ECR image
    training_image = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["training_image"],
        "training_image.txt",
    )
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    # Verify Training job was part of HPO job, returned as best and was
    # successful
    best_training_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["best_job_name"],
        "best_job_name.txt",
    )
    print(f"best training job name: {best_training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, best_training_job_name)
    assert train_response["TuningJobArn"] == hpo_response[
        "HyperParameterTuningJobArn"]
    assert (train_response["TrainingJobName"] ==
            hpo_response["BestTrainingJob"]["TrainingJobName"])
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify model artifacts output was generated from this run
    model_artifact_url = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["model_artifact_url"],
        "model_artifact_url.txt",
    )
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"][
        "S3ModelArtifacts"]
    assert best_training_job_name in model_artifact_url

    # Verify hyper_parameters output is not empty
    hyper_parameters = json.loads(
        utils.read_from_file_in_tar(
            output_files["sagemaker-hyperparameter-tuning"]
            ["best_hyperparameters"],
            "best_hyperparameters.txt",
        ))
    print(
        f"HPO best hyperparameters: {json.dumps(hyper_parameters, indent=2)}")
    assert hyper_parameters is not None

    utils.remove_dir(download_dir)