def main(argv=None):
    """Entry point: submit a SageMaker hyperparameter tuning job, wait for it,
    and publish the best job's details via ``_utils.write_output``."""
    arg_parser = create_parser()
    parsed = arg_parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    sm_client = _utils.get_sagemaker_client(parsed.region)

    logging.info('Submitting HyperParameter Tuning Job request to SageMaker...')
    tuning_job_name = _utils.create_hyperparameter_tuning_job(sm_client, vars(parsed))

    logging.info('HyperParameter Tuning Job request submitted. Waiting for completion...')
    _utils.wait_for_hyperparameter_training_job(sm_client, tuning_job_name)

    top_job, top_hyperparams = _utils.get_best_training_job_and_hyperparameters(
        sm_client, tuning_job_name)
    artifact_url = _utils.get_model_artifacts_from_job(sm_client, top_job)
    training_image = _utils.get_image_from_job(sm_client, top_job)
    logging.info('HyperParameter Tuning Job completed.')

    # Plain-text outputs, emitted in the same order as before.
    for out_path, value in (
        (parsed.hpo_job_name_output_path, tuning_job_name),
        (parsed.model_artifact_url_output_path, artifact_url),
        (parsed.best_job_name_output_path, top_job),
    ):
        _utils.write_output(out_path, value)
    # The winning hyperparameters are a dict, so they are JSON-encoded.
    _utils.write_output(parsed.best_hyperparameters_output_path, top_hyperparams,
                        json_encode=True)
    _utils.write_output(parsed.training_image_output_path, training_image)
def main(argv=None):
    """Entry point: run a SageMaker hyperparameter tuning job and dump the
    results into well-known files under /tmp."""
    cli = create_parser()
    opts = cli.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    sm_client = _utils.get_sagemaker_client(opts.region)

    logging.info('Submitting HyperParameter Tuning Job request to SageMaker...')
    tuning_job = _utils.create_hyperparameter_tuning_job(sm_client, vars(opts))

    logging.info('HyperParameter Tuning Job request submitted. Waiting for completion...')
    _utils.wait_for_hyperparameter_training_job(sm_client, tuning_job)

    winner_job, winner_params = _utils.get_best_training_job_and_hyperparameters(
        sm_client, tuning_job)
    artifact_url = _utils.get_model_artifacts_from_job(sm_client, winner_job)
    training_image = _utils.get_image_from_job(sm_client, winner_job)
    logging.info('HyperParameter Tuning Job completed.')

    # One output file per result, written in the same order as before
    # (dicts preserve insertion order).
    results = {
        '/tmp/hpo_job_name.txt': tuning_job,
        '/tmp/best_job_name.txt': winner_job,
        '/tmp/best_hyperparameters.txt': json.dumps(winner_params),
        '/tmp/model_artifact_url.txt': artifact_url,
        '/tmp/training_image.txt': training_image,
    }
    for out_path, content in results.items():
        with open(out_path, 'w') as out_file:
            out_file.write(content)
def main(argv=None):
    """Entry point: run a SageMaker training job and write its artifacts info
    into files under /tmp."""
    cli = create_parser()
    opts = cli.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    sm_client = _utils.get_sagemaker_client(opts.region, opts.endpoint_url)

    logging.info('Submitting Training Job to SageMaker...')
    training_job = _utils.create_training_job(sm_client, vars(opts))

    logging.info('Job request submitted. Waiting for completion...')
    _utils.wait_for_training_job(sm_client, training_job)

    training_image = _utils.get_image_from_job(sm_client, training_job)
    artifact_url = _utils.get_model_artifacts_from_job(sm_client, training_job)
    logging.info('Get model artifacts %s from training job %s.', artifact_url,
                 training_job)

    # One output file per result, written in the same order as before.
    results = {
        '/tmp/model_artifact_url.txt': artifact_url,
        '/tmp/job_name.txt': training_job,
        '/tmp/training_image.txt': training_image,
    }
    for out_path, content in results.items():
        with open(out_path, 'w') as out_file:
            out_file.write(content)

    logging.info('Job completed.')
def main(argv=None):
    """Entry point: run a SageMaker training job, surface its CloudWatch logs,
    and publish outputs via ``_utils.write_output``.

    Installs a SIGTERM handler so that a pipeline-driven termination of this
    process also stops the remote SageMaker job instead of leaving it running.
    """
    parser = create_parser()
    args = parser.parse_args(argv)
    logging.getLogger().setLevel(logging.INFO)

    client = _utils.get_sagemaker_client(args.region, args.endpoint_url)
    logging.info('Submitting Training Job to SageMaker...')
    job_name = _utils.create_training_job(client, vars(args))

    def signal_term_handler(signalNumber, frame):
        # Stop the remote job when the local process is asked to terminate.
        _utils.stop_training_job(client, job_name)
        logging.info(f"Training Job: {job_name} request submitted to Stop")

    signal.signal(signal.SIGTERM, signal_term_handler)

    logging.info('Job request submitted. Waiting for completion...')
    # FIX: dropped the redundant bare `except: raise` clause. A `try/finally`
    # already re-raises any exception after the cleanup block runs, and a bare
    # `except:` is a known anti-pattern.
    try:
        _utils.wait_for_training_job(client, job_name)
        _utils.wait_for_debug_rules(client, job_name)
    finally:
        # Always print the job's CloudWatch logs, whether it succeeded or not.
        cw_client = _utils.get_cloudwatch_client(args.region)
        _utils.print_logs_for_job(cw_client, '/aws/sagemaker/TrainingJobs',
                                  job_name)

    image = _utils.get_image_from_job(client, job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
    logging.info('Get model artifacts %s from training job %s.',
                 model_artifact_url, job_name)

    _utils.write_output(args.model_artifact_url_output_path, model_artifact_url)
    _utils.write_output(args.job_name_output_path, job_name)
    _utils.write_output(args.training_image_output_path, image)
    logging.info('Job completed.')
def test_get_image_from_defined_job(self):
    """A job that embeds a TrainingImage resolves directly to that image."""
    sm_client = MagicMock()
    sm_client.describe_training_job.return_value = {
        "AlgorithmSpecification": {
            "TrainingImage": "training-image-url"
        }
    }
    resolved_image = _utils.get_image_from_job(sm_client, 'training-job')
    self.assertEqual(resolved_image, "training-image-url")
def test_get_image_from_algorithm_job(self):
    """A job defined via an algorithm name resolves its image through the
    algorithm's TrainingSpecification."""
    sm_client = MagicMock()
    sm_client.describe_hyper_parameter_tuning_job.return_value = {
        "TrainingJobDefinition": {
            "AlgorithmSpecification": {
                "AlgorithmName": "my-algorithm"
            }
        }
    }
    sm_client.describe_algorithm.return_value = {
        "TrainingSpecification": {
            "TrainingImage": "training-image-url"
        }
    }
    resolved_image = _utils.get_image_from_job(sm_client, 'training-job')
    self.assertEqual(resolved_image, "training-image-url")
def main(argv=None):
    """Entry point: run a SageMaker hyperparameter tuning job and publish the
    best job's details via ``_utils.write_output``.

    A SIGTERM handler is installed so that terminating this process also stops
    the remote tuning job.
    """
    cli = create_parser()
    opts = cli.parse_args(argv)
    logging.getLogger().setLevel(logging.INFO)

    sm_client = _utils.get_sagemaker_client(opts.region, opts.endpoint_url,
                                            assume_role_arn=opts.assume_role)

    logging.info('Submitting HyperParameter Tuning Job request to SageMaker...')
    tuning_job_name = _utils.create_hyperparameter_tuning_job(sm_client, vars(opts))

    def _on_sigterm(signum, stack_frame):
        # Stop the remote tuning job when this process is terminated.
        _utils.stop_hyperparameter_tuning_job(sm_client, tuning_job_name)
        logging.info(
            f"HyperParameter Tuning Job: {tuning_job_name} request submitted to Stop"
        )

    signal.signal(signal.SIGTERM, _on_sigterm)

    logging.info('HyperParameter Tuning Job request submitted. Waiting for completion...')
    _utils.wait_for_hyperparameter_training_job(sm_client, tuning_job_name)

    winner_job, winner_params = _utils.get_best_training_job_and_hyperparameters(
        sm_client, tuning_job_name)
    artifact_url = _utils.get_model_artifacts_from_job(sm_client, winner_job)
    training_image = _utils.get_image_from_job(sm_client, winner_job)
    logging.info('HyperParameter Tuning Job completed.')

    _utils.write_output(opts.hpo_job_name_output_path, tuning_job_name)
    _utils.write_output(opts.model_artifact_url_output_path, artifact_url)
    _utils.write_output(opts.best_job_name_output_path, winner_job)
    _utils.write_output(opts.best_hyperparameters_output_path, winner_params,
                        json_encode=True)
    _utils.write_output(opts.training_image_output_path, training_image)
def _create_hpo_parser():
    """Build the argument parser for the hyperparameter tuning job component."""
    parser = argparse.ArgumentParser(
        description='SageMaker Hyperparameter Tuning Job')
    parser.add_argument('--region', type=str.strip, required=True,
                        help='The region where the cluster launches.')
    parser.add_argument('--job_name', type=str.strip, required=False,
                        help='The name of the tuning job. Must be unique within the same AWS account and AWS region.')
    parser.add_argument('--role', type=str.strip, required=True,
                        help='The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.')
    parser.add_argument('--image', type=str.strip, required=False,
                        help='The registry path of the Docker image that contains the training algorithm.',
                        default='')
    parser.add_argument('--algorithm_name', type=str.strip, required=False,
                        help='The name of the resource algorithm to use for the hyperparameter tuning job.',
                        default='')
    parser.add_argument('--training_input_mode', choices=['File', 'Pipe'],
                        type=str.strip, required=False,
                        help='The input mode that the algorithm supports. File or Pipe.',
                        default='File')
    parser.add_argument('--metric_definitions', type=_utils.str_to_json_dict,
                        required=False,
                        help='The dictionary of name-regex pairs specify the metrics that the algorithm emits.',
                        default='{}')
    parser.add_argument('--strategy', choices=['Bayesian', 'Random'],
                        type=str.strip, required=False,
                        help='How hyperparameter tuning chooses the combinations of hyperparameter values to use for the training job it launches.',
                        default='Bayesian')
    parser.add_argument('--metric_name', type=str.strip, required=True,
                        help='The name of the metric to use for the objective metric.')
    parser.add_argument('--metric_type', choices=['Maximize', 'Minimize'],
                        type=str.strip, required=True,
                        help='Whether to minimize or maximize the objective metric.')
    # FIX: help text was copy-pasted from --metric_type; this flag actually
    # controls early stopping of training jobs launched by the tuning job.
    parser.add_argument('--early_stopping_type', choices=['Off', 'Auto'],
                        type=str.strip, required=False,
                        help='Whether to use early stopping for training jobs launched by the hyperparameter tuning job.',
                        default='Off')
    parser.add_argument('--static_parameters', type=_utils.str_to_json_dict,
                        required=False,
                        help='The values of hyperparameters that do not change for the tuning job.',
                        default='{}')
    parser.add_argument('--integer_parameters', type=_utils.str_to_json_list,
                        required=False,
                        help='The array of IntegerParameterRange objects that specify ranges of integer hyperparameters that you want to search.',
                        default='[]')
    parser.add_argument('--continuous_parameters', type=_utils.str_to_json_list,
                        required=False,
                        help='The array of ContinuousParameterRange objects that specify ranges of continuous hyperparameters that you want to search.',
                        default='[]')
    parser.add_argument('--categorical_parameters', type=_utils.str_to_json_list,
                        required=False,
                        help='The array of CategoricalParameterRange objects that specify ranges of categorical hyperparameters that you want to search.',
                        default='[]')
    parser.add_argument('--channels', type=_utils.str_to_json_list, required=True,
                        help='A list of dicts specifying the input channels. Must have at least one.')
    # The eight --data_location_N options are identical except for the channel
    # number, so generate them instead of repeating the call eight times.
    for channel_num in range(1, 9):
        parser.add_argument(f'--data_location_{channel_num}', type=str.strip,
                            required=False,
                            help=f'The S3 URI of the input data source for channel {channel_num}.',
                            default='')
    # FIX: help text said "transform job" (copy-paste from the batch-transform
    # component); this component stores hyperparameter tuning job results.
    parser.add_argument('--output_location', type=str.strip, required=True,
                        help='The Amazon S3 path where you want Amazon SageMaker to store the results of the hyperparameter tuning job.')
    parser.add_argument('--output_encryption_key', type=str.strip, required=False,
                        help='The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.',
                        default='')
    parser.add_argument('--instance_type',
                        choices=[
                            'ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge',
                            'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large',
                            'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
                            'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge',
                            'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge',
                            'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge',
                            'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
                            'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge',
                            'ml.c5.9xlarge', 'ml.c5.18xlarge'
                        ],
                        type=str.strip, required=False,
                        help='The ML compute instance type.',
                        default='ml.m4.xlarge')
    parser.add_argument('--instance_count', type=_utils.str_to_int, required=False,
                        help='The number of ML compute instances to use in each training job.',
                        default=1)
    parser.add_argument('--volume_size', type=_utils.str_to_int, required=False,
                        help='The size of the ML storage volume that you want to provision.',
                        default=1)
    parser.add_argument('--max_num_jobs', type=_utils.str_to_int, required=True,
                        help='The maximum number of training jobs that a hyperparameter tuning job can launch.')
    parser.add_argument('--max_parallel_jobs', type=_utils.str_to_int, required=True,
                        help='The maximum number of concurrent training jobs that a hyperparameter tuning job can launch.')
    parser.add_argument('--max_run_time', type=_utils.str_to_int, required=False,
                        help='The maximum run time in seconds per training job.',
                        default=86400)
    parser.add_argument('--resource_encryption_key', type=str.strip, required=False,
                        help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).',
                        default='')
    parser.add_argument('--vpc_security_group_ids', type=str.strip, required=False,
                        help='The VPC security group IDs, in the form sg-xxxxxxxx.')
    parser.add_argument('--vpc_subnets', type=str.strip, required=False,
                        help='The ID of the subnets in the VPC to which you want to connect your hpo job.')
    parser.add_argument('--network_isolation', type=_utils.str_to_bool,
                        required=False,
                        help='Isolates the training container.',
                        default=True)
    parser.add_argument('--traffic_encryption', type=_utils.str_to_bool,
                        required=False,
                        help='Encrypts all communications between ML compute instances in distributed training.',
                        default=False)
    parser.add_argument('--warm_start_type',
                        choices=['IdenticalDataAndAlgorithm', 'TransferLearning', ''],
                        type=str.strip, required=False,
                        help='Specifies either "IdenticalDataAndAlgorithm" or "TransferLearning"')
    parser.add_argument('--parent_hpo_jobs', type=str.strip, required=False,
                        help='List of previously completed or stopped hyperparameter tuning jobs to be used as a starting point.',
                        default='')
    parser.add_argument('--tags', type=_utils.str_to_json_dict, required=False,
                        help='An array of key-value pairs, to categorize AWS resources.',
                        default='{}')
    return parser


def main(argv=None):
    """Entry point: parse CLI arguments, run a SageMaker hyperparameter tuning
    job, and dump the best job's details into files under /tmp."""
    parser = _create_hpo_parser()
    # FIX: honor the `argv` parameter. The original called parser.parse_args()
    # with no arguments, which silently ignored `argv` and always read
    # sys.argv — main(['--region', ...]) had no effect.
    args = parser.parse_args(argv)
    logging.getLogger().setLevel(logging.INFO)

    client = _utils.get_client(args.region)
    logging.info('Submitting HyperParameter Tuning Job request to SageMaker...')
    hpo_job_name = _utils.create_hyperparameter_tuning_job(client, vars(args))

    logging.info('HyperParameter Tuning Job request submitted. Waiting for completion...')
    _utils.wait_for_hyperparameter_training_job(client, hpo_job_name)

    best_job, best_hyperparameters = _utils.get_best_training_job_and_hyperparameters(
        client, hpo_job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, best_job)
    image = _utils.get_image_from_job(client, best_job)
    logging.info('HyperParameter Tuning Job completed.')

    with open('/tmp/best_job_name.txt', 'w') as f:
        f.write(best_job)
    with open('/tmp/best_hyperparameters.txt', 'w') as f:
        f.write(json.dumps(best_hyperparameters))
    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)
def _create_training_parser():
    """Build the argument parser for the training job component."""
    parser = argparse.ArgumentParser(description='SageMaker Training Job')
    parser.add_argument('--region', type=str.strip, required=True,
                        help='The region where the training job launches.')
    parser.add_argument('--job_name', type=str.strip, required=False,
                        help='The name of the training job.', default='')
    parser.add_argument('--role', type=str.strip, required=True,
                        help='The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.')
    parser.add_argument('--image', type=str.strip, required=True,
                        help='The registry path of the Docker image that contains the training algorithm.',
                        default='')
    parser.add_argument('--algorithm_name', type=str.strip, required=False,
                        help='The name of the resource algorithm to use for the training job.',
                        default='')
    parser.add_argument('--metric_definitions', type=_utils.str_to_json_dict,
                        required=False,
                        help='The dictionary of name-regex pairs specify the metrics that the algorithm emits.',
                        default='{}')
    parser.add_argument('--training_input_mode', choices=['File', 'Pipe'],
                        type=str.strip,
                        help='The input mode that the algorithm supports. File or Pipe.',
                        default='File')
    # FIX: "the the" typo in the help text.
    parser.add_argument('--hyperparameters', type=_utils.str_to_json_dict,
                        help='Dictionary of hyperparameters for the algorithm.',
                        default='{}')
    parser.add_argument('--channels', type=_utils.str_to_json_list, required=True,
                        help='A list of dicts specifying the input channels. Must have at least one.')
    parser.add_argument('--instance_type', required=True,
                        choices=[
                            'ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge',
                            'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large',
                            'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
                            'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge',
                            'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge',
                            'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge',
                            'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
                            'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge',
                            'ml.c5.9xlarge', 'ml.c5.18xlarge'
                        ],
                        type=str.strip,
                        help='The ML compute instance type.',
                        default='ml.m4.xlarge')
    # FIX: help text was copy-pasted from --image; this flag is the
    # instance count.
    parser.add_argument('--instance_count', required=True, type=_utils.str_to_int,
                        help='The number of ML compute instances to use in the training job.',
                        default=1)
    parser.add_argument('--volume_size', type=_utils.str_to_int, required=True,
                        help='The size of the ML storage volume that you want to provision.',
                        default=1)
    parser.add_argument('--resource_encryption_key', type=str.strip, required=False,
                        help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).',
                        default='')
    parser.add_argument('--max_run_time', type=_utils.str_to_int, required=True,
                        help='The maximum run time in seconds for the training job.',
                        default=86400)
    parser.add_argument('--model_artifact_path', type=str.strip, required=True,
                        help='Identifies the S3 path where you want Amazon SageMaker to store the model artifacts.')
    parser.add_argument('--output_encryption_key', type=str.strip, required=False,
                        help='The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.',
                        default='')
    parser.add_argument('--vpc_security_group_ids', type=str.strip, required=False,
                        help='The VPC security group IDs, in the form sg-xxxxxxxx.')
    parser.add_argument('--vpc_subnets', type=str.strip, required=False,
                        help='The ID of the subnets in the VPC to which you want to connect your hpo job.')
    parser.add_argument('--network_isolation', type=_utils.str_to_bool,
                        required=False,
                        help='Isolates the training container.',
                        default=True)
    parser.add_argument('--traffic_encryption', type=_utils.str_to_bool,
                        required=False,
                        help='Encrypts all communications between ML compute instances in distributed training.',
                        default=False)
    parser.add_argument('--tags', type=_utils.str_to_json_dict, required=False,
                        help='An array of key-value pairs, to categorize AWS resources.',
                        default='{}')
    return parser


def main(argv=None):
    """Entry point: parse CLI arguments, run a SageMaker training job, and dump
    the job's details into files under /tmp."""
    parser = _create_training_parser()
    # FIX: honor the `argv` parameter. The original called parser.parse_args()
    # with no arguments, which silently ignored `argv` and always read sys.argv.
    args = parser.parse_args(argv)
    logging.getLogger().setLevel(logging.INFO)

    client = _utils.get_client(args.region)
    logging.info('Submitting Training Job to SageMaker...')
    job_name = _utils.create_training_job(client, vars(args))

    logging.info('Job request submitted. Waiting for completion...')
    _utils.wait_for_training_job(client, job_name)

    image = _utils.get_image_from_job(client, job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
    logging.info('Get model artifacts %s from training job %s.',
                 model_artifact_url, job_name)

    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/job_name.txt', 'w') as f:
        f.write(job_name)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)

    logging.info('Job completed.')