def test_source_dirs(tmpdir, sagemaker_local_session):
    """Train and serve a PyTorch estimator in local mode with an extra dependency file.

    A small ``alexa.py`` module is written into ``tmpdir`` and handed to the
    estimator through ``dependencies``. After training, the model is deployed
    locally and the test asserts that predicting on ``[7]`` returns ``[49]``.
    """
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    # TODO: fails on newer versions of pytorch in call to np.load(BytesIO(stream.read()))
    # "ValueError: Cannot load file containing pickled data when allow_pickle=False"
    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        framework_version="0.4",  # hard-code to last known good pytorch for now (see TODO above)
        py_version="py3",
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        # Deploy before entering the try block: if deploy() raised inside it, the
        # finally clause would reference an unbound `predictor` and the resulting
        # UnboundLocalError would hide the real deployment failure.
        predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
        try:
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            predictor.delete_endpoint()
def test_github(sagemaker_local_session):
    """Train and deploy, in local mode, a PyTorch MNIST script referenced through ``git_config``.

    The test passes when the local endpoint returns a non-``None`` prediction
    for an all-zero float32 array of shape ``(1, 1, 28, 28)``.
    """
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=PYTORCH_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    pytorch.fit(
        {"training": "file://" + os.path.join(data_path, "training", MNIST_FOLDER_NAME)}
    )

    with lock.lock(LOCK_PATH):
        # Deploy outside the try block so that a deployment failure surfaces as
        # itself instead of being replaced by an UnboundLocalError raised from
        # `predictor.delete_endpoint()` in the finally clause.
        predictor = pytorch.deploy(initial_instance_count=1, instance_type="local")
        try:
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
def test_github(
    sagemaker_local_session,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    """Train and deploy, in local mode, a PyTorch MNIST script referenced through ``git_config``.

    After deployment the test sends an all-zero float32 array of shape
    ``(1, 1, 28, 28)`` and asserts that the first result row has 10 entries.
    """
    script_path = "mnist.py"
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    pytorch.fit({"training": "file://" + os.path.join(data_path, "training")})

    with lock.lock(LOCK_PATH):
        # Deploy outside the try block so that a deployment failure surfaces as
        # itself instead of being replaced by an UnboundLocalError raised from
        # `predictor.delete_endpoint()` in the finally clause.
        predictor = pytorch.deploy(initial_instance_count=1, instance_type="local")
        try:
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert 10 == len(result[0])  # check that there is a probability for each label
        finally:
            predictor.delete_endpoint()
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    """Build an Airflow workflow for a PyTorch estimator and check the submitted source in S3.

    The S3 location is read from the ``sagemaker_submit_directory`` hyperparameter
    of the generated training config (with surrounding double quotes stripped).
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        hyperparameters = {"epochs": 6, "backend": "gloo"}
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version=pytorch_inference_latest_version,
            py_version=pytorch_inference_latest_py_version,
            instance_count=2,
            instance_type=cpu_instance_type,
            hyperparameters=hyperparameters,
            sagemaker_session=sagemaker_session,
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type
        )

        quoted_submit_dir = training_config["HyperParameters"]["sagemaker_submit_directory"]
        _assert_that_s3_url_contains_data(sagemaker_session, quoted_submit_dir.strip('"'))
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    """Start training without waiting, re-attach to the job by name, deploy and predict.

    The deployment half is skipped when ``_is_local_mode`` reports local mode for
    the instance type. The prediction output must have shape ``(100, 10)``.
    """
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        trainer = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
        channels = {'training': _upload_training_data(trainer)}
        trainer.fit(channels, wait=False)
        training_job_name = trainer.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if _is_local_mode(instance_type):
        return

    endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = PyTorch.attach(
            training_job_name=training_job_name, sagemaker_session=sagemaker_session
        )
        predictor = attached.deploy(1, instance_type, endpoint_name=endpoint_name)

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(batch)

        assert output.shape == (batch_size, 10)
def _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type='ml.c4.xlarge',
                           entry_point=MNIST_SCRIPT):
    """Return a single-instance ``PyTorch`` estimator configured from the given arguments."""
    estimator_kwargs = {
        'entry_point': entry_point,
        'role': 'SageMakerRole',
        'framework_version': pytorch_full_version,
        'py_version': PYTHON_VERSION,
        'train_instance_count': 1,
        'train_instance_type': instance_type,
        'sagemaker_session': sagemaker_session,
    }
    return PyTorch(**estimator_kwargs)
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    """Launch a training job asynchronously, then attach to it, deploy it and predict.

    Deployment only happens when ``_is_local_mode(instance_type)`` is false; the
    test then asserts that the prediction has shape ``(batch_size, 10)``.
    """
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
        uploaded_training_data = _upload_training_data(pytorch)
        pytorch.fit({'training': uploaded_training_data}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if _is_local_mode(instance_type):
        return

    timestamp = sagemaker_timestamp()
    endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(timestamp)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = PyTorch.attach(training_job_name=training_job_name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, instance_type, endpoint_name=endpoint_name)

        batch_size = 100
        expected_shape = (batch_size, 10)
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)

        assert predictor.predict(data).shape == expected_shape
def test_source_dirs(tmpdir, sagemaker_local_session):
    """Fit and deploy a local PyTorch estimator that carries an extra dependency module.

    The endpoint is expected to answer ``[49]`` for the input ``[7]``; the
    estimator's endpoint is deleted in all cases.
    """
    source_dir = os.path.join(DATA_DIR, 'pytorch_source_dirs')
    helper_module = os.path.join(str(tmpdir), 'alexa.py')

    with open(helper_module, 'w') as handle:
        handle.write('def question(to_anything): return 42')

    estimator_kwargs = dict(
        entry_point='train.py',
        role='SageMakerRole',
        source_dir=source_dir,
        dependencies=[helper_module],
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type='local',
        sagemaker_session=sagemaker_local_session,
    )
    estimator = PyTorch(**estimator_kwargs)

    try:
        estimator.fit()
        predictor = estimator.deploy(initial_instance_count=1, instance_type='local')
        answer = predictor.predict([7])
        assert answer == [49]
    finally:
        estimator.delete_endpoint()
def test_source_dirs(tmpdir, sagemaker_local_session):
    """Fit a local PyTorch estimator with an extra dependency, then deploy it under a lock.

    The endpoint is expected to answer ``[49]`` for the input ``[7]``.
    """
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    helper_module = os.path.join(str(tmpdir), "alexa.py")

    with open(helper_module, "w") as handle:
        handle.write("def question(to_anything): return 42")

    estimator_kwargs = dict(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[helper_module],
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator = PyTorch(**estimator_kwargs)
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
            answer = predictor.predict([7])
            assert answer == [49]
        finally:
            estimator.delete_endpoint()
def test_tuning_step_with_single_algo_tuner(pipeline_session, entry_point):
    """Check the pipeline definition produced for a single-estimator ``TuningStep``.

    Also asserts that ``tuner.fit`` under the pipeline session records exactly one
    ``UserWarning`` mentioning "Running within a PipelineSession", and that
    constructing the ``TuningStep`` records none.
    """
    training_data_uri = f"s3://{pipeline_session.default_bucket()}/training-data"
    inputs = TrainingInput(s3_data=training_data_uri)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges={"batch-size": IntegerParameter(64, 128)},
        metric_definitions=[{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with warnings.catch_warnings(record=True) as caught:
        step_args = tuner.fit(inputs=inputs)
        assert len(caught) == 1
        last_warning = caught[-1]
        assert issubclass(last_warning.category, UserWarning)
        assert "Running within a PipelineSession" in str(last_warning.message)

    with warnings.catch_warnings(record=True) as caught:
        step = TuningStep(name="MyTuningStep", step_args=step_args)
        assert len(caught) == 0

    pipeline = Pipeline(name="MyPipeline", steps=[step], sagemaker_session=pipeline_session)

    expected_step = {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
    first_step = json.loads(pipeline.definition())["Steps"][0]
    assert first_step == expected_step
def test_jumpstart_catboost_image_uri(patched_get_model_specs, session):
    """Compare JumpStart image URIs for the catboost model against the framework classes.

    For both the inference and the training scope, the URI from
    ``image_uris.retrieve`` must equal the one computed by ``PyTorchModel`` /
    ``PyTorch`` and match the expected literal ECR URI.
    """
    patched_get_model_specs.side_effect = get_prototype_model_spec

    model_id = "catboost-classification-model"
    model_version = "*"
    instance_type = "ml.p2.xlarge"
    region = "us-west-2"

    model_specs = accessors.JumpStartModelsAccessor.get_model_specs(
        region, model_id, model_version
    )

    # Arguments shared by both image_uris.retrieve calls below.
    retrieve_kwargs = dict(
        framework=None,
        region=region,
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    # inference
    uri = image_uris.retrieve(image_scope="inference", **retrieve_kwargs)

    hosting_specs = model_specs.hosting_ecr_specs
    hosting_model = PyTorchModel(
        role="mock_role",
        model_data="mock_data",
        entry_point="mock_entry_point",
        framework_version=hosting_specs.framework_version,
        py_version=hosting_specs.py_version,
        sagemaker_session=session,
    )
    framework_class_uri = hosting_model.serving_image_uri(region, instance_type)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.9.0-gpu-py38"

    # training
    uri = image_uris.retrieve(image_scope="training", **retrieve_kwargs)

    training_specs = model_specs.training_ecr_specs
    training_estimator = PyTorch(
        role="mock_role",
        entry_point="mock_entry_point",
        framework_version=training_specs.framework_version,
        py_version=training_specs.py_version,
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=session,
    )
    framework_class_uri = training_estimator.training_image_uri(region=region)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-gpu-py38"
def _get_pytorch_estimator(
    sagemaker_session, pytorch_version, py_version, instance_type, entry_point=MNIST_SCRIPT
):
    """Return a single-instance ``PyTorch`` estimator configured from the given arguments."""
    estimator_kwargs = {
        "entry_point": entry_point,
        "role": "SageMakerRole",
        "framework_version": pytorch_version,
        "py_version": py_version,
        "instance_count": 1,
        "instance_type": instance_type,
        "sagemaker_session": sagemaker_session,
    }
    return PyTorch(**estimator_kwargs)
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
    sagemaker_session, cpu_instance_type
):
    """Build Airflow training/transform operators for a PyTorch estimator and check S3.

    The S3 URL is taken from the ``sagemaker_submit_directory`` hyperparameter of
    the generated training config and verified with
    ``_assert_that_s3_url_contains_data``.
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version="1.1.0",
            train_instance_count=2,
            train_instance_type=cpu_instance_type,
            hyperparameters={"epochs": 6, "backend": "gloo"},
        )

        train_config = sm_airflow.training_config(estimator=estimator)

        quoted_submit_dir = train_config["HyperParameters"]["sagemaker_submit_directory"]
        uploaded_s3_data = quoted_submit_dir.strip('"')

        transform_config = sm_airflow.transform_config_from_estimator(
            estimator=estimator,
            task_id="transform_config",
            task_type="training",
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            data=uploaded_s3_data,
            content_type="text/csv",
        )

        dag = DAG(
            "tensorflow_example",
            default_args={
                "owner": "airflow",
                "start_date": airflow.utils.dates.days_ago(2),
                "provide_context": True,
            },
            schedule_interval="@once",
        )

        # Keyword arguments common to both operators.
        operator_common = dict(wait_for_completion=True, dag=dag)
        train_op = SageMakerTrainingOperator(
            task_id="tf_training", config=train_config, **operator_common
        )
        transform_op = SageMakerTransformOperator(
            task_id="transform_operator", config=transform_config, **operator_common
        )
        transform_op.set_upstream(train_op)

        _assert_that_s3_url_contains_data(sagemaker_session, uploaded_s3_data)
def test_fit_deploy(sagemaker_local_session, pytorch_full_version):
    """Fit and deploy the MNIST script in local mode and check the prediction shape.

    A random float32 batch of 100 inputs shaped ``(1, 28, 28)`` must produce an
    output of shape ``(100, 10)``. The endpoint is deleted afterwards.
    """
    estimator = PyTorch(
        entry_point=MNIST_SCRIPT,
        role="SageMakerRole",
        framework_version=pytorch_full_version,
        py_version="py3",
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )

    training_dir = os.path.join(MNIST_DIR, "training")
    estimator.fit({"training": "file://" + training_dir})

    predictor = estimator.deploy(1, "local")
    try:
        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        prediction = predictor.predict(batch)

        assert prediction.shape == (batch_size, 10)
    finally:
        predictor.delete_endpoint()
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
    """Attach to a finished training job, deploy it and check the prediction shape.

    One all-zero input is sent first, then a random batch of 100 inputs whose
    output must have shape ``(100, 10)``.
    """
    # TODO: add tests against local mode when it's ready to be used
    endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp())

    # The plain `timeout` context used previously never deleted the endpoint, so
    # every run (passing or failing) left a hosted endpoint behind. Use the same
    # helper as the other deploy tests in this file, keeping the 20 minute limit.
    # NOTE(review): assumes the helper accepts a `minutes` keyword like `timeout`
    # does - confirm against its definition.
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(data)

        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session, cpu_instance_type):
    """Attach to a finished training job, deploy it and check the prediction shape.

    One all-zero input is sent first, then a random batch of 100 inputs whose
    output must have shape ``(100, 10)``.
    """
    # TODO: add tests against local mode when it's ready to be used
    timestamp = sagemaker_timestamp()
    endpoint_name = "test-pytorch-sync-fit-attach-deploy{}".format(timestamp)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)

        single_input = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(single_input)

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        prediction = predictor.predict(batch)

        assert prediction.shape == (batch_size, 10)
from ...sm_utils import get_sm_execution_role

# Passed as the first argument to get_sm_execution_role below.
ON_SAGEMAKER_NOTEBOOK = False

# --- session, region, bucket and role setup ---
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # a fixed bucket name could be used here instead
print('Using bucket ' + bucket)
sm_role = get_sm_execution_role(ON_SAGEMAKER_NOTEBOOK, region)

# S3 location handed to the 'train' channel.
trainpath = 's3://sagemaker-ap-southeast-2-454979696062/sagemaker/sklearncontainer/adult.csv'

# Estimator that runs train.py from this file's directory on one ml.c5.xlarge instance.
pytorch_estimator = PyTorch(
    role=sm_role,
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    framework_version='1.5.0',
    train_instance_type='ml.c5.xlarge',
    train_instance_count=1,
    base_job_name='fastai-pytorch',
    hyperparameters={'hidden_layer_1': 200, 'hidden_layer_2': 100},
    metric_definitions=[
        {'Name': 'Dice accuracy', 'Regex': "Dice accuracy: ([0-9.]+).*$"},
    ],
)

# Start the training job; wait=False returns without blocking on completion.
pytorch_estimator.fit({'train': trainpath}, wait=False)
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    """Run a one-step training pipeline and verify debugger and profiler settings on the job.

    The pipeline is created and started, then (after waiting) the single step must
    have succeeded. The resulting training job description is checked for the
    configured debugger rules, the debugger hook config, and profiling being
    enabled with a 500 ms interval. The pipeline is deleted on a best-effort basis.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(
            base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}
        ),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    tensors_uri = f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_uri)

    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    script_path = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    pytorch_estimator = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
    )

    step_train = TrainingStep(name="pytorch-train", estimator=pytorch_estimator, inputs=inputs)

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    try:
        create_arn = pipeline.create(role)["PipelineArn"]

        execution = pipeline.start()
        assert execution.describe()["PipelineArn"] == create_arn

        # A waiter timeout is tolerated here; the step status asserted below decides.
        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1

        train_step = execution_steps[0]
        assert train_step.get("FailureReason", "") == ""
        assert train_step["StepName"] == "pytorch-train"
        assert train_step["StepStatus"] == "Succeeded"

        training_job_arn = train_step["Metadata"]["TrainingJob"]["Arn"]
        training_job_name = training_job_arn.split("/")[1]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_name
        )

        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            expected_rule = rule.rule_parameters["rule_to_invoke"]
            assert config["RuleParameters"]["rule_to_invoke"] == expected_rule

        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert job_description["ProfilingStatus"] == "Enabled"
        profiler_config = job_description["ProfilerConfig"]
        assert profiler_config["ProfilingIntervalInMilliseconds"] == 500
    finally:
        # Best-effort cleanup: a failed delete must not change the test outcome.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_model_registration_with_model_repack(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
):
    """Create and start a conditional pipeline that trains and registers a PyTorch model.

    The condition step runs train + register when ``GoodEnoughInput >= 1`` and a
    create-model step otherwise. The test checks the ARN format returned by
    ``create`` and by two ``start`` calls (default parameters, then
    ``GoodEnoughInput=0``). The pipeline is deleted on a best-effort basis.
    """
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
    step_train = TrainingStep(name="pytorch-train", estimator=pytorch_estimator, inputs=inputs)

    step_register = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["*"],
        response_types=["*"],
        inference_instances=["*"],
        transform_instances=["*"],
        description="test-description",
        entry_point=entry_point,
    )

    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(name="pytorch-model", model=model, inputs=model_inputs)

    good_enough = ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)
    step_cond = ConditionStep(
        name="cond-good-enough",
        conditions=[good_enough],
        if_steps=[step_train, step_register],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[good_enough_input, instance_count, instance_type],
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
    )

    try:
        create_arn = pipeline.create(role)["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
        )

        # Start once with default parameters, then once overriding GoodEnoughInput.
        for start_parameters in ({}, {"GoodEnoughInput": 0}):
            execution = pipeline.start(parameters=start_parameters)
            assert re.match(
                fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
                execution.arn,
            )
    finally:
        # Best-effort cleanup: a failed delete must not change the test outcome.
        try:
            pipeline.delete()
        except Exception:
            pass
# Upload the local test CSV and keep the value returned by upload_data for the 'test' channel.
testpath = sess.upload_data(
    path='boston_test.csv',
    bucket=bucket,
    key_prefix='sagemaker/sklearncontainer',
)

# Estimator that runs train.py from this file's directory on one ml.c5.xlarge instance.
pytorch_estimator = PyTorch(
    role=sm_role,
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    framework_version='1.0.0',
    train_instance_type='ml.c5.xlarge',
    train_instance_count=1,
    base_job_name='dense-pytorch',
    hyperparameters={
        'n-epochs': 1500,
        'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
        'target': 'target',
    },
    metric_definitions=[
        {'Name': 'median-AE', 'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"},
    ],
    tags=[{"Key": "CostCentre", "Value": "SageMaker"}],
)

# Start the training job; wait=False returns without blocking on completion.
pytorch_estimator.fit({'train': trainpath, 'test': testpath}, wait=False)
def test_tuning_step_with_multi_algo_tuner(pipeline_session, entry_point):
    """Check the pipeline definition produced for a two-estimator ``TuningStep``.

    The same ``PyTorch`` estimator object is registered under two names with
    different ``batch-size`` ranges; the first step of the pipeline definition
    must equal the expected name/type/arguments dict.
    """
    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
        hyperparameters={"static-hp": "hp1", "train_size": "1280"},
    )

    estimator_names = ("estimator-1", "estimator-2")

    tuner = HyperparameterTuner.create(
        estimator_dict={name: pytorch_estimator for name in estimator_names},
        objective_metric_name_dict={name: "test:acc" for name in estimator_names},
        hyperparameter_ranges_dict={
            "estimator-1": {"batch-size": IntegerParameter(64, 128)},
            "estimator-2": {"batch-size": IntegerParameter(256, 512)},
        },
        # Each estimator gets its own (equal) list of metric definitions.
        metric_definitions_dict={
            name: [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}]
            for name in estimator_names
        },
    )

    input_path = f"s3://{pipeline_session.default_bucket()}/training-data"
    inputs = {name: TrainingInput(s3_data=input_path) for name in estimator_names}

    step_args = tuner.fit(
        inputs=inputs,
        include_cls_metadata={name: False for name in estimator_names},
    )

    step = TuningStep(name="MyTuningStep", step_args=step_args)
    pipeline = Pipeline(name="MyPipeline", steps=[step], sagemaker_session=pipeline_session)

    expected_step = {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
    first_step = json.loads(pipeline.definition())["Steps"][0]
    assert first_step == expected_step
def test_debug_hook_disabled_with_checkpointing(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Check ``debugger_hook_config`` after ``_prepare_for_training`` for several estimators.

    Expectations asserted below:

    * MXNet, one instance, checkpoint settings given: hook config is not ``None``.
    * MXNet, two instances, checkpoint settings given: hook config is ``False``.
    * PyTorch with smdistributed ``dataparallel`` and checkpoint settings: ``False``.
    * TensorFlow with smdistributed ``modelparallel`` and checkpoint settings: ``False``.
    * XGBoost, two instances, no checkpoint settings: hook config is not ``None``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        s3_output_path = os.path.join(
            "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4())
        )
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(s3_output_path, "tensors")
        )
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")

        # Checkpoint settings reused by every estimator that enables checkpointing.
        checkpoint_kwargs = dict(
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
        )

        # Settings shared by the two MXNet estimators; only instance_count differs.
        mxnet_kwargs = dict(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
            **checkpoint_kwargs
        )

        # MXNet, single instance, checkpointing configured: hook config is kept.
        mx = MXNet(instance_count=1, **mxnet_kwargs)
        mx._prepare_for_training()
        assert mx.debugger_hook_config is not None

        # MXNet, two instances, checkpointing configured: hook config is disabled.
        mx = MXNet(instance_count=2, **mxnet_kwargs)
        mx._prepare_for_training()
        assert mx.debugger_hook_config is False

        # PyTorch with smdistributed dataparallel and checkpointing: hook config is disabled.
        pt = PyTorch(
            base_job_name="pytorch-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.8.0",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge,
            # with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
            **checkpoint_kwargs
        )
        pt._prepare_for_training()
        assert pt.debugger_hook_config is False

        # TensorFlow with smdistributed modelparallel and checkpointing: hook config is disabled.
        tf = TensorFlow(
            base_job_name="tf-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="2.4.1",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge,
            # with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            distribution={"smdistributed": {"modelparallel": {"enabled": True}}},
            **checkpoint_kwargs
        )
        tf._prepare_for_training()
        assert tf.debugger_hook_config is False

        # XGBoost on two instances with no checkpoint settings: hook config is kept.
        xg = XGBoost(
            base_job_name="test_xgboost",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.2-1",
            py_version="py3",
            instance_count=2,
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
        )
        xg._prepare_for_training()
        assert xg.debugger_hook_config is not None
@pytest.mark.parametrize( "estimator", [ SKLearn( framework_version="0.23-1", py_version="py3", instance_type=INSTANCE_TYPE, instance_count=1, role=sagemaker.get_execution_role(), entry_point="entry_point.py", ), PyTorch( role=sagemaker.get_execution_role(), instance_type=INSTANCE_TYPE, instance_count=1, framework_version="1.8.0", py_version="py36", entry_point="entry_point.py", ), TensorFlow( role=sagemaker.get_execution_role(), instance_type=INSTANCE_TYPE, instance_count=1, framework_version="2.0", py_version="py3", entry_point="entry_point.py", ), HuggingFace( transformers_version="4.6", pytorch_version="1.7", role=sagemaker.get_execution_role(),
def my_aws_app(cfg: DictConfig) -> None:
    """Launch a SageMaker PyTorch training job described by a hydra/OmegaConf config.

    The config is converted to a plain container, its path entries are rewritten
    to the in-container locations used below, and the result is written to a
    temporary JSON file inside the source directory so that it is uploaded with
    the script. The temporary file is removed once ``fit`` returns or raises.

    Args:
        cfg: Job configuration. Reads ``cfg.aws.bucket_prefix``,
            ``cfg.aws.root_path`` (falling back to ``cfg.root_path``),
            ``cfg.aws.wait``, ``cfg.aws.role``, ``cfg.aws.instance_count``,
            ``cfg.aws.instance_type`` and ``cfg.experiment_name``, plus the
            ``output_subdir`` and ``data_subdir`` keys.
    """
    # The original working directory is used as the source dir
    # (the original note says the cwd is overridden by hydra).
    script_folder = hydra.utils.get_original_cwd()
    as_dict = OmegaConf.to_container(cfg, resolve=False)

    # Override s3 datapath
    aws_bucket = cfg.aws.bucket_prefix
    try:
        aws_root_path = aws_bucket + cfg.aws.root_path
    except errors.ConfigAttributeError:
        aws_root_path = aws_bucket + cfg.root_path

    # Get the s3 location to load / save to
    aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
    aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

    # Override the job json file with sagemaker local dirs
    as_dict["root_path"] = "/opt/ml/"
    as_dict["data_subdir"] = "input/data/train"
    as_dict["output_subdir"] = "output/data"

    # Set the local dir for tensorboard
    tb_log_dir = "/opt/ml/output/tensorboard/"
    as_dict["tb_log_dir"] = tb_log_dir

    tensorboard_output_config = TensorBoardOutputConfig(
        s3_output_path=aws_out_path,
        container_local_output_path=tb_log_dir,
    )

    print(OmegaConf.to_yaml(cfg))
    print("Overriden Root Path: " + aws_root_path)

    # Save json file to tmp location to be uploaded with script
    tmp_relative_path = "tmp/tmp_job.json"
    tmp_path = script_folder + "/" + tmp_relative_path
    # Make sure the tmp/ directory exists; open(..., "w") does not create it.
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
    with open(tmp_path, "w") as json_file:
        json.dump(as_dict, json_file)

    try:
        wait = cfg.aws.wait
        role = cfg.aws.role
        instance_count = cfg.aws.instance_count
        instance_type = cfg.aws.instance_type

        env = {
            "SAGEMAKER_REQUIREMENTS": "requirements.txt",  # path relative to `source_dir` below.
        }

        # Using Sagemaker prebuilt Pytorch container
        pytorch_estimator = PyTorch(
            entry_point="run.py",
            source_dir=script_folder,
            hyperparameters={"config_file": tmp_relative_path},
            role=role,
            env=env,
            instance_count=instance_count,
            py_version="py3",
            framework_version="1.5.0",
            output_path=aws_out_path,
            base_job_name=cfg.experiment_name,
            instance_type=instance_type,
            tensorboard_output_config=tensorboard_output_config,
        )

        pytorch_estimator.fit({"train": aws_data_path}, wait=wait)
    finally:
        # Remove the temporary job file even when estimator setup or fit() fails,
        # so a failed launch does not leave it in the source directory.
        os.remove(tmp_path)
def exec_training(
    session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot
):
    """Start a SageMaker training job, or a tuning job, from a YAML settings file.

    Hyperparameters whose value is a dict are treated as tuning targets; all
    others are passed to the estimator as fixed values. With no tuning targets a
    plain training job is started, otherwise a ``HyperparameterTuner`` is used.

    Args:
        session: boto session handed to ``sagemaker.Session``.
        client: SageMaker client handed to ``sagemaker.Session``.
        job_name: Name of the training/tuning job; also used as S3 key prefix.
        setting: Path of the YAML settings file. Reads the ``inputs`` and
            ``estimator`` keys and, optionally, ``upload_data`` and ``tuner``.
        pytorch: If truthy a ``PyTorch`` estimator is built, else ``Chainer``.
        max_parallel_jobs: Passed to the tuner as ``max_parallel_jobs``.
        is_spot: If truthy, spot instances are requested and a default
            ``checkpoint_s3_uri`` is filled in when none is configured.

    Raises:
        ValueError: If no ``tuner`` section is configured and a tuning target is
            not of type ``categorical``.
    """
    sagemaker_session = sagemaker.Session(
        boto_session=session, sagemaker_client=client
    )

    # yaml.load() without an explicit Loader is rejected by current PyYAML releases
    # and can construct arbitrary Python objects, so parse with safe_load instead.
    # The context manager also closes the file, which was previously left open.
    # NOTE(review): if a settings file relies on Python-specific YAML tags, use
    # yaml.load(f, Loader=...) with a suitable loader here instead.
    with open(setting) as settings_file:
        conf = yaml.safe_load(settings_file)

    # input data
    inputs = conf["inputs"]
    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"],
                key_prefix=os.path.join(job_name, d["key_prefix"]),
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    # A missing or empty `hyperparameters` section means "no hyperparameters".
    hyperparameters = estimator_args.pop("hyperparameters", None) or {}
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        # dict-valued entries describe a search range, everything else is fixed
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = os.path.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if not targets:
        estimator.fit(inputs, wait=False, job_name=job_name)
        return

    if "tuner" in conf:
        tuner_args = conf["tuner"]
        hyperparameter_ranges = {}
        for k, v in targets.items():
            hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
    else:  # use default values
        tuner_args = {
            "objective_metric_name": "metric_name",
            "metric_definitions": [
                {"Name": "metric_name", "Regex": "ignore"}
            ],
            "strategy": "Random",
            "objective_type": "Maximize",
            "early_stopping_type": "Off",
        }
        # Default to one job per combination of the categorical values.
        max_jobs = 1
        hyperparameter_ranges = {}
        for k, v in targets.items():
            if v["type"].lower() != "categorical":
                raise ValueError(
                    "the default tuner only supports Categorigal params."
                )
            max_jobs *= len(v["range"])
            hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
        tuner_args["max_jobs"] = max_jobs

    tuner_args["estimator"] = estimator
    tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
    tuner_args["max_parallel_jobs"] = max_parallel_jobs
    tuner_args["base_tuning_job_name"] = job_name
    tuner_args["warm_start_config"] = None  # not supported yet.

    tuner = HyperparameterTuner(**tuner_args)
    tuner.fit(inputs, job_name=job_name)
def exec_training(session, client, job_name, setting, pytorch, max_parallel_jobs):
    """Start a SageMaker training job, or a tuning job, from a YAML settings file.

    Hyperparameters whose value is a dict are treated as tuning targets; all
    others are passed to the estimator as fixed values. With no tuning targets a
    plain training job is started, otherwise a ``HyperparameterTuner`` is used.

    Args:
        session: boto session handed to ``sagemaker.Session``.
        client: SageMaker client handed to ``sagemaker.Session``.
        job_name: Name of the training/tuning job; also used as S3 key prefix.
        setting: Path of the YAML settings file. Reads the ``inputs`` and
            ``estimator`` keys and, optionally, ``upload_data`` and ``tuner``.
        pytorch: If truthy a ``PyTorch`` estimator is built, else ``Chainer``.
        max_parallel_jobs: Passed to the tuner as ``max_parallel_jobs``.

    Raises:
        ValueError: If no ``tuner`` section is configured and a tuning target is
            not of type ``categorical``.
    """
    sagemaker_session = sagemaker.Session(boto_session=session, sagemaker_client=client)

    # yaml.load() without an explicit Loader is rejected by current PyYAML releases
    # and can construct arbitrary Python objects, so parse with safe_load instead.
    # The context manager also closes the file, which was previously left open.
    # NOTE(review): if a settings file relies on Python-specific YAML tags, use
    # yaml.load(f, Loader=...) with a suitable loader here instead.
    with open(setting) as settings_file:
        conf = yaml.safe_load(settings_file)

    # input data
    inputs = conf['inputs']
    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(
                path=d['path'],
                key_prefix=os.path.join(job_name, d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    # A missing or empty `hyperparameters` section means "no hyperparameters".
    hyperparameters = estimator_args.pop('hyperparameters', None) or {}
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        # dict-valued entries describe a search range, everything else is fixed
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if not targets:
        estimator.fit(inputs, job_name=job_name)
        return

    if 'tuner' in conf:
        tuner_args = conf['tuner']
        hyperparameter_ranges = {}
        for k, v in targets.items():
            hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
    else:  # use default values
        tuner_args = {
            'objective_metric_name': 'metric_name',
            'metric_definitions': [{
                'Name': 'metric_name',
                'Regex': 'ignore'
            }],
            'strategy': 'Random',
            'objective_type': 'Maximize',
            'early_stopping_type': 'Off'
        }
        # Default to one job per combination of the categorical values.
        max_jobs = 1
        hyperparameter_ranges = {}
        for k, v in targets.items():
            if v['type'].lower() != 'categorical':
                raise ValueError(
                    'the default tuner only supports Categorigal params.')
            max_jobs *= len(v['range'])
            hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
        tuner_args['max_jobs'] = max_jobs

    tuner_args['estimator'] = estimator
    tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
    tuner_args['max_parallel_jobs'] = max_parallel_jobs
    tuner_args['base_tuning_job_name'] = job_name
    tuner_args['warm_start_config'] = None  # not supported yet.

    tuner = HyperparameterTuner(**tuner_args)
    tuner.fit(inputs, job_name=job_name)