예제 #1
0
def mxnet_training_job(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        s3_prefix = "integ-test-data/mxnet_mnist"
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        s3_source = sagemaker_session.upload_data(
            path=os.path.join(data_path, "sourcedir.tar.gz"),
            key_prefix="{}/src".format(s3_prefix))

        mx = MXNet(
            entry_point="mxnet_mnist/mnist.py",
            source_dir=s3_source,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="{}/train".format(s3_prefix))
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="{}/test".format(s3_prefix))

        mx.fit({"train": train_input, "test": test_input})
        return mx.latest_training_job.name
예제 #2
0
def test_mxnet_distributed(sagemaker_session, ecr_image, instance_type,
                           framework_version):
    data_path = os.path.join(RESOURCE_PATH, 'mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    mx = MXNet(entry_point=script_path,
               role='SageMakerRole',
               train_instance_count=2,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               framework_version=framework_version,
               hyperparameters={'sagemaker_parameter_server_enabled': True})

    prefix = 'mxnet_mnist/{}'.format(sagemaker_timestamp())

    with timeout(minutes=15):
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'), key_prefix=prefix + '/test')

        mx.fit({'train': train_input, 'test': test_input})

    with timeout_and_delete_endpoint(estimator=mx, minutes=30):
        predictor = mx.deploy(initial_instance_count=1,
                              instance_type=instance_type)

        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def mxnet_training_job(
    sagemaker_session,
    cpu_instance_type,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_neo.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})
        return mx.latest_training_job.name
def test_async_fit(sagemaker_session):
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole',
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_input, 'test': test_input}, wait=False)
        training_job_name = mx.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #5
0
def test_mxnet_with_debugger_hook_config(sagemaker_session, mxnet_full_version, cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
예제 #6
0
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]
        debugger_hook_config = DebuggerHookConfig(s3_output_path=os.path.join(
            "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()),
            "tensors"))

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        for index, rule in enumerate(rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)
            assert job_description["DebugRuleConfigurations"][index][
                "VolumeSizeInGB"] == 0
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleParameters"]["rule_to_invoke"] ==
                    rule.rule_parameters["rule_to_invoke"])
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert (job_description["DebugRuleEvaluationStatuses"] ==
                mx.latest_training_job.rule_job_summary())

        _wait_and_assert_that_no_rule_jobs_errored(
            training_job=mx.latest_training_job)
예제 #7
0
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count,
                   framework_version):
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/tmp/ml/output/tensors',
        'epochs': 1
    }

    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               instance_count=instance_count,
               instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_uri=ecr_image,
               framework_version=framework_version,
               hyperparameters=hyperparameters)

    with timeout(minutes=15):
        prefix = 'mxnet_mnist_gluon_basic_hook_demo/{}'.format(
            utils.sagemaker_timestamp())
        train_input = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'train'),
            key_prefix=prefix + '/train')
        test_input = sagemaker_session.upload_data(path=os.path.join(
            DATA_PATH, 'test'),
                                                   key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image')
        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
예제 #8
0
def _test_training(ecr_image, sagemaker_session, instance_type, instance_count,
                   framework_version):
    hyperparameters = {
        'sagemaker_parameter_server_enabled': True
    } if instance_count > 1 else {}
    hyperparameters['epochs'] = 1

    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               instance_count=instance_count,
               instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_uri=ecr_image,
               framework_version=framework_version,
               hyperparameters=hyperparameters)

    mx = _disable_sm_profiler(sagemaker_session.boto_region_name, mx)

    with timeout(minutes=15):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'train'),
            key_prefix=prefix + '/train')
        test_input = sagemaker_session.upload_data(path=os.path.join(
            DATA_PATH, 'test'),
                                                   key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image')
        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
예제 #9
0
def test_async_fit(sagemaker_session, mxnet_full_version):
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', py_version=PYTHON_VERSION,
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session, framework_version=mxnet_full_version,
                   distributions={'parameter_server': {'enabled': True}})

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_input, 'test': test_input}, wait=False)
        training_job_name = mx.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #10
0
def test_mxnet_with_debugger_hook_config_disabled(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=False,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        assert job_description.get("DebugHookConfig") is None
예제 #11
0
def test_git_support_codecommit_with_mxnet(sagemaker_local_session):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "******",
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=MXNet.LATEST_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = [file for file in os.listdir(mx.source_dir)]
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        try:
            client = sagemaker_local_session.sagemaker_client
            desc = client.describe_training_job(TrainingJobName=mx.latest_training_job.name)
            model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
            model = MXNetModel(
                model_data,
                "SageMakerRole",
                entry_point=script_path,
                source_dir=source_dir,
                dependencies=dependencies,
                py_version=PYTHON_VERSION,
                sagemaker_session=sagemaker_local_session,
                framework_version=MXNet.LATEST_VERSION,
                git_config=git_config,
            )
            predictor = model.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
예제 #12
0
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        custom_profiler_report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(CPUBottleneck_threshold=90),
            name="CustomProfilerReportRule")
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=[custom_profiler_report_rule],
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description.get("ProfilingStatus") == "Enabled"
        assert (job_description.get("ProfilerConfig") == ProfilerConfig(
            s3_output_path=mx.output_path,
            system_monitor_interval_millis=500)._to_request_dict())

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
            "CPUBottleneck_threshold": "90",
        }
예제 #13
0
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    with timeout():
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', framework_version=mxnet_full_version,
                   py_version=PYTHON_VERSION, train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        with pytest.raises(ValueError) as e:
            mx.fit()
        assert 'ExecuteUserScriptError' in str(e.value)
예제 #14
0
def test_nlp_training(sagemaker_session, ecr_image, instance_type):

    nlp = MXNet(entry_point=NLP_SCRIPT_PATH,
                role='SageMakerRole',
                train_instance_count=1,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image,
                train_max_run=5 * 60)

    job_name = utils.unique_name_from_base('test-nlp-image')
    nlp.fit(job_name=job_name)
예제 #15
0
def test_dgl_training(sagemaker_session, ecr_image, instance_type):

    dgl = MXNet(entry_point=DGL_SCRIPT_PATH,
                role='SageMakerRole',
                train_instance_count=1,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image)

    with timeout(minutes=15):
        job_name = utils.unique_name_from_base('test-dgl-image')
        dgl.fit(job_name=job_name)
예제 #16
0
def test_mxnet_with_custom_rule_and_actions(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
    actions,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [_get_custom_rule(sagemaker_session, actions)]

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        for index, rule in enumerate(rules):
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleConfigurationName"]
                == rule.name
            )
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleEvaluatorImage"]
                == rule.image_uri
            )
            assert job_description["DebugRuleConfigurations"][index]["VolumeSizeInGB"] == 30

        assert (
            _get_rule_evaluation_statuses(job_description)
            == mx.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
예제 #17
0
def test_private_github(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": PRIVATE_GIT_REPO,
        "branch": PRIVATE_BRANCH,
        "commit": PRIVATE_COMMIT,
        "2FA_enabled": False,
        "username": "******",
        "password": "",  # TODO: find a secure approach
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = [file for file in os.listdir(mx.source_dir)]
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        try:
            serving_script_path = "mnist_hosting_with_custom_handlers.py"
            predictor = mx.deploy(1, "local", entry_point=serving_script_path)

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    hyperparameters = {
        'sagemaker_parameter_server_enabled': True
    } if instance_count > 1 else {}
    hyperparameters['epochs'] = 1

    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               train_instance_count=instance_count,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               hyperparameters=hyperparameters)

    with timeout(minutes=15):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'train'),
            key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, 'test'), key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image')
        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)

    dgl = MXNet(entry_point=DGL_SCRIPT_PATH,
                role='SageMakerRole',
                train_instance_count=1,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image)

    with timeout(minutes=15):
        job_name = utils.unique_name_from_base('test-mxnet-dgl-image')
        dgl.fit(job_name=job_name)
예제 #19
0
def test_async_fit(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_inference_latest_py_version,
    cpu_instance_type,
):
    endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            py_version=mxnet_inference_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            framework_version=mxnet_training_latest_version,
            distribution={"parameter_server": {
                "enabled": True
            }},
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input}, wait=False)
        training_job_name = mx.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" %
              training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = MXNet.attach(training_job_name=training_job_name,
                                 sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1,
                                     cpu_instance_type,
                                     endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data)
        assert result is not None
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', framework_version=mxnet_full_version,
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train-failure')

        with pytest.raises(ValueError) as e:
            mx.fit(train_input)
        assert 'This failure is expected' in str(e.value)
예제 #21
0
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', framework_version=mxnet_full_version,
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train-failure')

        with pytest.raises(ValueError) as e:
            mx.fit(train_input)
        assert 'This failure is expected' in str(e.value)
예제 #22
0
def test_tuning_mxnet(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [{'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]
        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges, metric_definitions,
                                    max_jobs=4, max_parallel_jobs=2)

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/mxnet_mnist/test')
        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #23
0
def test_tuning(sagemaker_session, ecr_image, instance_type):
    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               hyperparameters={'epochs': 1})

    hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
    objective_metric_name = 'Validation-accuracy'
    metric_definitions = [
        {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

    tuner = HyperparameterTuner(mx,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'train'),
                                                       key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'test'),
                                                      key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        tuner.wait()
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', framework_version=mxnet_full_version,
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_input, 'test': test_input})
        return mx.latest_training_job.name
예제 #25
0
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole', framework_version=mxnet_full_version,
                   py_version=PYTHON_VERSION, train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_input, 'test': test_input})
        return mx.latest_training_job.name
예제 #26
0
def mxnet_training_job(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        mx = MXNet(entry_point=script_path, role='SageMakerRole',
                   train_instance_count=1, train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_input, 'test': test_input})
        return mx.latest_training_job.name
def test_attach_deploy(
    mxnet_training_job, sagemaker_session, cpu_instance_type, cpu_instance_family
):
    endpoint_name = unique_name_from_base("test-neo-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)

        estimator.compile_model(
            target_instance_family=cpu_instance_family,
            input_shape={"data": [1, 1, 28, 28]},
            output_path=estimator.output_path,
        )

        serializer = JSONSerializer(content_type="application/vnd+python.numpy+binary")

        predictor = estimator.deploy(
            1,
            cpu_instance_type,
            serializer=serializer,
            use_compiled_model=True,
            endpoint_name=endpoint_name,
        )
        data = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #28
0
def test_codecommit(
    sagemaker_local_session, mxnet_training_latest_version, mxnet_training_latest_py_version
):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    git_config = {
        "repo": CODECOMMIT_REPO,
        "branch": CODECOMMIT_BRANCH,
        "username": "******",
        "password": "",  # TODO: assume a role to get temporary credentials
    }
    source_dir = "mxnet"
    dependencies = ["foo/bar.py"]
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=dependencies,
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    mx.fit(
        {
            "train": "file://" + os.path.join(data_path, "train"),
            "test": "file://" + os.path.join(data_path, "test"),
        }
    )

    files = [file for file in os.listdir(mx.source_dir)]
    assert "some_file" in files
    assert "mnist.py" in files
    assert os.path.exists(mx.dependencies[0])

    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(1, "local")

            data = numpy.zeros(shape=(1, 1, 28, 28))
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
def test_attach_deploy(mxnet_training_job, sagemaker_session):
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #30
0
def test_attach_deploy(mxnet_training_job, sagemaker_session):
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #31
0
def test_tuning_mxnet(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            framework_version=mxnet_training_latest_version,
            sagemaker_session=sagemaker_session,
        )

        hyperparameter_ranges = {
            "learning-rate": ContinuousParameter(0.01, 0.2)
        }
        objective_metric_name = "Validation-accuracy"
        metric_definitions = [{
            "Name": "Validation-accuracy",
            "Regex": "Validation-accuracy=([0-9\\.]+)"
        }]
        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        tuning_job_name = unique_name_from_base("tune-mxnet", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit({
            "train": train_input,
            "test": test_input
        },
                  job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
예제 #32
0
def test_attach_deploy(mxnet_training_job, sagemaker_session, cpu_instance_type):
    endpoint_name = "test-mxnet-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28))
        result = predictor.predict(data)
        assert result is not None
 def create_estimator(self):
     from sagemaker.mxnet.estimator import MXNet
     return MXNet(self.script,
                  role=self.role_name,
                  base_job_name=self.job_name,
                  train_instance_count=self.instance_count,
                  train_instance_type=self.instance_type,
                  hyperparameters=self.hyperparameters,
                  py_version=self.python)
예제 #34
0
def test_jumpstart_mxnet_image_uri(patched_get_model_specs, session):

    patched_get_model_specs.side_effect = get_prototype_model_spec

    model_id, model_version = "mxnet-semseg-fcn-resnet50-ade", "*"
    instance_type = "ml.p2.xlarge"
    region = "us-west-2"

    model_specs = accessors.JumpStartModelsAccessor.get_model_specs(
        region, model_id, model_version)

    # inference
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = MXNetModel(
        role="mock_role",
        model_data="mock_data",
        entry_point="mock_entry_point",
        framework_version=model_specs.hosting_ecr_specs.framework_version,
        py_version=model_specs.hosting_ecr_specs.py_version,
        sagemaker_session=session,
    ).serving_image_uri(region, instance_type)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-inference:1.7.0-gpu-py3"

    # training
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="training",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = MXNet(
        role="mock_role",
        entry_point="mock_entry_point",
        framework_version=model_specs.training_ecr_specs.framework_version,
        py_version=model_specs.training_ecr_specs.py_version,
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=session,
    ).training_image_uri(region=region)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/mxnet-training:1.7.0-gpu-py3"