Example #1
 @property
 def training_job_analytics(self):
     """Return a ``TrainingJobAnalytics`` object for the current training job."""
     if self._current_job_name is None:
         raise ValueError('Estimator is not associated with a TrainingJob')
     return TrainingJobAnalytics(self._current_job_name,
                                 sagemaker_session=self.sagemaker_session)
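A brief usage sketch: once fit() has run, the property can be read straight off the estimator; constructing the analytics object from the job name, as the property does internally, is shown below as the self-contained equivalent (the job name is a placeholder).

from sagemaker.analytics import TrainingJobAnalytics

# On a fitted estimator the property is read directly:
#     df = estimator.training_job_analytics.dataframe()
# which is equivalent to building the object from the job name yourself:
analytics = TrainingJobAnalytics("my-training-job")
df = analytics.dataframe()   # one row per (timestamp, metric_name, value)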
Example #2
def test_trainer_name():
    describe_training_result = {
        "TrainingStartTime": datetime.datetime(2018, 5, 16, 1, 2, 3),
        "TrainingEndTime": datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    session = create_sagemaker_session(describe_training_result)
    trainer = TrainingJobAnalytics("my-training-job", ["metric"], sagemaker_session=session)
    assert trainer.name == "my-training-job"
    assert str(trainer).find("my-training-job") != -1
Example #3
def test_trainer_dataframe():
    session = create_sagemaker_session(describe_training_result=_describe_training_result(),
                                       metric_stats_results=_metric_stats_results())
    trainer = TrainingJobAnalytics("my-training-job", ["train:acc"], sagemaker_session=session)

    df = trainer.dataframe()
    assert df is not None
    assert len(df) == 3
    assert min(df['value']) == 77.1
    assert max(df['value']) == 97.1

    # Export to CSV and check that file exists
    tmp_name = "/tmp/unit-test-%s.csv" % uuid.uuid4()
    assert not os.path.isfile(tmp_name)
    trainer.export_csv(tmp_name)
    assert os.path.isfile(tmp_name)
    os.unlink(tmp_name)
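The two tests above depend on helpers (create_sagemaker_session, _describe_training_result, _metric_stats_results) defined elsewhere in the test module; Example #5 below inlines the same fixture data through a similarly named sagemaker_session helper. The following is a minimal sketch of what those helpers might look like using unittest.mock, with the exact client wiring treated as an assumption rather than the SDK's actual test code.

import datetime
from unittest.mock import Mock

def _describe_training_result():
    return {
        "TrainingStartTime": datetime.datetime(2018, 5, 16, 1, 2, 3),
        "TrainingEndTime": datetime.datetime(2018, 5, 16, 5, 6, 7),
    }

def _metric_stats_results():
    return {
        "Datapoints": [
            {"Average": 77.1, "Timestamp": datetime.datetime(2018, 5, 16, 1, 3, 3)},
            {"Average": 87.1, "Timestamp": datetime.datetime(2018, 5, 16, 1, 8, 3)},
            {"Average": 97.1, "Timestamp": datetime.datetime(2018, 5, 16, 2, 3, 3)},
        ]
    }

def create_sagemaker_session(describe_training_result=None, metric_stats_results=None):
    # Mock the two clients TrainingJobAnalytics talks to: the SageMaker
    # client (DescribeTrainingJob) and the CloudWatch client (GetMetricStatistics).
    sagemaker_client = Mock(name="sagemaker_client")
    sagemaker_client.describe_training_job.return_value = describe_training_result
    cloudwatch_client = Mock(name="cloudwatch_client")
    cloudwatch_client.get_metric_statistics.return_value = metric_stats_results
    boto_session = Mock(name="boto_session")
    boto_session.client.return_value = cloudwatch_client
    return Mock(name="sagemaker_session",
                sagemaker_client=sagemaker_client,
                boto_session=boto_session)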
Example #4
def test_start_time_end_time_and_period_specified():
    describe_training_result = {
        'TrainingStartTime': datetime.datetime(2018, 5, 16, 1, 2, 3),
        'TrainingEndTime': datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    session = create_sagemaker_session(describe_training_result)
    start_time = datetime.datetime(2018, 5, 16, 1, 3, 4)
    end_time = datetime.datetime(2018, 5, 16, 5, 1, 1)
    period = 300
    trainer = TrainingJobAnalytics('my-training-job', ['metric'],
                                   sagemaker_session=session,
                                   start_time=start_time,
                                   end_time=end_time,
                                   period=period)

    assert trainer._time_interval['start_time'] == start_time
    assert trainer._time_interval['end_time'] == end_time
    assert trainer._period == period
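For orientation, the start_time, end_time, and period above correspond to a CloudWatch GetMetricStatistics query over the job's metrics. The following is a rough, illustrative sketch of that query using the standard namespace and dimension for SageMaker training jobs; it is not the class internals verbatim.

import boto3
import datetime

cloudwatch = boto3.client("cloudwatch")
datapoints = cloudwatch.get_metric_statistics(
    Namespace="/aws/sagemaker/TrainingJobs",
    MetricName="metric",
    Dimensions=[{"Name": "TrainingJobName", "Value": "my-training-job"}],
    StartTime=datetime.datetime(2018, 5, 16, 1, 3, 4),   # start_time above
    EndTime=datetime.datetime(2018, 5, 16, 5, 1, 1),     # end_time above
    Period=300,                                          # period above, in seconds
    Statistics=["Average"],
)["Datapoints"]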
Example #5
def test_trainer_dataframe():
    describe_training_result = {
        'TrainingStartTime': datetime.datetime(2018, 5, 16, 1, 2, 3),
        'TrainingEndTime': datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    metric_stats_results = {
        'Datapoints': [
            {
                'Average': 77.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 1, 3, 3),
            },
            {
                'Average': 87.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 1, 8, 3),
            },
            {
                'Average': 97.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 2, 3, 3),
            },
        ]
    }
    session = sagemaker_session(
        describe_training_result=describe_training_result,
        metric_stats_results=metric_stats_results)
    trainer = TrainingJobAnalytics("my-training-job", ["train:acc"],
                                   sagemaker_session=session)

    df = trainer.dataframe()
    assert df is not None
    assert len(df) == 3
    assert min(df['value']) == 77.1
    assert max(df['value']) == 97.1

    # Export to CSV and check that file exists
    tmp_name = "/tmp/unit-test-%s.csv" % uuid.uuid4()
    assert not os.path.isfile(tmp_name)
    trainer.export_csv(tmp_name)
    assert os.path.isfile(tmp_name)
    os.unlink(tmp_name)
Example #6
def log_sagemaker_job_by_name(sagemaker_job_name,
                              api_key=None,
                              workspace=None,
                              project_name=None):
    # Metadata
    client = _get_boto_client()
    metadata = client.describe_training_job(TrainingJobName=sagemaker_job_name)

    if metadata["TrainingJobStatus"] != "Completed":
        raise ValueError("Not importing %r as it's not completed, status %r" %
                         (sagemaker_job_name, metadata["TrainingJobStatus"]))

    experiment = APIExperiment(
        api_key=api_key,
        workspace=workspace,
        project_name=project_name,
        experiment_name=sagemaker_job_name,
    )
    start_time = metadata["TrainingStartTime"]
    start_time_timestamp = calendar.timegm(start_time.utctimetuple())
    experiment.set_start_time(start_time_timestamp * 1000)
    end_time = metadata.get("TrainingEndTime")
    if end_time:
        experiment.set_end_time(
            calendar.timegm(end_time.utctimetuple()) * 1000)

    for param_name, param_value in metadata["HyperParameters"].items():
        experiment.log_parameter(param_name, param_value)

    other_list = [
        "BillableTimeInSeconds",
        "EnableInterContainerTrafficEncryption",
        "EnableManagedSpotTraining",
        "EnableNetworkIsolation",
        "RoleArn",
        "TrainingJobArn",
        "TrainingJobName",
        "TrainingJobStatus",
        "TrainingTimeInSeconds",
    ]
    for other_name in other_list:
        other_value = metadata.get(other_name)
        if other_value:
            experiment.log_other(other_name, other_value)

    experiment.log_other("TrainingImage",
                         metadata["AlgorithmSpecification"]["TrainingImage"])
    experiment.log_other(
        "TrainingInputMode",
        metadata["AlgorithmSpecification"]["TrainingInputMode"])

    for other_key, other_value in _flatten(metadata.get("ModelArtifacts", {}),
                                           "ModelArtifacts").items():
        experiment.log_other(other_key, other_value)

    for other_key, other_value in _flatten(metadata["OutputDataConfig"],
                                           "OutputDataConfig").items():
        experiment.log_other(other_key, other_value)

    for other_key, other_value in _flatten(metadata["ResourceConfig"],
                                           "ResourceConfig").items():
        experiment.log_other(other_key, other_value)

    for i, _input in enumerate(metadata["InputDataConfig"]):
        for other_key, other_value in _flatten(_input, "InputDataConfig.%d" %
                                               i).items():
            experiment.log_other(other_key, other_value)

    response = client.list_tags(ResourceArn=metadata["TrainingJobArn"])
    for tag_name, tag_value in response["Tags"]:
        experiment.add_tags(["%s:%s" % (tag_name, tag_value)])
    # Metrics
    metrics_dataframe = TrainingJobAnalytics(
        training_job_name=sagemaker_job_name).dataframe()

    for iloc, (timestamp, metric_name, value) in metrics_dataframe.iterrows():
        print("TS", start_time_timestamp + timestamp)
        experiment.log_metric(metric=metric_name,
                              value=value,
                              timestamp=start_time_timestamp + timestamp)

    return experiment
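The function above uses two private helpers, _get_boto_client and _flatten, that are not shown here. Below is a plausible minimal sketch under assumed behavior (the real comet_ml helpers may differ), followed by a usage call with placeholder credentials.

import boto3

def _get_boto_client():
    # Assumption: a plain SageMaker client; the real helper may handle
    # region and credential configuration.
    return boto3.client("sagemaker")

def _flatten(nested, prefix):
    # Assumption: flattens a nested dict into dotted keys, e.g.
    # {"S3OutputPath": "s3://..."} -> {"OutputDataConfig.S3OutputPath": "s3://..."}
    flat = {}
    for key, value in nested.items():
        full_key = "%s.%s" % (prefix, key)
        if isinstance(value, dict):
            flat.update(_flatten(value, full_key))
        else:
            flat[full_key] = value
    return flat

# Usage (job name, workspace, and project are placeholders):
# experiment = log_sagemaker_job_by_name("my-training-job",
#                                        api_key="<comet-api-key>",
#                                        workspace="my-workspace",
#                                        project_name="my-project")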
Example #7
    def _update_model_table_evaluation_states(self):
        """Update the evaluation states in the model table. This method
        will poll the Sagemaker evaluation job and then update
        evaluation job metadata of the model, including:
            eval_state,
            eval_scores

        Args:
            model_record (dict): Current model record in the
                model table
        """

        if self.model_record.eval_in_terminal_state():
            self.model_db_client.update_model_record(self._jsonify())
            return self._jsonify()

        # Try and fetch updated SageMaker Training Job Status
        sm_eval_job_info = {}

        max_describe_retries = 100
        sleep_between_describe_retries = 10

        for i in range(max_describe_retries):
            try:
                sm_eval_job_info = self.sagemaker_client.describe_training_job(
                    TrainingJobName=self.model_record._evaluation_job_name)
            except Exception as e:
                if "ValidationException" in str(e):
                    print(e)
                    if i >= max_describe_retries - 1:
                        # Final DescribeTrainingJob attempt also failed validation
                        logger.warning(
                            "Looks like SageMaker Job was not submitted successfully."
                            f" Failing EvaluationJob {self.model_record._evaluation_job_name}"
                        )
                        self.model_record.update_eval_job_as_failed()
                        self.model_db_client.update_model_eval_as_failed(
                            self._jsonify())
                        return
                    else:
                        time.sleep(sleep_between_describe_retries)
                        continue
                else:
                    # Do not raise exception, most probably throttling.
                    logger.warning(
                        "Failed to check SageMaker Training Job state for EvaluationJob:"
                        f" {self.model_record._evaluation_job_name}. This exception will be ignored"
                        " and retried.")
                    time.sleep(sleep_between_describe_retries)
                    return self._jsonify()

        eval_state = sm_eval_job_info.get("TrainingJobStatus", "Pending")
        if eval_state == "Completed":
            eval_score = "n.a."

            if self.local_mode:
                rgx = re.compile(
                    "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$",
                    re.M)
                eval_score_rgx = rgx.findall(self.log_output)

                if len(eval_score_rgx) == 0:
                    logger.warning("No eval score available from vw job log.")
                else:
                    eval_score = eval_score_rgx[0][0]  # [('eval_score', '')]
            else:
                attempts = 0
                while eval_score == "n.a." and attempts < 4:
                    try:
                        metric_df = TrainingJobAnalytics(
                            self.model_record._evaluation_job_name,
                            ["average_loss"]).dataframe()
                        eval_score = str(metric_df[metric_df["metric_name"] ==
                                                   "average_loss"]["value"][0])
                    except Exception:
                        # Back off briefly to avoid CloudWatch throttling, but still
                        # fall through to count the attempt so the loop cannot spin forever.
                        time.sleep(5)
                    attempts += 1
            self.model_record._eval_state = eval_state
            self.model_record.add_model_eval_scores(eval_score)
            self.model_db_client.update_model_eval_job_state(self._jsonify())
        else:
            # update eval state via ddb client
            self.model_record.update_eval_job_state(eval_state)
            self.model_db_client.update_model_eval_job_state(self._jsonify())
Example #8
 @property
 def training_job_analytics(self):
     """Returns a TrainingJobAnalytics object for the current training job."""
     if self._current_job_name is None:
         raise ValueError('Estimator is not associated with a TrainingJob')
     return TrainingJobAnalytics(self._current_job_name)
Example #9
                    hyperparameters={
                        'epochs': 1,
                        'backend': 'gloo'
                    })

estimator.fit({'training': inputs})

########################################################################
# DO NOT EDIT AFTER THIS LINE
########################################################################
training_job_name = estimator.latest_training_job.name

# Get metric values
metric_names = [metric['Name'] for metric in estimator.metric_definitions]
metrics_dataframe = TrainingJobAnalytics(
    training_job_name=training_job_name,
    metric_names=metric_names).dataframe()

# Report results
rr = ResultReport()
rr.report(estimator.model_data, metrics_dataframe)

# Update leaderboard. Make sure the key name is right
# Use any name if you don't want to use the leaderboard
score_metric = 'test:accuracy'
score_name = 'Test Accuracy'
leaderboard_ascending = False

if score_metric not in metric_names:
    print("leaderboard key name is not correct. No leaderboard support.")
    exit(-1)
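For reference, estimator.metric_definitions used above is supplied when the estimator is constructed. A hypothetical value is sketched below; the metric names and regexes are illustrative only, with test:accuracy matching the leaderboard key used above.

# Hypothetical metric_definitions passed to the estimator's constructor.
# SageMaker scrapes the training job's log stream with these regexes to
# publish the metrics that TrainingJobAnalytics later reads back.
metric_definitions = [
    {"Name": "train:loss", "Regex": "Train Loss: ([0-9\\.]+)"},
    {"Name": "test:accuracy", "Regex": "Test Accuracy: ([0-9\\.]+)"},
]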
Example #10
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + "/test",
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "application/x-parquet",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status != 'Completed' and status != 'Failed':
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

metric_name = 'validation:rmse'

metrics_dataframe = TrainingJobAnalytics(training_job_name=job_name, metric_names=[metric_name]).dataframe()
ax = metrics_dataframe.plot(kind='line', figsize=(12, 5), x='timestamp', y='value', style='b.', legend=False)
ax.set_ylabel(metric_name);
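As a side note, the manual status-polling loop earlier in this example can be replaced with the SageMaker client's built-in waiter; a brief sketch using the same client and job_name as above:

# Alternative to the sleep/poll loop: block until the job reaches a terminal state.
waiter = client.get_waiter('training_job_completed_or_stopped')
waiter.wait(TrainingJobName=job_name)
status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)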