Example #1
    def test_glue(self):
        # create estimator
        estimator = self.create_estimator()

        # run training
        estimator.fit()

        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        # extract kpis
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
        # get the train time from the SageMaker job; this includes starting, preprocessing and stopping time
        train_runtime = (
            Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
        )

        # assert kpis
        assert train_runtime <= self.results["train_runtime"]
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)

        # dump the test results into a json file to share in the PR
        with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)

    def test_scripz(self, instance_count):
        # create estimator
        estimator = self.create_estimator(instance_count)

        # run training
        estimator.fit()

        # save csv
        self.save_results_as_csv(estimator.latest_training_job.name)
        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        # extract kpis
        train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])

        # assert kpis
        assert all(t <= self.results["train_runtime"] for t in train_runtime)
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)
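Both tests call a create_estimator() helper that is not shown in this excerpt. Below is a minimal sketch, assuming a generic sagemaker.estimator.Estimator with placeholder image, role, instance type, and hyperparameters; the part that matters for the assertions above is metric_definitions, which is what makes train_runtime, eval_accuracy, and eval_loss appear in the TrainingJobAnalytics dataframe.

# Hypothetical sketch of the create_estimator() helper used by the tests above.
# The image URI, role, instance type and hyperparameters are placeholders; the key
# piece is metric_definitions, whose regexes turn training-log lines into CloudWatch
# metrics that TrainingJobAnalytics later returns as metric_name / value rows.
from sagemaker.estimator import Estimator

def create_estimator(instance_count=1):
    return Estimator(
        image_uri="<training-image-uri>",        # placeholder
        role="<sagemaker-execution-role-arn>",   # placeholder
        instance_count=instance_count,
        instance_type="ml.p3.2xlarge",           # placeholder
        hyperparameters={"epochs": 1},           # placeholder
        metric_definitions=[
            {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
            {"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
            {"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
        ],
    )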
Example #3
    def _report_extended_online_metrics(self, describe_response):
        self._report_secondary_transitions(describe_response)

        # No reason to start reading metrics before the job has actually started
        start_time = self._job_start_time(describe_response)
        if start_time:
            if self._analytics is None:
                self._analytics = TrainingJobAnalytics(
                    training_job_name=self._job_name,
                    metric_names=self._metric_names_for_training_job(),
                    start_time=start_time)

            metrics_df = self._analytics.dataframe(force_refresh=True)
            if not metrics_df.empty:
                for index, row in metrics_df.iterrows():
                    Report.job_metric(
                        row.get(SMApiConstants.Estimator.DF_METRIC_NAME,
                                "Unknown"),
                        row.get(SMApiConstants.Estimator.DF_METRIC_VALUE, 0))
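Stripped of the JobMonitor scaffolding, the live-metrics pattern in this method boils down to a small polling loop. A standalone sketch, assuming only boto3 and the public SageMaker SDK; the job name, metric name, and polling interval are placeholders.

# Standalone sketch of the live-metric polling pattern above, using only boto3 and
# the public SageMaker SDK; the job name and metric name are placeholders.
import time

import boto3
from sagemaker.analytics import TrainingJobAnalytics

sagemaker_client = boto3.client("sagemaker")
job_name = "my-training-job"  # placeholder

analytics = None
while True:
    description = sagemaker_client.describe_training_job(TrainingJobName=job_name)
    start_time = description.get("TrainingStartTime")
    if start_time is not None and analytics is None:
        # only start reading metrics once the job has actually started
        analytics = TrainingJobAnalytics(
            training_job_name=job_name,
            metric_names=["train:loss"],  # placeholder metric name
            start_time=start_time,
        )
    if analytics is not None:
        metrics_df = analytics.dataframe(force_refresh=True)
        for _, row in metrics_df.iterrows():
            print(row["metric_name"], row["value"])
    if description["TrainingJobStatus"] not in ("InProgress", "Stopping"):
        break
    time.sleep(60)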
Example #4
    def save_results_as_csv(self, job_name):
        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")
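For reference, export_csv writes out the same timestamp / metric_name / value rows that dataframe() returns, so the file can be read back with pandas. A minimal usage sketch with placeholder names follows.

# Minimal usage sketch for export_csv; the job name and output path are placeholders.
import pandas as pd
from sagemaker.analytics import TrainingJobAnalytics

job_name = "my-training-job"  # placeholder
TrainingJobAnalytics(job_name).export_csv(f"/tmp/{job_name}_metrics.csv")
metrics_df = pd.read_csv(f"/tmp/{job_name}_metrics.csv")
print(metrics_df.head())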
Example #5
class JobMonitorEstimator(JobMonitorBase):
    def __init__(self, sagemaker_client, job_name, logger):
        super(JobMonitorEstimator, self).__init__(sagemaker_client, job_name, logger)

        self._metric_names = None
        self._analytics = None

    def _describe_job(self):
        return self._sagemaker_client.describe_training_job(
            TrainingJobName=self._job_name)

    def _job_status(self, describe_response):
        return describe_response[SMApiConstants.Estimator.JOB_STATUS]

    def _job_start_time(self, describe_response):
        return describe_response.get(SMApiConstants.Estimator.START_TIME)

    def _job_end_time(self, describe_response):
        return describe_response.get(SMApiConstants.Estimator.END_TIME)

    @cached_property
    def _host_metrics_defs(self):
        return [
            JobMonitorBase.MetricMeta('cpuavg_{}',
                                      SMApiConstants.METRIC_CPU_UTILIZATION,
                                      SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('cpumin_{}',
                                      SMApiConstants.METRIC_CPU_UTILIZATION,
                                      SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('cpumax_{}',
                                      SMApiConstants.METRIC_CPU_UTILIZATION,
                                      SMApiConstants.STAT_MAX),
            JobMonitorBase.MetricMeta('memavg_{}',
                                      SMApiConstants.METRIC_MEMORY_UTILIZATION,
                                      SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('memmin_{}',
                                      SMApiConstants.METRIC_MEMORY_UTILIZATION,
                                      SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('memmax_{}',
                                      SMApiConstants.METRIC_MEMORY_UTILIZATION,
                                      SMApiConstants.STAT_MAX),
            # assumes SMApiConstants defines METRIC_DISK_UTILIZATION ("DiskUtilization")
            # for the disk metrics
            JobMonitorBase.MetricMeta('diskavg_{}',
                                      SMApiConstants.METRIC_DISK_UTILIZATION,
                                      SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('diskmin_{}',
                                      SMApiConstants.METRIC_DISK_UTILIZATION,
                                      SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('diskmax_{}',
                                      SMApiConstants.METRIC_DISK_UTILIZATION,
                                      SMApiConstants.STAT_MAX)
        ]

    def _metrics_namespace(self):
        return SMApiConstants.Estimator.NAMESPACE

    def _report_extended_online_metrics(self, describe_response):
        self._report_secondary_transitions(describe_response)

        # No reason to start reading metrics before the job has actually started
        start_time = self._job_start_time(describe_response)
        if start_time:
            if self._analytics is None:
                self._analytics = TrainingJobAnalytics(
                    training_job_name=self._job_name,
                    metric_names=self._metric_names_for_training_job(),
                    start_time=start_time)

            metrics_df = self._analytics.dataframe(force_refresh=True)
            if not metrics_df.empty:
                for index, row in metrics_df.iterrows():
                    Report.job_metric(
                        row.get(SMApiConstants.Estimator.DF_METRIC_NAME,
                                "Unknown"),
                        row.get(SMApiConstants.Estimator.DF_METRIC_VALUE, 0))

    def _report_secondary_transitions(self, describe_response):
        secondary_transitions = describe_response[
            SMApiConstants.Estimator.SECONDARY_TRANSITIONS]

        rows = []
        for transition in secondary_transitions:
            start_time = transition['StartTime']
            end_time = transition.get('EndTime', datetime.now(pytz.UTC))
            status = transition['Status']
            message = transition['StatusMessage']

            time_span = (end_time - start_time).total_seconds()

            rows.append([
                start_time.strftime("%Y-%m-%d, %H:%M:%S"),
                end_time.strftime("%Y-%m-%d, %H:%M:%S"),
                Report.seconds_fmt(time_span), status, message
            ])

        if rows:
            Report.job_secondary_transitions(rows)

    def _metric_names_for_training_job(self):
        if self._metric_names is None:
            training_description = self._sagemaker_client.describe_training_job(
                TrainingJobName=self._job_name)

            metric_definitions = training_description[
                SMApiConstants.Estimator.ALGO_SPEC][
                    SMApiConstants.Estimator.METRIC_DEFS]
            self._metric_names = [
                md[SMApiConstants.Estimator.METRIC_DEF_NAME]
                for md in metric_definitions
                if md[SMApiConstants.Estimator.METRIC_DEF_NAME].startswith(
                    SMApiConstants.Estimator.TRAIN_PREFIX)
            ]

        return self._metric_names

    def _report_extended_final_metrics(self, describe_response):
        final_metrics = describe_response.get(
            SMApiConstants.Estimator.FINAL_METRIC_DATA_LIST)
        if final_metrics:
            for metric in final_metrics:
                Report.job_metric(
                    metric.get(SMApiConstants.Estimator.METRIC_NAME,
                               "Unknown"),
                    metric.get(SMApiConstants.Estimator.METRIC_VALUE, 0))
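SMApiConstants is internal to the project this class comes from and is not part of the excerpt. Below is a plausible sketch of the values it would need, based on the real DescribeTrainingJob response keys, the /aws/sagemaker/TrainingJobs CloudWatch namespace, and the column names TrainingJobAnalytics.dataframe() returns; the attribute names are taken from the code above, while the string values are assumptions.

# Plausible sketch of the SMApiConstants values used above. The strings mirror the
# real DescribeTrainingJob response keys, the SageMaker training-job CloudWatch
# namespace/metrics, and the TrainingJobAnalytics dataframe columns; the actual
# project constants may differ.
class SMApiConstants:
    METRIC_CPU_UTILIZATION = "CPUUtilization"
    METRIC_MEMORY_UTILIZATION = "MemoryUtilization"
    METRIC_DISK_UTILIZATION = "DiskUtilization"  # assumed, for the disk metrics above
    STAT_AVG = "Average"
    STAT_MIN = "Minimum"
    STAT_MAX = "Maximum"

    class Estimator:
        NAMESPACE = "/aws/sagemaker/TrainingJobs"
        JOB_STATUS = "TrainingJobStatus"
        START_TIME = "TrainingStartTime"
        END_TIME = "TrainingEndTime"
        SECONDARY_TRANSITIONS = "SecondaryStatusTransitions"
        ALGO_SPEC = "AlgorithmSpecification"
        METRIC_DEFS = "MetricDefinitions"
        METRIC_DEF_NAME = "Name"
        TRAIN_PREFIX = "train:"  # assumed filter for training metrics, e.g. "train:loss"
        FINAL_METRIC_DATA_LIST = "FinalMetricDataList"
        METRIC_NAME = "MetricName"
        METRIC_VALUE = "Value"
        DF_METRIC_NAME = "metric_name"
        DF_METRIC_VALUE = "value"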