def test_glue(self):
    # create estimator
    estimator = self.create_estimator()

    # run training
    estimator.fit()

    # result dataframe
    result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

    # extract kpis
    eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
    eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
    # get train time from the SageMaker job; this includes starting, preprocessing and stopping
    train_runtime = (
        Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
    )

    # assert kpis
    assert train_runtime <= self.results["train_runtime"]
    assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
    assert all(t <= self.results["eval_loss"] for t in eval_loss)

    # dump test results into a json file to share in the PR
    with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
        json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
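# Note: TrainingJobAnalytics only returns metrics that the training job declared
# via metric_definitions (regexes applied to the job's CloudWatch log stream).
# A hedged, illustrative sketch of what create_estimator() presumably registers
# so that "eval_accuracy" and "eval_loss" appear in the dataframe above -- the
# exact regexes are an assumption, not taken from this file:
metric_definitions = [
    {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
    {"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
    {"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
]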
def test_script(self, instance_count):
    # create estimator
    estimator = self.create_estimator(instance_count)

    # run training
    estimator.fit()

    # save csv
    self.save_results_as_csv(estimator.latest_training_job.name)

    # result dataframe
    result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

    # extract kpis
    train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
    eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
    eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])

    # assert kpis
    assert all(t <= self.results["train_runtime"] for t in train_runtime)
    assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
    assert all(t <= self.results["eval_loss"] for t in eval_loss)
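# Hypothetical sketch of the create_estimator(instance_count) helper used by the
# tests above: a SageMaker HuggingFace estimator configured for multi-node
# training. The entry point, instance type, role, framework versions,
# hyperparameters and distribution settings are assumptions for illustration,
# not values taken from this test suite.
from sagemaker.huggingface import HuggingFace

def create_estimator(instance_count):
    return HuggingFace(
        entry_point="run_glue.py",            # assumed training script
        instance_type="ml.p3dn.24xlarge",     # assumed instance type
        instance_count=instance_count,
        role="arn:aws:iam::111122223333:role/sagemaker-execution-role",  # placeholder role
        transformers_version="4.6.1",
        pytorch_version="1.7.1",
        py_version="py36",
        hyperparameters={"model_name_or_path": "distilbert-base-uncased", "epochs": 1},
        metric_definitions=metric_definitions,  # e.g. the regex list sketched above
        # enable SageMaker's data-parallel library across the requested nodes
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )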
def save_results_as_csv(self, job_name):
    TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")
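# The exported file contains the same columns as TrainingJobAnalytics.dataframe()
# (timestamp, metric_name, value), so it can be inspected afterwards, e.g. with
# pandas -- the path below is a placeholder, not a value from this suite:
import pandas as pd

print(pd.read_csv("<test_path>/<job_name>_metrics.csv").head())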
class JobMonitorEstimator(JobMonitorBase):

    def __init__(self, sagemaker_client, job_name, logger):
        super(JobMonitorEstimator, self).__init__(sagemaker_client, job_name, logger)
        self._metric_names = None
        self._analytics = None

    def _describe_job(self):
        return self._sagemaker_client.describe_training_job(
            TrainingJobName=self._job_name)

    def _job_status(self, describe_response):
        return describe_response[SMApiConstants.Estimator.JOB_STATUS]

    def _job_start_time(self, describe_response):
        return describe_response.get(SMApiConstants.Estimator.START_TIME)

    def _job_end_time(self, describe_response):
        return describe_response.get(SMApiConstants.Estimator.END_TIME)

    @cached_property
    def _host_metrics_defs(self):
        return [
            JobMonitorBase.MetricMeta('cpuavg_{}', SMApiConstants.METRIC_CPU_UTILIZATION, SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('cpumin_{}', SMApiConstants.METRIC_CPU_UTILIZATION, SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('cpumax_{}', SMApiConstants.METRIC_CPU_UTILIZATION, SMApiConstants.STAT_MAX),
            JobMonitorBase.MetricMeta('memavg_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('memmin_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('memmax_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_MAX),
            # Note: the disk_* entries below reuse METRIC_MEMORY_UTILIZATION; if
            # SMApiConstants defines a dedicated disk-utilization metric, this may
            # be a copy-paste slip worth double-checking.
            JobMonitorBase.MetricMeta('diskavg_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_AVG),
            JobMonitorBase.MetricMeta('diskmin_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_MIN),
            JobMonitorBase.MetricMeta('diskmax_{}', SMApiConstants.METRIC_MEMORY_UTILIZATION, SMApiConstants.STAT_MAX)
        ]

    def _metrics_namespace(self):
        return SMApiConstants.Estimator.NAMESPACE

    def _report_extended_online_metrics(self, describe_response):
        self._report_secondary_transitions(describe_response)
        # No reason to start reading metrics before the job is actually starting
        start_time = self._job_start_time(describe_response)
        if start_time:
            if self._analytics is None:
                self._analytics = TrainingJobAnalytics(
                    training_job_name=self._job_name,
                    metric_names=self._metric_names_for_training_job(),
                    start_time=start_time)
            metrics_df = self._analytics.dataframe(force_refresh=True)
            if not metrics_df.empty:
                for index, row in metrics_df.iterrows():
                    Report.job_metric(
                        row.get(SMApiConstants.Estimator.DF_METRIC_NAME, "Unknown"),
                        row.get(SMApiConstants.Estimator.DF_METRIC_VALUE, 0))

    def _report_secondary_transitions(self, describe_response):
        secondary_transitions = describe_response[
            SMApiConstants.Estimator.SECONDARY_TRANSITIONS]
        rows = []
        for transition in secondary_transitions:
            start_time = transition['StartTime']
            end_time = transition.get('EndTime', datetime.now(pytz.UTC))
            status = transition['Status']
            message = transition['StatusMessage']
            time_span = (end_time - start_time).total_seconds()
            rows.append([
                start_time.strftime("%Y-%m-%d, %H:%M:%S"),
                end_time.strftime("%Y-%m-%d, %H:%M:%S"),
                Report.seconds_fmt(time_span),
                status,
                message
            ])
        if rows:
            Report.job_secondary_transitions(rows)

    def _metric_names_for_training_job(self):
        if self._metric_names is None:
            training_description = self._sagemaker_client.describe_training_job(
                TrainingJobName=self._job_name)
            metric_definitions = training_description[
                SMApiConstants.Estimator.ALGO_SPEC][
                    SMApiConstants.Estimator.METRIC_DEFS]
            self._metric_names = [
                md[SMApiConstants.Estimator.METRIC_DEF_NAME]
                for md in metric_definitions
                if md[SMApiConstants.Estimator.METRIC_DEF_NAME].startswith(
                    SMApiConstants.Estimator.TRAIN_PREFIX)
            ]
        return self._metric_names

    def _report_extended_final_metrics(self, describe_response):
        final_metrics = describe_response.get(
            SMApiConstants.Estimator.FINAL_METRIC_DATA_LIST)
        if final_metrics:
            for metric in final_metrics:
                Report.job_metric(
                    metric.get(SMApiConstants.Estimator.METRIC_NAME, "Unknown"),
                    metric.get(SMApiConstants.Estimator.METRIC_VALUE, 0))
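# Minimal usage sketch for the monitor above (assumptions: a boto3 SageMaker
# client, an existing training job name, and a standard logger; the actual
# polling loop lives in JobMonitorBase, which is not shown here).
import logging

import boto3

sagemaker_client = boto3.client("sagemaker")
monitor = JobMonitorEstimator(sagemaker_client, "my-training-job", logging.getLogger(__name__))

# One describe/report cycle, driven by hand for illustration only:
response = monitor._describe_job()
monitor._report_extended_online_metrics(response)
if monitor._job_end_time(response):
    monitor._report_extended_final_metrics(response)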