def test_get_computed_metrics(self):
    """Computed metrics include total wall time and time-to-accuracy."""
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    # Accuracy climbs .2 -> .4 -> .6; the .4 threshold is first hit at t=10.
    accuracy_points = [
        metrics.MetricPoint(metric_value=value, wall_time=wall_time)
        for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
    ]
    raw = {
        'accuracy': accuracy_points,
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        find_memory_metrics=False)
    expected = {
        'total_wall_time': metrics.MetricPoint(metric_value=20, wall_time=20),
        'time_to_accuracy': metrics.MetricPoint(metric_value=10, wall_time=10),
    }
    self.assertEqual(result, expected)
def _test_get_computed_metrics_with_memory_metrics(self):
    """Memory metrics appear alongside the standard computed metrics.

    NOTE(review): the leading underscore keeps this out of test discovery —
    presumably because find_memory_metrics=True hits an external monitoring
    backend; confirm before re-enabling.
    """
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    accuracy_points = [
        metrics.MetricPoint(metric_value=value, wall_time=wall_time)
        for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
    ]
    raw = {
        'accuracy': accuracy_points,
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        project_id=PROJECT_ID,
        job_name=JOB_NAME,
        find_memory_metrics=True)
    self.assertContainsSubset(
        ['total_wall_time', 'time_to_accuracy', 'vm_memory_usage_bytes'],
        result.keys())
def test_get_computed_metrics(self, custom_start):
    """Wall times are offset by the optional Tensorboard start timestamp.

    Args:
      custom_start: Optional start timestamp. When not None, it is injected
        as a `TensorboardStartTimestamp` raw metric and both computed
        durations shrink by that amount; when None, durations are unshifted.
    """
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    raw = {
        'accuracy': [
            metrics.MetricPoint(metric_value=value, wall_time=wall_time)
            for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
        ],
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    offset = 0
    if custom_start is not None:
        raw['TensorboardStartTimestamp'] = [
            metrics.MetricPoint(metric_value=custom_start, wall_time=0),
        ]
        offset = custom_start
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        find_memory_metrics=False)
    expected = {
        'total_wall_time': metrics.MetricPoint(
            metric_value=20 - offset, wall_time=20),
        'time_to_accuracy': metrics.MetricPoint(
            metric_value=10 - offset, wall_time=10),
    }
    self.assertEqual(result, expected)
def _process_pubsub_message(msg, status_handler, logger):
  """Processes one Pubsub message describing a test run.

  Looks up the Kubernetes job's status, computes metrics from the test's
  events dir, optionally runs regression checks, and writes status and
  metrics to BigQuery.

  Args:
    msg: dict. Decoded Pubsub message. Requires 'publish_time', 'model_dir',
      'test_name', 'logs_link', 'job_name', 'zone', and 'cluster_name';
      other keys are optional.
    status_handler: object providing `get_job_status(job_name, namespace)`.
    logger: logger supporting `warning(...)` and `error(..., debug_info=...)`.

  Returns:
    bool. True if the message should be acked, False if it should be left
    on the queue and retried on a later run.

  Raises:
    ValueError: if required message fields are missing, or if both
      `metric_collection_config` and `regression_test_config` are null.
  """
  publish_time = msg['publish_time']
  msg_age_sec = time.time() - publish_time
  # Too-fresh messages are skipped so that metrics have time to flush.
  if msg_age_sec < MIN_MSG_AGE_SEC:
    logger.warning('Message was {} seconds old, which is less than the '
                   'minimum of {}. Skipping for now but will retry on '
                   'the next run.'.format(msg_age_sec, MIN_MSG_AGE_SEC))
    return False  # Do not ack the message.
  events_dir = msg.get('model_dir')
  test_name = msg.get('test_name')
  logs_link = util.add_unbound_time_to_logs_link(msg.get('logs_link', ''))
  metric_collection_config = msg.get('metric_collection_config')
  regression_test_config = msg.get('regression_test_config')
  job_name = msg.get('job_name')
  job_namespace = msg.get('job_namespace')
  test_type = msg.get('test_type')
  accelerator = msg.get('accelerator')
  framework_version = msg.get('framework_version')
  zone = msg.get('zone')
  cluster = msg.get('cluster_name')
  project = google.auth.default()[1]
  download_command = util.download_command(
      job_name, job_namespace, zone, cluster, project)
  workload_link = util.workload_link(
      job_name, job_namespace, zone, cluster, project)
  debug_info = alert_handler.DebugInfo(
      job_name, logs_link, download_command, workload_link)

  if not (events_dir and test_name and logs_link and job_name and zone
          and cluster and project):
    # BUGFIX: this previously formatted with `event`, an undefined name,
    # so the intended ValueError was masked by a NameError.
    raise ValueError('Pubsub message must contain 7 required fields: '
                     'events_dir, test_name, logs_link, job_name, '
                     'zone, cluster, project. Message was: {}'.format(msg))
  if not regression_test_config and not metric_collection_config:
    raise ValueError('metric_collection_config and regression_test_config '
                     'were both null; stopping early. See README for '
                     'documentation on writing these configs.')

  status, stop_time, num_failures = status_handler.get_job_status(
      job_name, job_namespace)
  if status == job_status_handler.UNKNOWN_STATUS:
    logger.warning(
        'Unknown status for job_name: {}. Message will be '
        'retried later.'.format(job_name))
    return False  # Do not ack the message.
  elif status == job_status_handler.DOES_NOT_EXIST:
    # Give a missing job 24 hours to appear before giving up on the message.
    if msg_age_sec >= 60 * 60 * 24:
      logger.warning(
          'Job with job_name: {} no longer exists in Kubernetes. Message '
          'will be acknowledged.'.format(job_name))
      return True  # Ack the message.
    else:
      logger.warning(
          'Job with job_name: {} not found in Kubernetes. Message '
          'will be retried later.'.format(job_name))
      return False  # Do not ack the message.

  job_status = {
      'final_status': status,
      'start_time': publish_time,
      'publish_time': publish_time,
      'stop_time': stop_time,
      'num_failures': num_failures,
  }

  # TODO: pass these in the pubsub message and remove this block.
  if not test_type:
    test_type = 'func' if 'func' in test_name else 'conv'
  if not accelerator:
    accelerator = 'tpu-v2-8' if 'v2-8' in test_name else 'tpu-v3-8'
  if not framework_version:
    framework_version = 'pt-nightly' if 'pt-nightly' in test_name \
        else 'tf-nightly'

  handler = CloudMetricsHandler(
      test_name, events_dir, debug_info, metric_collection_config,
      regression_test_config, test_type, accelerator, framework_version,
      logger)

  # Sometimes pubsub messages get delayed. If we've already processed metrics
  # for a different attempt of this test, we need to see if that attempt came
  # before or after the current attempt.
  existing_row_uuid, existing_row_publish_time = handler.get_existing_row()
  if existing_row_publish_time:
    # If the current message is for an earlier attempt than the existing row,
    # we can stop early since we want to write metrics for the latest attempt.
    # Otherwise, proceed with processing the current message.
    if publish_time <= existing_row_publish_time:
      return True  # Ack the message.

  # Alert for failing jobs unless the user has explicitly added a config
  # that disables alerts for this test.
  if job_status['final_status'] != job_status_handler.SUCCESS and (
      not regression_test_config or regression_test_config.get(
          'alert_for_failed_jobs', True)):
    logger.error(
        'job_status was `{}` for test `{}`'.format(
            job_status['final_status'], test_name),
        debug_info=debug_info)

  raw_metrics, aggregated_metrics = handler.get_metrics_from_events_dir()
  # BUGFIX: metric_collection_config may legitimately be None when only a
  # regression_test_config was supplied (validated above), so guard the
  # .get() to avoid an AttributeError on that path.
  tta_config = (metric_collection_config or {}).get('time_to_accuracy')
  computed_metrics = metrics.get_computed_metrics(
      raw_metrics, job_status, project, job_name, tta_config=tta_config)
  aggregated_metrics.update(computed_metrics)

  if regression_test_config:
    metrics_history = handler.get_metrics_history_from_bigquery()
    metric_name_to_visual_bounds = handler.compute_bounds_and_report_errors(
        metrics_history, aggregated_metrics, job_status['final_status'])
  else:
    metric_name_to_visual_bounds = None

  handler.add_status_and_metrics_to_bigquery(
      job_status, aggregated_metrics, metric_name_to_visual_bounds)
  return True  # Ack the message.