def test_get_computed_metrics(self):
    """Computed metrics include total wall time and time-to-accuracy."""
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    # Accuracy climbs .2 -> .4 -> .6; the .4 threshold is first hit at t=10.
    accuracy_points = [
        metrics.MetricPoint(metric_value=value, wall_time=wall_time)
        for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
    ]
    raw = {
        'accuracy': accuracy_points,
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        find_memory_metrics=False)
    expected = {
        'total_wall_time': metrics.MetricPoint(metric_value=20, wall_time=20),
        'time_to_accuracy': metrics.MetricPoint(metric_value=10, wall_time=10),
    }
    self.assertEqual(result, expected)
def _test_get_computed_metrics_with_memory_metrics(self):
    """Memory metrics appear alongside the standard computed metrics.

    NOTE(review): the leading underscore keeps this out of test discovery —
    presumably because find_memory_metrics=True hits an external monitoring
    backend; confirm before re-enabling.
    """
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    accuracy_points = [
        metrics.MetricPoint(metric_value=value, wall_time=wall_time)
        for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
    ]
    raw = {
        'accuracy': accuracy_points,
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        project_id=PROJECT_ID,
        job_name=JOB_NAME,
        find_memory_metrics=True)
    self.assertContainsSubset(
        ['total_wall_time', 'time_to_accuracy', 'vm_memory_usage_bytes'],
        result.keys())
def test_get_computed_metrics(self, custom_start):
    """Wall times are offset by the optional Tensorboard start timestamp.

    Args:
      custom_start: Optional start timestamp. When not None, it is injected
        as a `TensorboardStartTimestamp` raw metric and both computed
        durations shrink by that amount; when None, durations are unshifted.
    """
    status = {
        'start_time': 0,
        'stop_time': 20,
    }
    raw = {
        'accuracy': [
            metrics.MetricPoint(metric_value=value, wall_time=wall_time)
            for value, wall_time in ((.2, 0), (.4, 10), (.6, 20))
        ],
        'other_key': [
            metrics.MetricPoint(metric_value=.8, wall_time=0),
        ],
    }
    offset = 0
    if custom_start is not None:
        raw['TensorboardStartTimestamp'] = [
            metrics.MetricPoint(metric_value=custom_start, wall_time=0),
        ]
        offset = custom_start
    result = metrics.get_computed_metrics(
        raw,
        status,
        tta_config={
            'accuracy_tag': 'accuracy',
            'accuracy_threshold': 0.4,
        },
        find_memory_metrics=False)
    expected = {
        'total_wall_time': metrics.MetricPoint(
            metric_value=20 - offset, wall_time=20),
        'time_to_accuracy': metrics.MetricPoint(
            metric_value=10 - offset, wall_time=10),
    }
    self.assertEqual(result, expected)
def _process_pubsub_message(msg, status_handler, logger):
  """Processes one Pubsub message describing a test run.

  Looks up the Kubernetes job's status, computes metrics from the test's
  events dir, optionally runs regression checks, and writes status and
  metrics to BigQuery.

  Args:
    msg: dict. Decoded Pubsub message. Requires 'publish_time', 'model_dir',
      'test_name', 'logs_link', 'job_name', 'zone', and 'cluster_name';
      other keys are optional.
    status_handler: object providing `get_job_status(job_name, namespace)`.
    logger: logger supporting `warning(...)` and `error(..., debug_info=...)`.

  Returns:
    bool. True if the message should be acked, False if it should be left
    on the queue and retried on a later run.

  Raises:
    ValueError: if required message fields are missing, or if both
      `metric_collection_config` and `regression_test_config` are null.
  """
  publish_time = msg['publish_time']
  msg_age_sec = time.time() - publish_time
  # Too-fresh messages are skipped so that metrics have time to flush.
  if msg_age_sec < MIN_MSG_AGE_SEC:
    logger.warning('Message was {} seconds old, which is less than the '
                   'minimum of {}. Skipping for now but will retry on '
                   'the next run.'.format(msg_age_sec, MIN_MSG_AGE_SEC))
    return False  # Do not ack the message.
  events_dir = msg.get('model_dir')
  test_name = msg.get('test_name')
  logs_link = util.add_unbound_time_to_logs_link(msg.get('logs_link', ''))
  metric_collection_config = msg.get('metric_collection_config')
  regression_test_config = msg.get('regression_test_config')
  job_name = msg.get('job_name')
  job_namespace = msg.get('job_namespace')
  test_type = msg.get('test_type')
  accelerator = msg.get('accelerator')
  framework_version = msg.get('framework_version')
  zone = msg.get('zone')
  cluster = msg.get('cluster_name')
  project = google.auth.default()[1]
  download_command = util.download_command(
      job_name, job_namespace, zone, cluster, project)
  workload_link = util.workload_link(
      job_name, job_namespace, zone, cluster, project)
  debug_info = alert_handler.DebugInfo(
      job_name, logs_link, download_command, workload_link)

  if not (events_dir and test_name and logs_link and job_name and zone
          and cluster and project):
    # BUGFIX: this previously formatted with `event`, an undefined name,
    # so the intended ValueError was masked by a NameError.
    raise ValueError('Pubsub message must contain 7 required fields: '
                     'events_dir, test_name, logs_link, job_name, '
                     'zone, cluster, project. Message was: {}'.format(msg))
  if not regression_test_config and not metric_collection_config:
    raise ValueError('metric_collection_config and regression_test_config '
                     'were both null; stopping early. See README for '
                     'documentation on writing these configs.')

  status, stop_time, num_failures = status_handler.get_job_status(
      job_name, job_namespace)
  if status == job_status_handler.UNKNOWN_STATUS:
    logger.warning(
        'Unknown status for job_name: {}. Message will be '
        'retried later.'.format(job_name))
    return False  # Do not ack the message.
  elif status == job_status_handler.DOES_NOT_EXIST:
    # Give a missing job 24 hours to appear before giving up on the message.
    if msg_age_sec >= 60 * 60 * 24:
      logger.warning(
          'Job with job_name: {} no longer exists in Kubernetes. Message '
          'will be acknowledged.'.format(job_name))
      return True  # Ack the message.
    else:
      logger.warning(
          'Job with job_name: {} not found in Kubernetes. Message '
          'will be retried later.'.format(job_name))
      return False  # Do not ack the message.

  job_status = {
      'final_status': status,
      'start_time': publish_time,
      'publish_time': publish_time,
      'stop_time': stop_time,
      'num_failures': num_failures,
  }

  # TODO: pass these in the pubsub message and remove this block.
  if not test_type:
    test_type = 'func' if 'func' in test_name else 'conv'
  if not accelerator:
    accelerator = 'tpu-v2-8' if 'v2-8' in test_name else 'tpu-v3-8'
  if not framework_version:
    framework_version = 'pt-nightly' if 'pt-nightly' in test_name \
        else 'tf-nightly'

  handler = CloudMetricsHandler(
      test_name, events_dir, debug_info, metric_collection_config,
      regression_test_config, test_type, accelerator, framework_version,
      logger)

  # Sometimes pubsub messages get delayed. If we've already processed metrics
  # for a different attempt of this test, we need to see if that attempt came
  # before or after the current attempt.
  existing_row_uuid, existing_row_publish_time = handler.get_existing_row()
  if existing_row_publish_time:
    # If the current message is for an earlier attempt than the existing row,
    # we can stop early since we want to write metrics for the latest attempt.
    # Otherwise, proceed with processing the current message.
    if publish_time <= existing_row_publish_time:
      return True  # Ack the message.

  # Alert for failing jobs unless the user has explicitly added a config
  # that disables alerts for this test.
  if job_status['final_status'] != job_status_handler.SUCCESS and (
      not regression_test_config or regression_test_config.get(
          'alert_for_failed_jobs', True)):
    logger.error(
        'job_status was `{}` for test `{}`'.format(
            job_status['final_status'], test_name),
        debug_info=debug_info)

  raw_metrics, aggregated_metrics = handler.get_metrics_from_events_dir()
  # BUGFIX: metric_collection_config may legitimately be None when only a
  # regression_test_config was supplied (validated above), so guard the
  # .get() to avoid an AttributeError on that path.
  tta_config = (metric_collection_config or {}).get('time_to_accuracy')
  computed_metrics = metrics.get_computed_metrics(
      raw_metrics, job_status, project, job_name, tta_config=tta_config)
  aggregated_metrics.update(computed_metrics)

  if regression_test_config:
    metrics_history = handler.get_metrics_history_from_bigquery()
    metric_name_to_visual_bounds = handler.compute_bounds_and_report_errors(
        metrics_history, aggregated_metrics, job_status['final_status'])
  else:
    metric_name_to_visual_bounds = None

  handler.add_status_and_metrics_to_bigquery(
      job_status, aggregated_metrics, metric_name_to_visual_bounds)
  return True  # Ack the message.