def test_assert_duration(self):
  metric_source = metrics_pb2.MetricSource(
      literals=metrics_pb2.LiteralSource(
          assertions={
              "duration":
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=100,
                          upper_bound=200,
                      ),
                      inclusive_bounds=False,
                  )
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      duration=duration_pb2.Duration(seconds=150),
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = literal_collector.LiteralCollector(
      event=event, raw_source=metric_source)
  points = collector.metric_points()

  self.assertLen(points, 1)
  self.assertEqual(points[0].metric_key, 'duration')
  self.assertEqual(points[0].metric_value, 150)
  self.assertEqual(points[0].bounds, utils.Bounds(100, 200, False))
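
# For reference, a minimal sketch of the exclusive-bounds semantics the test
# above relies on. `check_within_bounds` is a hypothetical helper, not part of
# the collectors' API; it only illustrates how `inclusive_bounds` changes the
# comparison on the interval edges.
def check_within_bounds(value: float, lower: float, upper: float,
                        inclusive: bool) -> bool:
  """Returns True if value lies in [lower, upper] or (lower, upper)."""
  if inclusive:
    return lower <= value <= upper
  return lower < value < upper

# check_within_bounds(150, 100, 200, inclusive=False) -> True
# check_within_bounds(200, 100, 200, inclusive=False) -> False (edge excluded)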
def test_create_test_completed_event(self, succeeded_count, failed_count,
                                     conditions, expected_status):
  job = _job_from_dict({
      'metadata': {
          'name': 'job-name',
          'namespace': 'namespace',
          'labels': {
              'benchmarkId': 'test-job',
          },
      },
      'status': {
          'startTime': _START_TIME,
          'succeeded': succeeded_count,
          'failed': failed_count,
          'conditions': [
              {
                  'status': True,
                  'reason': reason,
                  'type': cond_type,
                  'lastTransitionTime': _END_TIME,
              } for cond_type, reason in conditions
          ]
      }
  })
  actual_event = event_publisher.create_test_completed_event(
      job,
      model_output_bucket='gs://fake-bucket',
      cluster_name='cluster-name',
      cluster_location='cluster-location',
      project='project-id')

  start_time = timestamp_pb2.Timestamp()
  start_time.FromDatetime(_START_TIME)
  duration = duration_pb2.Duration()
  duration.FromTimedelta(_END_TIME - _START_TIME)
  expected_event = metrics_pb2.TestCompletedEvent(
      benchmark_id='test-job',
      output_path='gs://fake-bucket/job-name',
      status=metrics_pb2.TestCompletedEvent.TestStatus.Value(expected_status),
      num_attempts=succeeded_count + failed_count,
      start_time=start_time,
      duration=duration,
      labels={'benchmarkId': 'test-job'},
      debug_info=metrics_pb2.DebugInfo(
          logs_link='https://console.cloud.google.com/logs?project=project-id&advancedFilter=resource.type%3Dk8s_container%0Aresource.labels.project_id%3Dproject-id%0Aresource.labels.cluster_name%3Dcluster-name%0Aresource.labels.namespace_name%3Dnamespace%0Aresource.labels.pod_name%3Ajob-name%0Aresource.labels.location%3Acluster-location%0A',
          details_link='https://console.cloud.google.com/kubernetes/job/cluster-location/cluster-name/namespace/job-name?project=project-id'),
      metric_collection_config=metrics_pb2.MetricCollectionConfig(),
  )

  self.assertProtoEqual(expected_event, actual_event)
def test_aggregate_metrics_include_all_strategies(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(include_tags=[
          metrics_pb2.TensorBoardSource.TagStrategy(
              tag_pattern="*",
              strategies=[
                  metrics_pb2.TensorBoardSource.FINAL,
                  metrics_pb2.TensorBoardSource.MAX,
                  metrics_pb2.TensorBoardSource.MIN,
                  metrics_pb2.TensorBoardSource.AVERAGE,
                  metrics_pb2.TensorBoardSource.MEDIAN,
              ])
      ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())

  metric_to_value = {key: value for key, value, _ in points}
  self.assertDictEqual(
      metric_to_value, {
          'foo_final': 2,
          'foo_min': 1,
          'foo_max': 2,
          'foo_average': 1.5,
          'foo_median': 1.5,
          'eval/accuracy_final': .25,
          'eval/accuracy_min': .125,
          'eval/accuracy_max': .5,
          'eval/accuracy_average': np.mean([.125, .25, .5]),
          'eval/accuracy_median': np.median([.125, .25, .5]),
          'train/bar_final': 100,
          'train/bar_min': 10,
          'train/bar_max': 100,
          'train/bar_average': np.mean([10, 100, 100]),
          'train/bar_median': np.median([10, 100, 100]),
      })
  for _, _, bounds in points:
    self.assertEqual(bounds, utils.NO_BOUNDS)
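
# A minimal sketch of the naming convention the assertions above assume: each
# (tag, strategy) pair yields one point keyed `<tag>_<strategy>`. `aggregate`
# is a hypothetical helper for illustration, not the collector's actual code.
import numpy as np

_STRATEGY_FNS = {
    'final': lambda values: values[-1],  # last value written for the tag
    'max': max,
    'min': min,
    'average': np.mean,
    'median': np.median,
}

def aggregate(tag: str, values: list, strategy: str):
  """Returns (metric_key, aggregated_value) for one tag/strategy pair."""
  return f'{tag}_{strategy}', _STRATEGY_FNS[strategy](values)

# aggregate('eval/accuracy', [.125, .25, .5], 'max')
#     -> ('eval/accuracy_max', .5)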
def test_aggregate_metrics_with_assertion(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="eval/*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                      metrics_pb2.TensorBoardSource.MAX,
                      metrics_pb2.TensorBoardSource.MIN,
                  ])
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='eval/accuracy',
                  strategy=metrics_pb2.TensorBoardSource.MAX,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=.4,
                          upper_bound=1.0,
                      ),
                      inclusive_bounds=True,
                  ))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())

  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_max', .5,
                            utils.Bounds(.4, 1.0, True)),
          utils.MetricPoint('eval/accuracy_min', .125, utils.NO_BOUNDS),
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
      ],
  )
def test_include_and_exclude(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                  ])
          ],
          exclude_tags=[
              'foo',
              'train/*',
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='foo',
                  strategy=metrics_pb2.TensorBoardSource.MIN,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=0.,
                          upper_bound=2.,
                      )))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())

  # `foo` and `train/*` are excluded from include_tags matching, but an
  # aggregate assertion on an excluded tag still emits its point (`foo_min`).
  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
          utils.MetricPoint('foo_min', 1, utils.Bounds(0., 2., False)),
      ],
  )
def test_metric_collection_config(self, gcs_subdir):
  job = _job_from_dict({
      'metadata': {
          'name': 'job-name',
          'namespace': 'namespace',
          'labels': {
              'benchmarkId': 'test-job',
          },
          'annotations': {
              'ml-testing-accelerators/metric-config':
                  json.dumps({
                      'sources': [{
                          'literals': {
                              'assertions': {
                                  'duration': {
                                      'within_bounds': {
                                          'lower_bound': 1,
                                          'upper_bound': 2,
                                      }
                                  }
                              }
                          }
                      }]
                  })
          }
      },
      'status': {
          'startTime': _START_TIME,
          'completionTime': _END_TIME,
          'succeeded': 1,
          'conditions': [{
              'status': True,
              'type': 'Complete',
              'lastTransitionTime': _END_TIME,
          }]
      }
  })
  if gcs_subdir:
    job.metadata.annotations['ml-testing-accelerators/gcs-subdir'] = gcs_subdir
  actual_event = event_publisher.create_test_completed_event(
      job,
      model_output_bucket='gs://fake-bucket',
      cluster_name='cluster-name',
      cluster_location='cluster-location',
      project='project-id')
  actual_mcc = actual_event.metric_collection_config
  expected_mcc = metrics_pb2.MetricCollectionConfig(sources=[
      metrics_pb2.MetricSource(
          literals=metrics_pb2.LiteralSource(
              assertions={
                  'duration':
                      metrics_pb2.Assertion(
                          within_bounds=metrics_pb2.Assertion.WithinBounds(
                              lower_bound=1,
                              upper_bound=2,
                          ))
              }))
  ])

  self.assertEqual(
      actual_event.output_path,
      os.path.join('gs://fake-bucket', gcs_subdir or '', 'job-name'))
  self.assertProtoEqual(expected_mcc, actual_mcc)
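
# For reference, a minimal standalone sketch of how the
# `ml-testing-accelerators/metric-config` annotation above becomes a proto.
# It mirrors the json_format.Parse call in create_test_completed_event;
# metrics_pb2 is assumed importable exactly as in the tests above.
import json
from google.protobuf import json_format

config_json = json.dumps({
    'sources': [{
        'literals': {
            'assertions': {
                'duration': {
                    'within_bounds': {'lower_bound': 1, 'upper_bound': 2}
                }
            }
        }
    }]
})
mcc = metrics_pb2.MetricCollectionConfig()
json_format.Parse(config_json, mcc)  # Populates mcc from the JSON annotation.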
def test_get_metrics_from_perfzero_summary(self):
  temp_dir = self.create_tempdir().full_path
  summary_dir = os.path.join(temp_dir, 'date_and_time')
  pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
  summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
  with open(summary_path, 'w') as f:
    json.dump(
        {
            "execution_id": "execution_id",
            "execution_timestamp": 1234567890.1,
            "benchmark_result": {
                "wall_time": 1234,
                "metrics": [{
                    "name": "exp_per_second",
                    "value": 1.1,
                }, {
                    "name": "avg_exp_per_second",
                    "value": 2.2,
                }, {
                    "name": "startup_time",
                    "value": 3.3,
                }],
            },
            "benchmark_info": {
                "not": "important",
            },
            "setup_info": {},
            "ml_framework_info": {
                "not": "important",
            },
            "system_info": {
                "not": "important",
            },
            "process_info": {
                "max_rss": 4.4,
                "max_vms": 5.5,
                "max_cpu_percent": 6.6,
            },
        }, f)

  metric_source = metrics_pb2.MetricSource(
      perfzero=metrics_pb2.PerfZeroSource(
          assertions={
              'total_wall_time':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1230,
                          upper_bound=1240,
                      )),
              'exp_per_second':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1,
                          upper_bound=100,
                      )),
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = perfzero_collector.PerfZeroCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())

  self.assertCountEqual(
      points,
      {
          utils.MetricPoint("total_wall_time", 1234,
                            utils.Bounds(1230., 1240., False)),
          utils.MetricPoint("exp_per_second", 1.1,
                            utils.Bounds(1., 100., False)),
          utils.MetricPoint("avg_exp_per_second", 2.2, utils.NO_BOUNDS),
          utils.MetricPoint("startup_time", 3.3, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_rss", 4.4, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_vms", 5.5, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_cpu_percent", 6.6,
                            utils.NO_BOUNDS),
      },
  )
def create_test_completed_event(
    job: kubernetes.client.V1Job,
    model_output_bucket: str,
    cluster_name: str,
    cluster_location: str,
    project: str) -> metrics_pb2.TestCompletedEvent:
  """Returns a TestCompletedEvent to publish to PubSub.

  Args:
    job: A Kubernetes Job resource.
    model_output_bucket: Path to GCS bucket with model outputs.
    cluster_name: Name of the current Kubernetes cluster.
    cluster_location: Location (region or zone) of the current Kubernetes
      cluster.
    project: The project ID of the current project.

  Returns:
    A TestCompletedEvent with the information from job, or None if the Job's
    conditions cannot be interpreted.
  """
  # job.status.conditions _usually_ has length 1, but it can contain both
  # passing and failing conditions. Give precedence to failing conditions.
  if len(job.status.conditions) == 1:
    condition = job.status.conditions[0]
  elif len(job.status.conditions) == 0:
    logging.error('Job %s has no conditions.', job.metadata.name)
    return
  else:
    condition = next(
        (c for c in job.status.conditions if c.type == 'Failed'), None)
    if not condition:
      logging.error('This should never happen. Conditions: %s',
                    str(job.status.conditions))
      return

  if condition.reason == 'DeadlineExceeded':
    job_status = metrics_pb2.TestCompletedEvent.TIMEOUT
  elif condition.reason == 'BackoffLimitExceeded':
    job_status = metrics_pb2.TestCompletedEvent.FAILED
  elif condition.type == 'Complete':
    job_status = metrics_pb2.TestCompletedEvent.COMPLETED
  else:
    logging.error('Unknown condition for Job %s: %s', job.metadata.name,
                  str(condition))
    return

  annotations = job.metadata.annotations or {}
  gcs_subdir = annotations.get('ml-testing-accelerators/gcs-subdir', '')
  output_path = os.path.join(model_output_bucket, gcs_subdir,
                             job.metadata.name)

  metric_config = metrics_pb2.MetricCollectionConfig()
  mcc_json = annotations.get('ml-testing-accelerators/metric-config', '{}')
  json_format.Parse(mcc_json, metric_config)

  stackdriver_query = textwrap.dedent(f"""\
      resource.type=k8s_container
      resource.labels.project_id={project}
      resource.labels.cluster_name={cluster_name}
      resource.labels.namespace_name={job.metadata.namespace}
      resource.labels.pod_name:{job.metadata.name}
      resource.labels.location:{cluster_location}
      """)
  stackdriver_link = 'https://console.cloud.google.com/logs?{}'.format(
      urllib.parse.urlencode({
          'project': project,
          'advancedFilter': stackdriver_query
      }))

  start_time = timestamp_pb2.Timestamp()
  start_time.FromDatetime(job.status.start_time)
  duration = duration_pb2.Duration()
  duration.FromTimedelta(condition.last_transition_time -
                         job.status.start_time)

  return metrics_pb2.TestCompletedEvent(
      benchmark_id=job.metadata.labels['benchmarkId'],
      output_path=output_path,
      status=job_status,
      num_attempts=(job.status.succeeded or 0) + (job.status.failed or 0),
      start_time=start_time,
      duration=duration,
      metric_collection_config=metric_config,
      labels=job.metadata.labels,
      debug_info=metrics_pb2.DebugInfo(
          logs_link=stackdriver_link,
          # TODO: fix hard-coded region and cluster name
          details_link=(
              f'https://console.cloud.google.com/kubernetes/job/'
              f'{cluster_location}/{cluster_name}/{job.metadata.namespace}/'
              f'{job.metadata.name}?project={project}'),
      ))
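
# A minimal usage sketch, assuming the google-cloud-pubsub client library and
# a pre-existing topic. `TOPIC_PATH`, `publish_event`, and the argument values
# are hypothetical wiring, not part of this module.
from google.cloud import pubsub_v1

TOPIC_PATH = 'projects/project-id/topics/metrics-handler'  # hypothetical

def publish_event(job):
  event = create_test_completed_event(
      job,
      model_output_bucket='gs://fake-bucket',
      cluster_name='cluster-name',
      cluster_location='cluster-location',
      project='project-id')
  if event is None:
    return  # Job status could not be interpreted; nothing to publish.
  publisher = pubsub_v1.PublisherClient()
  # TestCompletedEvent is a protobuf message, so it serializes to bytes.
  publisher.publish(TOPIC_PATH, data=event.SerializeToString()).result()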