Пример #1
0
def digest(workflow_path: ComparisonPath, operations_path: ComparisonPath) -> JsonObject:
    """Build a digest of every successful PAPI operation in one workflow.

    Reads the Cromwell workflow metadata at `workflow_path`, visits each PAPI
    operation attempt in it, and joins each successful attempt with its
    per-operation JSON file found under `operations_path`.
    """
    def call_fn(succeeded_operations: Dict[CallName, JsonObject],
                operation_id: OperationId,
                path: CallNameSequence,
                attempt: JsonObject) -> None:
        # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend status
        # other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are ignored.
        # It's possible that a future version of the digester might actually want to look at these jobs since they
        # may have completed some lifecycle events which could be useful in accumulating more performance data.
        if attempt.get('backendStatus', 'Unknown') != 'Success':
            return

        cromwell_start = attempt.get('start')
        cromwell_end = attempt.get('end')
        cromwell_total_time_seconds = (
            dateutil.parser.parse(cromwell_end) -
            dateutil.parser.parse(cromwell_start)).total_seconds()

        # The operation id's final path segment names the per-operation JSON file.
        bare_operation_id = operation_id.split('/')[-1]
        operations_metadata = json.loads(
            (operations_path / f'{bare_operation_id}.json').read_text())
        operation = OperationDigester.create(operations_metadata)

        papi_total_time_seconds = operation.total_time_seconds()

        # Time Cromwell spent beyond what PAPI accounts for, rounded to milliseconds.
        cromwell_additional_total_time_seconds = \
            float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))

        succeeded_operations['.'.join(path)] = {
            Attempt: attempt.get('attempt'),
            CromwellAdditionalTotalTimeSeconds: cromwell_additional_total_time_seconds,
            CromwellEnd: cromwell_end,
            CromwellStart: cromwell_start,
            CromwellTotalTimeSeconds: cromwell_total_time_seconds,
            DelocalizationTimeSeconds: operation.delocalization_time_seconds(),
            Disks: operation.disks(),
            DockerImagePullTimeSeconds: operation.docker_image_pull_time_seconds(),
            LocalizationTimeSeconds: operation.localization_time_seconds(),
            MachineType: operation.machine_type(),
            OperationIdKey: operation_id,
            OtherTimeSeconds: operation.other_time_seconds(),
            PapiCreate: operation.create_time(),
            PapiEnd: operation.end_time(),
            PapiStart: operation.start_time(),
            PapiTotalTimeSeconds: operation.total_time_seconds(),
            ShardIndex: attempt.get('shardIndex'),
            StartupTimeSeconds: operation.startup_time_seconds(),
            UserCommandTimeSeconds: operation.user_command_time_seconds(),
        }

    metadata = json.loads(workflow_path.read_text())

    shards = operation_ids.visit_papi_operations(metadata, call_fn, initial_accumulator={})
    return {'version': Version, 'calls': shards, 'workflowId': metadata['id']}
Пример #2
0
def gcs_parent(subdir: AnyStr,
               gcs_comparison_path_by_subdir: dict) -> ComparisonPath:
    """
    Return the GCS ComparisonPath for `subdir`, creating and caching it on
    first use. GcsComparisonPaths are somewhat expensive to create, hence the
    caller-supplied cache dict.
    """
    try:
        return gcs_comparison_path_by_subdir[subdir]
    except KeyError:
        # Only pay the creation cost on a cache miss.
        created = ComparisonPath.create(
            f'gs://papi-performance-analysis/{subdir}')
        gcs_comparison_path_by_subdir[subdir] = created
        return created
Пример #3
0
def main(args: argparse.Namespace) -> None:
    """Digest each workflow directory named in `args.paths`.

    For every path, writes `digests/<Version>/digest.json` under it. Raises
    ValueError if a digest already exists and `--force` was not given.
    """
    for raw_path in args.paths:
        parent_path = ComparisonPath.create(raw_path)

        digest_parent = parent_path / 'digests' / Version
        digest_path = digest_parent / 'digest.json'

        # Guard clause: refuse to clobber an existing digest without --force.
        if digest_path.exists() and not args.force:
            raise ValueError(f'digest file already exists at {digest_path} and --force not specified')

        digest_parent.mkdir_p()
        digest_json = digest(parent_path / 'workflow.json', parent_path / 'operations')
        digest_path.write_text(json.dumps(digest_json, sort_keys=True, indent=4))
    def test_operations_digestion(self) -> None:
        """
        This uses "real" metadata from the PAPI v2 performance spike to drive operations digester testing.
        The metadata is stored in GCS and copied down to the local machine if not already present from an earlier run.
        Operations digesters can run against either local or GCS paths using `ComparisonPath`s. Since GCS testing is
        slow it's turned off by default, it can be turned on by setting the DIGESTER_TEST_GCS environment variable.
        """

        credentials, _project_id = google.auth.default()
        storage_client = storage.Client(credentials=credentials)
        bucket = storage_client.get_bucket('papi-performance-analysis')

        # A cache of expensive-to-create GCS comparison paths.
        gcs_comparison_path_by_subdir = {}

        for papi_version in [VERSION_PAPI_V1, VERSION_PAPI_V2]:
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS:
                download_metadata_from_gcs_if_needed(sample_name, local_parent, bucket)
                parents_to_test = [local_parent]
                # Skip slow GCS testing unless this environment variable is set.
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running operation digester on {description} sample '{sample_name}' backend {papi_version}")
                    sample_path = parent / sample_name

                    # Hoist the nested lookups: expectations for this sample/backend pair.
                    operation_expectations = EXPECTATIONS.get(sample_name).get(papi_version)

                    for operation in operation_expectations.keys():
                        json_str = (sample_path / 'operations' / f'{operation}.json').read_text()
                        op_digester = OperationDigester.create(json.loads(json_str))
                        for key, value in operation_expectations.get(operation).items():
                            # Each expectation key names a digester method; call it and compare.
                            self.assertEqual(getattr(op_digester, key)(), value, f'{key} was not {value}')
Пример #5
0
    def test_digestion(self) -> None:
        """
        This uses "real" metadata from the PAPI v2 performance spike to drive digester testing. The metadata is stored
        in GCS and copied down to the local machine if not already present from an earlier run. The digester can run
        against either local or GCS paths using `ComparisonPath`s. Local is nicer to iterate on than GCS since it
        runs so much more quickly. Since GCS testing is slow it's turned off by default, it can be turned on by setting
        the DIGESTER_TEST_GCS environment variable.
        """

        # Application-default credentials; project_id is not used below.
        credentials, project_id = google.auth.default()
        storage_client = storage.Client(credentials=credentials)

        bucket_name = 'papi-performance-analysis'
        bucket = storage_client.get_bucket(bucket_name)

        # A cache of expensive-to-create GCS comparison paths.
        gcs_comparison_path_by_subdir = {}
        papi_versions = [VERSION_PAPI_V1, VERSION_PAPI_V2]

        for papi_version in papi_versions:
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS.keys():
                # Fetch fixture metadata from the bucket only when it is not already on disk.
                download_metadata_from_gcs_if_needed(sample_name, local_parent,
                                                     bucket)
                parents_to_test = [local_parent]
                # Skip slow GCS testing unless this environment variable is set.
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(
                        gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running digester test on {description} for sample '{sample_name}' on backend {papi_version}"
                    )
                    sample_path = parent / sample_name
                    workflow_path = sample_path / 'workflow.json'
                    operations_path = sample_path / 'operations'
                    # Run the digester under test over the fixture workflow + operations.
                    actual = digest(workflow_path, operations_path)

                    expected = EXPECTATIONS[sample_name][papi_version]
                    calls: JsonObject = actual.get('calls')

                    # Total number of digested calls must match the expectation.
                    actual_total = len(calls)
                    self.assertEqual(actual_total, expected['total_jobs'])

                    # Count calls that took more than num_attempts attempts; iterating
                    # `calls` (a dict) yields call names, so `more_than_x_attempts`
                    # presumably returns a predicate over call names — confirm in its def.
                    for num_attempts in [1, 2, 3]:
                        actual_len = len(
                            list(
                                filter(
                                    more_than_x_attempts(calls, num_attempts),
                                    calls)))
                        self.assertEqual(
                            actual_len,
                            expected[f'more_than_{num_attempts}_attempts'])

                    # Count calls where Cromwell's wall time exceeds PAPI's by more
                    # than `minutes_longer` minutes, compared against expectations.
                    for minutes_longer in range(3, 9):
                        actual_len = len(
                            list(
                                filter(
                                    more_than_x_minutes_longer(
                                        calls, minutes_longer), calls)))
                        expectation = expected[
                            f'cromwell_time_more_than_{minutes_longer}_minutes_longer_total']
                        self.assertEqual(actual_len, expectation)

                    # Currently just a smoke test to assert not-completely-insane results for both v1 and v2 digesters.

                    keys = [
                        StartupTimeSeconds, DockerImagePullTimeSeconds,
                        LocalizationTimeSeconds, UserCommandTimeSeconds,
                        DelocalizationTimeSeconds, PapiTotalTimeSeconds,
                        CromwellTotalTimeSeconds, OtherTimeSeconds
                    ]

                    # Every timing value in every digested call must be non-negative.
                    for key in keys:
                        for name in calls:
                            self.assertTrue(
                                calls[name].get(key) >= 0,
                                f"failed for {papi_version} / {sample_name} / {key}"
                            )
Пример #6
0
def json_from_path(string: AnyStr) -> JsonObject:
    """Load and return the parsed JSON from the comparer resource named `string`.

    The file is resolved relative to `RESOURCES`/comparer. The return value is
    the parsed JSON object, not a path: the previous `-> ComparisonPath`
    annotation was wrong (the function returns `json.loads(...)`), and is
    corrected here to match the sibling `json_from_path_string`.
    """
    path = ComparisonPath.create(str(RESOURCES) + '/comparer/' + string)
    return json.loads(path.read_text())
Пример #7
0
 def __read_resource(filename: str) -> AnyStr:
     """Return the text content of the comparer resource file `filename`."""
     resource = ComparisonPath.create(str(RESOURCES)) / "comparer" / filename
     return resource.read_text()
Пример #8
0
def json_from_path_string(path_string: AnyStr) -> JsonObject:
    """Read the file at `path_string` via ComparisonPath and return its parsed JSON."""
    return json.loads(ComparisonPath.create(path_string).read_text())
Пример #9
0
                        required=True,
                        help='Path to output CSV file.')
    parser.add_argument('--call-prefix-to-remove',
                        metavar='CALL_PREFIX_TO_REMOVE',
                        type=str,
                        nargs='*',
                        help='Call prefix to remove if present.')

    args = parser.parse_args()
    set_log_verbosity(args.verbose)
    quieten_chatty_imports()
    logger.info("Starting Comparer operation.")

    _json_1, _json_2 = [
        json_from_path_string(p[0]) for p in [args.digest1, args.digest2]
    ]

    prefixes = [] if not args.call_prefix_to_remove else args.call_prefix_to_remove

    comparison_data = compare_jsons(_json_1, _json_2, args.name1[0],
                                    args.name2[0], prefixes, args.force)
    out_path = args.output_path[0]
    out = ComparisonPath.create(out_path)
    if out.exists() and not args.force:
        raise ValueError(
            f"Specified output file '{out_path}' already exists and --force not specified."
        )
    out.write_text(csv_string_from_data(comparison_data))

    logger.info('Comparer operation completed successfully.')
Пример #10
0
 def validate_path(p: AnyStr) -> AnyStr:
     """Return `p` unchanged if it is a valid ComparisonPath string, else raise ValueError."""
     if not ComparisonPath.is_valid_path_string(p):
         raise ValueError(f'{p} is not a valid path whatsoever')
     return p