Exemplo n.º 1
0
def gcs_parent(subdir: AnyStr,
               gcs_comparison_path_by_subdir: dict) -> ComparisonPath:
    """
    Return the GCS comparison path for `subdir`, reusing a cached one when available.

    GcsComparisonPaths are somewhat expensive to create, so the caller-supplied dict is
    consulted first and only populated on a miss.

    :param subdir: name appended to `gs://papi-performance-analysis/`; also used as the cache key.
    :param gcs_comparison_path_by_subdir: caller-owned cache, updated in place on a miss.
    :return: the cached or newly created path for `subdir`.
    """
    cache = gcs_comparison_path_by_subdir
    if subdir in cache:
        return cache[subdir]

    created = ComparisonPath.create(f'gs://papi-performance-analysis/{subdir}')
    cache[subdir] = created
    return created
Exemplo n.º 2
0
def main(args: argparse.Namespace) -> None:
    """
    Produce a `digest.json` for each path listed in `args.paths`.

    For every path the digest is computed from `<path>/workflow.json` and `<path>/operations`
    and written, as key-sorted JSON indented by 4, to `<path>/digests/<Version>/digest.json`.

    :param args: parsed arguments; reads `paths` (iterable of path strings) and `force`.
    :raises ValueError: if a digest file already exists and `args.force` is falsy. Paths
        after the offending one are not processed.
    """
    for path in args.paths:
        parent_path = ComparisonPath.create(path)
        digest_parent = parent_path / 'digests' / Version
        digest_path = digest_parent / 'digest.json'

        # Refuse to clobber an existing digest unless explicitly forced.
        if digest_path.exists() and not args.force:
            raise ValueError(f'digest file already exists at {digest_path} and --force not specified')

        digest_parent.mkdir_p()
        digest_json = digest(parent_path / 'workflow.json', parent_path / 'operations')
        digest_path.write_text(json.dumps(digest_json, sort_keys=True, indent=4))
    def test_operations_digestion(self) -> None:
        """
        Check every expected operation-digester value for each sample and PAPI version.

        This uses "real" metadata from the PAPI v2 performance spike to drive operations digester testing.
        The metadata is stored in GCS and copied down to the local machine if not already present from an
        earlier run. Operations digesters can run against either local or GCS paths using `ComparisonPath`s.
        Since GCS testing is slow it is off by default; set the DIGESTER_TEST_GCS environment variable to
        enable it.
        """
        credentials, _ = google.auth.default()
        bucket = storage.Client(credentials=credentials).get_bucket('papi-performance-analysis')

        # Cache so that each expensive-to-create GCS comparison path is built only once.
        gcs_comparison_path_by_subdir = {}

        for papi_version in (VERSION_PAPI_V1, VERSION_PAPI_V2):
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS:
                download_metadata_from_gcs_if_needed(sample_name, local_parent, bucket)

                # The local copy is always tested; the slow GCS copy only when opted in.
                parents_to_test = [local_parent]
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running operation digester on {description} sample '{sample_name}' backend {papi_version}")
                    operations_dir = parent / sample_name / 'operations'

                    expected_by_operation = EXPECTATIONS.get(sample_name).get(papi_version)
                    for operation, expected_by_method in expected_by_operation.items():
                        operation_file = operations_dir / f'{operation}.json'
                        op_digester = OperationDigester.create(json.loads(operation_file.read_text()))

                        # Each expectation key names a zero-argument method on the digester.
                        for key, value in expected_by_method.items():
                            self.assertEqual(getattr(op_digester, key)(), value, f'{key} was not {value}')
Exemplo n.º 4
0
    def test_digestion(self) -> None:
        """
        Digest each sample for each PAPI version and compare the result with the recorded expectations.

        This uses "real" metadata from the PAPI v2 performance spike to drive digester testing. The metadata is
        stored in GCS and copied down to the local machine if not already present from an earlier run. The
        digester can run against either local or GCS paths using `ComparisonPath`s. Local is nicer to iterate on
        than GCS since it runs so much more quickly. Since GCS testing is slow it is off by default; set the
        DIGESTER_TEST_GCS environment variable to enable it.
        """
        credentials, _ = google.auth.default()
        bucket = storage.Client(credentials=credentials).get_bucket('papi-performance-analysis')

        # Cache so that each expensive-to-create GCS comparison path is built only once.
        gcs_comparison_path_by_subdir = {}

        # Per-call timing fields that must all be non-negative (see the smoke test below).
        keys = (
            StartupTimeSeconds, DockerImagePullTimeSeconds,
            LocalizationTimeSeconds, UserCommandTimeSeconds,
            DelocalizationTimeSeconds, PapiTotalTimeSeconds,
            CromwellTotalTimeSeconds, OtherTimeSeconds,
        )

        for papi_version in (VERSION_PAPI_V1, VERSION_PAPI_V2):
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS:
                download_metadata_from_gcs_if_needed(sample_name, local_parent, bucket)

                # The local copy is always tested; the slow GCS copy only when opted in.
                parents_to_test = [local_parent]
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running digester test on {description} for sample '{sample_name}' on backend {papi_version}"
                    )
                    sample_path = parent / sample_name
                    actual = digest(sample_path / 'workflow.json', sample_path / 'operations')

                    expected = EXPECTATIONS[sample_name][papi_version]
                    calls: JsonObject = actual.get('calls')

                    self.assertEqual(len(calls), expected['total_jobs'])

                    # Count the calls that needed more than N attempts.
                    for num_attempts in (1, 2, 3):
                        has_more_attempts = more_than_x_attempts(calls, num_attempts)
                        retried = list(filter(has_more_attempts, calls))
                        self.assertEqual(len(retried), expected[f'more_than_{num_attempts}_attempts'])

                    # Count the calls selected by the "more than N minutes longer" predicate.
                    for minutes_longer in range(3, 9):
                        is_much_longer = more_than_x_minutes_longer(calls, minutes_longer)
                        slower = list(filter(is_much_longer, calls))
                        self.assertEqual(
                            len(slower),
                            expected[f'cromwell_time_more_than_{minutes_longer}_minutes_longer_total'])

                    # Currently just a smoke test to assert not-completely-insane results for both v1 and v2
                    # digesters: every timing field of every call must be non-negative.
                    for key in keys:
                        for call in calls.values():
                            self.assertTrue(
                                call.get(key) >= 0,
                                f"failed for {papi_version} / {sample_name} / {key}"
                            )
Exemplo n.º 5
0
def json_from_path(string: AnyStr) -> ComparisonPath:
    """
    Load and parse a JSON file from the `comparer` directory under `RESOURCES`.

    :param string: location of the file relative to `<RESOURCES>/comparer/`.
    :return: the value produced by `json.loads` on the file's text.

    NOTE(review): the return annotation says `ComparisonPath`, but the decoded JSON is what is
    actually returned -- confirm and correct the annotation.
    """
    comparer_dir = str(RESOURCES) + '/comparer/'
    resource = ComparisonPath.create(comparer_dir + string)
    return json.loads(resource.read_text())
Exemplo n.º 6
0
 def __read_resource(filename: str) -> AnyStr:
     """
     Return the text of `filename` from the `comparer` directory under `RESOURCES`.

     :param filename: name of the file inside `<RESOURCES>/comparer`.
     :return: whatever `read_text()` yields for that path.
     """
     resources_root = ComparisonPath.create(str(RESOURCES))
     resource = resources_root / "comparer" / filename
     return resource.read_text()
Exemplo n.º 7
0
def json_from_path_string(path_string: AnyStr) -> JsonObject:
    """
    Read the file at `path_string` through `ComparisonPath` and parse its text as JSON.

    :param path_string: path handed to `ComparisonPath.create`.
    :return: the value produced by `json.loads` on the file's text.
    """
    return json.loads(ComparisonPath.create(path_string).read_text())
Exemplo n.º 8
0
                        required=True,
                        help='Path to output CSV file.')
    parser.add_argument('--call-prefix-to-remove',
                        metavar='CALL_PREFIX_TO_REMOVE',
                        type=str,
                        nargs='*',
                        help='Call prefix to remove if present.')

    args = parser.parse_args()
    set_log_verbosity(args.verbose)
    quieten_chatty_imports()
    logger.info("Starting Comparer operation.")

    _json_1, _json_2 = [
        json_from_path_string(p[0]) for p in [args.digest1, args.digest2]
    ]

    prefixes = [] if not args.call_prefix_to_remove else args.call_prefix_to_remove

    comparison_data = compare_jsons(_json_1, _json_2, args.name1[0],
                                    args.name2[0], prefixes, args.force)
    out_path = args.output_path[0]
    out = ComparisonPath.create(out_path)
    if out.exists() and not args.force:
        raise ValueError(
            f"Specified output file '{out_path}' already exists and --force not specified."
        )
    out.write_text(csv_string_from_data(comparison_data))

    logger.info('Comparer operation completed successfully.')