Example #1
def sum_call_times_weighted(j: JsonObject, key: AnyStr):
    """
    Sums call times for a column, weighted by the machine type each call ran on,
    for a more accurate estimate of cost.
    """
    times = [
        j.get('calls').get(ck.full).get(key) *
        MachineTypeCostMultiplier[j.get('calls').get(ck.full).get('machineType')]
        for ck in call_keys
    ]
    return sum(times)
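For context, a minimal sketch of the inputs this helper assumes: call_keys supplies the call identifiers (each exposing a .full attribute) and MachineTypeCostMultiplier maps machine-type names to relative cost factors. The machine-type names, multiplier values, and the digest_json variable below are illustrative only, not taken from the source.

# Hypothetical supporting data; the real multipliers and call keys are defined
# elsewhere in the digester, and call_keys is assumed to be in scope.
MachineTypeCostMultiplier = {
    'n1-standard-1': 1.0,  # assumed baseline
    'n1-standard-2': 2.0,
    'n1-highmem-8': 4.0,
}

# Weighted total for one digest JSON, using the PapiTotalTimeSeconds column key
# constant that appears elsewhere in this module.
weighted_total = sum_call_times_weighted(digest_json, PapiTotalTimeSeconds)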
Example #2
    def call_fn(succeeded_operations: Dict[CallName, JsonObject],
                operation_id: OperationId,
                path: CallNameSequence,
                attempt: JsonObject) -> None:
        backend_status = attempt.get('backendStatus', 'Unknown')
        # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend status
        # other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are ignored.
        # It's possible that a future version of the digester might actually want to look at these jobs since they
        # may have completed some lifecycle events which could be useful in accumulating more performance data.
        if backend_status == 'Success':
            string_path = '.'.join(path)
            cromwell_start = attempt.get('start')
            cromwell_end = attempt.get('end')

            cromwell_total_time_seconds = (dateutil.parser.parse(cromwell_end) -
                                           dateutil.parser.parse(cromwell_start)).total_seconds()

            bare_operation_id = operation_id.split('/')[-1]
            operations_file_path = operations_path / f'{bare_operation_id}.json'
            operations_data = operations_file_path.read_text()
            operations_metadata = json.loads(operations_data)
            operation = OperationDigester.create(operations_metadata)

            papi_total_time_seconds = operation.total_time_seconds()

            cromwell_additional_total_time_seconds = \
                float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))

            succeeded_operations[string_path] = {
                Attempt: attempt.get('attempt'),
                CromwellAdditionalTotalTimeSeconds: cromwell_additional_total_time_seconds,
                CromwellEnd: cromwell_end,
                CromwellStart: cromwell_start,
                CromwellTotalTimeSeconds: cromwell_total_time_seconds,
                DelocalizationTimeSeconds: operation.delocalization_time_seconds(),
                Disks: operation.disks(),
                DockerImagePullTimeSeconds: operation.docker_image_pull_time_seconds(),
                LocalizationTimeSeconds: operation.localization_time_seconds(),
                MachineType: operation.machine_type(),
                OperationIdKey: operation_id,
                OtherTimeSeconds: operation.other_time_seconds(),
                PapiCreate: operation.create_time(),
                PapiEnd: operation.end_time(),
                PapiStart: operation.start_time(),
                PapiTotalTimeSeconds: operation.total_time_seconds(),
                ShardIndex: attempt.get('shardIndex'),
                StartupTimeSeconds: operation.startup_time_seconds(),
                UserCommandTimeSeconds: operation.user_command_time_seconds(),
            }
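To make the timing bookkeeping above concrete, here is the arithmetic with made-up numbers: the span between the attempt's Cromwell start and end timestamps, minus the PAPI operation's own total time, is reported as Cromwell's additional overhead, rounded to three decimals.

# Illustrative arithmetic only; the values are invented.
cromwell_total_time_seconds = 125.0  # from the attempt's start/end timestamps
papi_total_time_seconds = 118.4      # from the operation digester
cromwell_additional_total_time_seconds = float(
    "%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))
# -> 6.6 seconds of Cromwell-side overhead not attributable to the PAPI operation itself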
Example #3
def create(operation_json: JsonObject):
    operation_id = operation_json.get('name')
    version = operation_id_to_api_version(operation_id)
    if version == PAPI_V1_API_VERSION:
        return PapiV1OperationDigester(operation_json)
    elif version == PAPI_V2_ALPHA1_API_VERSION:
        return PapiV2AlphaOperationDigester(operation_json)
    elif version == PAPI_V2_BETA_API_VERSION:
        return PapiV2BetaOperationDigester(operation_json)
    else:
        raise ValueError(
            f"Unrecognized format for PAPI operation ID {operation_id}")
Example #4
def has_description_like(event: JsonObject) -> bool:
    return regex.match(event.get('description')) is not None
Example #5
def has_description(event: JsonObject) -> bool:
    return event.get('description') == description
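Both this predicate and has_description_like above appear to close over a description string or compiled regex from an enclosing scope, presumably to pick matching events out of a PAPI operation's event list. A hedged sketch of that use, with an assumed event shape:

# Hypothetical events list; real event dicts come from the operation's metadata,
# and the enclosing `description` / `regex` values are defined by the caller.
events = [{'description': 'started pulling image'},
          {'description': 'worker assigned'}]
matching = [e for e in events if has_description(e)]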
Example #6
def error_checks(name_1: AnyStr,
                 name_2: AnyStr,
                 json_1: JsonObject,
                 json_2: JsonObject,
                 force: bool = False):
    version_1, version_2 = [j.get('version') for j in [json_1, json_2]]

    if version_1 != version_2:
        msg = f"Inconsistent digest versions: First JSON digest is {version_1} but second is {version_2}"
        raise ValueError(msg)

    call_keys_1, call_keys_2 = [
        list(j.get('calls').keys()) for j in [json_1, json_2]
    ]
    call_keys_1.sort()
    call_keys_2.sort()

    if call_keys_1 != call_keys_2:
        in_1 = ', '.join(set(call_keys_1) - set(call_keys_2))
        in_2 = ', '.join(set(call_keys_2) - set(call_keys_1))
        msg_1 = ''
        msg_2 = ''
        if in_1:
            msg_1 = f"In {name_1} but not {name_2}: {in_1}."
        if in_2:
            msg_2 = f"In {name_2} but not {name_1}: {in_2}."

        raise ValueError(
            'The specified digest files do not have the same call keys. These digests cannot be '
            f'compared and probably are not from the same workflow and sample. {msg_1} {msg_2}'
        )

    for call_key in call_keys_1:
        call_1 = json_1.get('calls').get(call_key)
        call_2 = json_2.get('calls').get(call_key)
        for nth, call in [('first', call_1), ('second', call_2)]:
            for digester_key in DigesterKeys:
                json_key = digester_key.json_key
                if json_key not in call:
                    raise ValueError(
                        f"In {nth} digest JSON: call '{call_key}' missing required key '{json_key}'"
                    )

    discrepancies = []

    for k in call_keys_1:
        machine_type_1 = json_1.get('calls').get(k).get('machineType')
        machine_type_2 = json_2.get('calls').get(k).get('machineType')
        if machine_type_1 != machine_type_2:
            discrepancies.append({
                'key': k,
                'machine_type_1': machine_type_1,
                'machine_type_2': machine_type_2
            })

    if discrepancies:
        string = ', '.join(
            f'{e["key"]}: {e["machine_type_1"]} vs {e["machine_type_2"]}'
            for e in discrepancies)
        message = 'The specified digest files unexpectedly contain corresponding jobs that ran with ' + \
                  'different machine types: ' + string
        if force:
            logger.warning(message)
        else:
            raise ValueError(
                message +
                '. Specify the --force argument to force comparison anyway.')
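A hedged sketch of the earliest check above: two digests whose version fields differ should raise immediately, before any call-key comparison is attempted. The digest shapes below are minimal and invented.

# Minimal, made-up digests; only the 'version' field matters for this branch.
json_a = {'version': '0.1', 'calls': {}}
json_b = {'version': '0.2', 'calls': {}}
error_checks('baseline', 'candidate', json_a, json_b)  # raises ValueError: inconsistent digest versions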
Example #7
def sum_call_times(j: JsonObject, key: AnyStr):
    """ Sums raw call times for a column. """
    times = [j.get('calls').get(ck.full).get(key) for ck in call_keys]
    return sum(times)
Example #8
def compare_jsons(json_1: JsonObject,
                  json_2: JsonObject,
                  name_1: AnyStr,
                  name_2: AnyStr,
                  call_prefixes_to_remove: List[AnyStr],
                  force: bool = False) -> List[List[AnyStr]]:
    """
    Produce rows of CSV data comparing the two specified digest JSONs.
    """
    error_checks(name_1, name_2, json_1, json_2, force)

    call_keys = [
        CallKey(k, call_prefixes_to_remove)
        for k in json_1.get('calls').keys()
    ]

    call_keys_sorted_without_prefix = sorted(call_keys,
                                             key=lambda k: k.without_prefix)

    # Columns to produce in triplets: name_1, name_2, percent increase from name_1 to name_2.
    digester_key_names = [
        PapiTotalTimeSeconds, StartupTimeSeconds, DockerImagePullTimeSeconds,
        LocalizationTimeSeconds, UserCommandTimeSeconds,
        DelocalizationTimeSeconds
    ]

    def build_header_rows():
        """ Builds the header rows of the spreadsheet. """
        top_header_row = ['job', 'Machine type']
        percent_contrib_row = ['% contribution to total run time', '']
        total_row = ['Total', '']
        machine_type_weighted_row = [
            'Machine-type weighted total (cost-ish)', ''
        ]

        def sum_call_times(j: JsonObject, key: AnyStr):
            """ Sums raw call times for a column. """
            times = [j.get('calls').get(ck.full).get(key) for ck in call_keys]
            return sum(times)

        def sum_call_times_weighted(j: JsonObject, key: AnyStr):
            """
            Sums call times for a column, weighted by the machine type each call ran on,
            for a more accurate estimate of cost.
            """
            times = [
                j.get('calls').get(ck.full).get(key) *
                MachineTypeCostMultiplier[j.get('calls').get(ck.full).get('machineType')]
                for ck in call_keys
            ]
            return sum(times)

        # Treat the total time specially to be able to report percentages of time for individual lifecycle phases.
        total_total_time_1, total_total_time_2 = [
            sum_call_times(j, PapiTotalTimeSeconds) for j in [json_1, json_2]
        ]

        for digester_key_name in digester_key_names:
            digester_key = digester_key_by_json_key(digester_key_name)
            top_header_row.append(f'{name_1} {digester_key.display_text}')
            top_header_row.append(f'{name_2} {digester_key.display_text}')
            top_header_row.append('% increase')

            # Total times within a column, not necessarily total time of the job.
            total_time_1, total_time_2 = [
                sum_call_times(j, digester_key_name) for j in [json_1, json_2]
            ]

            percent_contrib_row.append(
                f'{(total_time_1 / total_total_time_1) * 100:.2f}%')
            percent_contrib_row.append(
                f'{(total_time_2 / total_total_time_2) * 100:.2f}%')
            percent_contrib_row.append('')

            total_row.append(format_seconds(total_time_1))
            total_row.append(format_seconds(total_time_2))
            total_percent_increase = (
                (total_time_2 - total_time_1) * 100) / total_time_1
            total_row.append(f'{total_percent_increase:.2f}%')

            weighted_total_time_1, weighted_total_time_2 = [
                sum_call_times_weighted(j, digester_key_name)
                for j in [json_1, json_2]
            ]

            machine_type_weighted_row.append(
                format_seconds(weighted_total_time_1))
            machine_type_weighted_row.append(
                format_seconds(weighted_total_time_2))
            weighted_percent = (
                (weighted_total_time_2 - weighted_total_time_1) *
                100) / weighted_total_time_1
            machine_type_weighted_row.append(f'{weighted_percent:.2f}%')

        return [
            top_header_row, percent_contrib_row, total_row,
            machine_type_weighted_row, []
        ]

    rows = []

    # Produce data for individual lifecycle phases of individual calls.
    for call_key in call_keys_sorted_without_prefix:
        row = []
        call_1, call_2 = [
            j.get('calls').get(call_key.full) for j in [json_1, json_2]
        ]
        row.append(call_key.without_prefix)
        row.append(call_1.get(MachineType))

        for digester_key_name in digester_key_names:
            time_1 = call_1.get(digester_key_name)
            time_2 = call_2.get(digester_key_name)
            row.append(format_seconds(time_1))
            row.append(format_seconds(time_2))
            if time_1:
                row.append(f'{((time_2 - time_1) * 100) / time_1:.2f}%')
            else:
                row.append('---')

        rows.append(row)

    return build_header_rows() + rows
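Since compare_jsons returns plain rows, writing the comparison out is a short job for the standard csv module. A hedged usage sketch; the digest labels and output file name are illustrative.

import csv

# json_1 / json_2 are digest dicts loaded elsewhere (e.g. with json.load).
rows = compare_jsons(json_1, json_2, 'baseline', 'candidate',
                     call_prefixes_to_remove=[], force=False)
with open('comparison.csv', 'w', newline='') as out:
    csv.writer(out).writerows(rows)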