def call_fn(succeeded_operations: Dict[CallName, JsonObject],
            operation_id: OperationId,
            path: CallNameSequence,
            attempt: JsonObject) -> None:
    backend_status = attempt.get('backendStatus', 'Unknown')
    # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend status
    # other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are ignored.
    # It's possible that a future version of the digester might actually want to look at these jobs since they
    # may have completed some lifecycle events which could be useful in accumulating more performance data.
    if backend_status == 'Success':
        string_path = '.'.join(path)
        cromwell_start = attempt.get('start')
        cromwell_end = attempt.get('end')

        cromwell_total_time_seconds = (dateutil.parser.parse(cromwell_end) -
                                       dateutil.parser.parse(cromwell_start)).total_seconds()

        bare_operation_id = operation_id.split('/')[-1]
        operations_file_path = operations_path / f'{bare_operation_id}.json'
        operations_data = operations_file_path.read_text()
        operations_metadata = json.loads(operations_data)
        operation = OperationDigester.create(operations_metadata)

        papi_total_time_seconds = operation.total_time_seconds()
        cromwell_additional_total_time_seconds = \
            float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))

        succeeded_operations[string_path] = {
            Attempt: attempt.get('attempt'),
            CromwellAdditionalTotalTimeSeconds: cromwell_additional_total_time_seconds,
            CromwellEnd: cromwell_end,
            CromwellStart: cromwell_start,
            CromwellTotalTimeSeconds: cromwell_total_time_seconds,
            DelocalizationTimeSeconds: operation.delocalization_time_seconds(),
            Disks: operation.disks(),
            DockerImagePullTimeSeconds: operation.docker_image_pull_time_seconds(),
            LocalizationTimeSeconds: operation.localization_time_seconds(),
            MachineType: operation.machine_type(),
            OperationIdKey: operation_id,
            OtherTimeSeconds: operation.other_time_seconds(),
            PapiCreate: operation.create_time(),
            PapiEnd: operation.end_time(),
            PapiStart: operation.start_time(),
            PapiTotalTimeSeconds: operation.total_time_seconds(),
            ShardIndex: attempt.get('shardIndex'),
            StartupTimeSeconds: operation.startup_time_seconds(),
            UserCommandTimeSeconds: operation.user_command_time_seconds(),
        }
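
# Illustrative sketch, not part of the original module: the Cromwell start/end
# arithmetic in call_fn above relies on dateutil parsing the ISO-8601 timestamps
# Cromwell writes into its metadata. The sample timestamps below are hypothetical.
def _demo_cromwell_time_delta() -> float:
    import dateutil.parser

    start = dateutil.parser.parse('2020-01-01T00:00:00.000Z')
    end = dateutil.parser.parse('2020-01-01T00:05:30.000Z')
    return (end - start).total_seconds()  # 330.0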
def create(operation_json: JsonObject):
    operation_id = operation_json.get('name')
    version = operation_id_to_api_version(operation_id)
    if version == PAPI_V1_API_VERSION:
        return PapiV1OperationDigester(operation_json)
    elif version == PAPI_V2_ALPHA1_API_VERSION:
        return PapiV2AlphaOperationDigester(operation_json)
    elif version == PAPI_V2_BETA_API_VERSION:
        return PapiV2BetaOperationDigester(operation_json)
    else:
        raise ValueError(f"Unrecognized format for PAPI operation ID {operation_id}")
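
# Hypothetical sketch of the version dispatch that create relies on; the real
# operation_id_to_api_version is defined elsewhere in this module. This assumes
# the documented Google operation-name shapes: PAPI v1 names look like
# 'operations/<id>', v2alpha1 like 'projects/<project>/operations/<number>',
# and v2beta (Life Sciences) like
# 'projects/<project>/locations/<location>/operations/<number>'.
def _sketch_operation_id_to_api_version(operation_id: str) -> str:
    parts = operation_id.split('/')
    if parts[0] == 'operations':
        return PAPI_V1_API_VERSION
    elif 'locations' in parts:
        return PAPI_V2_BETA_API_VERSION
    elif parts[0] == 'projects':
        return PAPI_V2_ALPHA1_API_VERSION
    raise ValueError(f"Unrecognized format for PAPI operation ID {operation_id}")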
def has_description_like(event: JsonObject) -> bool:
    return regex.match(event.get('description')) is not None


def has_description(event: JsonObject) -> bool:
    return event.get('description') == description
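
# has_description_like and has_description above close over `regex` and
# `description` from an enclosing scope. A minimal sketch of how such a
# predicate can be built as a closure; the factory name here is hypothetical,
# not from the original module.
import re
from typing import Callable


def _make_description_like_matcher(pattern: str) -> Callable[[JsonObject], bool]:
    regex = re.compile(pattern)

    def has_description_like(event: JsonObject) -> bool:
        return regex.match(event.get('description')) is not None

    return has_description_like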
def error_checks(name_1: AnyStr, name_2: AnyStr,
                 json_1: JsonObject, json_2: JsonObject,
                 force: bool = False):
    version_1, version_2 = [j.get('version') for j in [json_1, json_2]]
    if version_1 != version_2:
        msg = f"Inconsistent digest versions: First JSON digest is {version_1} but second is {version_2}"
        raise ValueError(msg)

    call_keys_1, call_keys_2 = [list(j.get('calls').keys()) for j in [json_1, json_2]]
    call_keys_1.sort()
    call_keys_2.sort()

    if call_keys_1 != call_keys_2:
        in_1 = ', '.join(set(call_keys_1) - set(call_keys_2))
        in_2 = ', '.join(set(call_keys_2) - set(call_keys_1))
        msg_1 = None
        msg_2 = None
        if in_1:
            msg_1 = f"In {name_1} but not {name_2}: {in_1}."
        if in_2:
            msg_2 = f"In {name_2} but not {name_1}: {in_2}."
        raise ValueError(
            'The specified digest files do not have the same call keys. These digests cannot be ' +
            f'compared and probably are not from the same workflow and sample. {msg_1} {msg_2}')

    for call_key in call_keys_1:
        call_1 = json_1.get('calls').get(call_key)
        call_2 = json_2.get('calls').get(call_key)
        for call in [call_1, call_2]:
            for digester_key in DigesterKeys:
                json_key = digester_key.json_key
                if json_key not in call:
                    if call == call_1:
                        nth = "first"
                    else:
                        nth = "second"
                    raise ValueError(
                        f"In {nth} digest JSON: call '{call_key}' missing required key '{json_key}'")

    discrepancies = []
    for k in call_keys_1:
        machine_type_1 = json_1.get('calls').get(k).get('machineType')
        machine_type_2 = json_2.get('calls').get(k).get('machineType')
        if machine_type_1 != machine_type_2:
            discrepancies.append({
                'key': k,
                'machine_type_1': machine_type_1,
                'machine_type_2': machine_type_2
            })

    if discrepancies:
        string = ', '.join(
            f'{e["key"]}: {e["machine_type_1"]} vs {e["machine_type_2"]}'
            for e in discrepancies)
        message = 'The specified digest files unexpectedly contain corresponding jobs that ran with ' + \
                  'different machine types: ' + string
        if force:
            logger.warning(message)
        else:
            raise ValueError(message + '. Specify the --force argument to force comparison anyway.')
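
# Illustrative sketch of error_checks failing fast on mismatched digest
# versions; the minimal digest payloads here are hypothetical.
def _demo_error_checks_version_mismatch() -> None:
    json_1 = {'version': '0.1', 'calls': {}}
    json_2 = {'version': '0.2', 'calls': {}}
    try:
        error_checks('first.json', 'second.json', json_1, json_2)
    except ValueError as e:
        print(e)  # Inconsistent digest versions: First JSON digest is 0.1 but second is 0.2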
def compare_jsons(json_1: JsonObject, json_2: JsonObject,
                  name_1: AnyStr, name_2: AnyStr,
                  call_prefixes_to_remove: List[AnyStr],
                  force: bool = False) -> List[List[AnyStr]]:
    """ Produce a CSV representing the comparison of the specified JSONs. """
    error_checks(name_1, name_2, json_1, json_2, force)

    call_keys = [CallKey(k, call_prefixes_to_remove) for k in json_1.get('calls').keys()]
    call_keys_sorted_without_prefix = sorted(call_keys, key=lambda k: k.without_prefix)

    # Columns to produce in triplets: name_1, name_2, percent increase from name_1 to name_2.
    digester_key_names = [
        PapiTotalTimeSeconds, StartupTimeSeconds, DockerImagePullTimeSeconds,
        LocalizationTimeSeconds, UserCommandTimeSeconds, DelocalizationTimeSeconds
    ]

    def build_header_rows():
        """ Builds the header rows of the spreadsheet. """
        top_header_row = ['job', 'Machine type']
        percent_contrib_row = ['% contribution to total run time', '']
        total_row = ['Total', '']
        machine_type_weighted_row = ['Machine-type weighted total (cost-ish)', '']

        def sum_call_times(j: JsonObject, key: AnyStr):
            """ Sums raw call times for a column. """
            times = [j.get('calls').get(ck.full).get(key) for ck in call_keys]
            return sum(times)

        def sum_call_times_weighted(j: JsonObject, key: AnyStr):
            """ Sums call times for a column weighted by the machine type used
            as a more accurate estimation of cost. """
            times = [
                j.get('calls').get(ck.full).get(key) *
                MachineTypeCostMultiplier[j.get('calls').get(ck.full).get('machineType')]
                for ck in call_keys
            ]
            return sum(times)

        # Treat the total time specially to be able to report percentages of time for
        # individual lifecycle phases.
        total_total_time_1, total_total_time_2 = [
            sum_call_times(j, PapiTotalTimeSeconds) for j in [json_1, json_2]
        ]

        for digester_key_name in digester_key_names:
            digester_key = digester_key_by_json_key(digester_key_name)
            top_header_row.append(f'{name_1} {digester_key.display_text}')
            top_header_row.append(f'{name_2} {digester_key.display_text}')
            top_header_row.append('% increase')

            # Total times within a column, not necessarily total time of the job.
            total_time_1, total_time_2 = [
                sum_call_times(j, digester_key_name) for j in [json_1, json_2]
            ]

            percent_contrib_row.append(f'{(total_time_1 / total_total_time_1) * 100:.2f}%')
            percent_contrib_row.append(f'{(total_time_2 / total_total_time_2) * 100:.2f}%')
            percent_contrib_row.append('')

            total_row.append(format_seconds(total_time_1))
            total_row.append(format_seconds(total_time_2))
            total_percent_increase = ((total_time_2 - total_time_1) * 100) / total_time_1
            total_row.append(f'{total_percent_increase:.2f}%')

            weighted_total_time_1, weighted_total_time_2 = [
                sum_call_times_weighted(j, digester_key_name) for j in [json_1, json_2]
            ]
            machine_type_weighted_row.append(format_seconds(weighted_total_time_1))
            machine_type_weighted_row.append(format_seconds(weighted_total_time_2))
            weighted_percent = ((weighted_total_time_2 - weighted_total_time_1) * 100) / weighted_total_time_1
            machine_type_weighted_row.append(f'{weighted_percent:.2f}%')

        return [top_header_row, percent_contrib_row, total_row, machine_type_weighted_row, []]

    rows = []
    # Produce data for individual lifecycle phases of individual calls.
    for call_key in call_keys_sorted_without_prefix:
        row = []
        call_1, call_2 = [j.get('calls').get(call_key.full) for j in [json_1, json_2]]
        row.append(call_key.without_prefix)
        row.append(call_1.get(MachineType))
        for digester_key_name in digester_key_names:
            time_1 = call_1.get(digester_key_name)
            time_2 = call_2.get(digester_key_name)
            row.append(format_seconds(time_1))
            row.append(format_seconds(time_2))
            if time_1:
                row.append(f'{((time_2 - time_1) * 100) / time_1:.2f}%')
            else:
                row.append('---')
        rows.append(row)

    return build_header_rows() + rows
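
# Illustrative sketch of serializing compare_jsons output; the file name and
# digest arguments are hypothetical. compare_jsons returns a list of rows, so
# Python's csv module can write it directly.
def _demo_write_comparison_csv(json_1: JsonObject, json_2: JsonObject) -> None:
    import csv

    rows = compare_jsons(json_1, json_2, 'baseline', 'candidate',
                         call_prefixes_to_remove=[], force=False)
    with open('comparison.csv', 'w', newline='') as out:
        csv.writer(out).writerows(rows)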