def _parse_kineto_results(self, result): # result.events() has most of the events - PyTorch op-level and device-level events trace_start_us = result.trace_start_us() mem_records = [[evt, False] for evt in result.events() if evt.name() == MEMORY_EVENT_NAME] mem_records_acc = MemRecordsAcc(mem_records) def _cpu_memory_usage(mem_record): return mem_record.nbytes() if \ mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \ else 0 def _cuda_memory_usage(mem_record): return mem_record.nbytes() if \ mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \ else 0 # Create and return FunctionEvent list function_events = [] cuda_corr_map: Dict[int, List[FunctionEvent]] = {} max_evt_id = 0 for kineto_event in result.events(): if _filter_name(kineto_event.name()): continue rel_start_us = kineto_event.start_us() - trace_start_us rel_end_us = rel_start_us + kineto_event.duration_us() abs_end_us = kineto_event.start_us() + kineto_event.duration_us() cpu_memory_usage = 0 cuda_memory_usage = 0 if kineto_event.device_type() == DeviceType.CPU: # find the corresponding memory allocation events for mem_record in mem_records_acc.in_interval( kineto_event.start_us(), abs_end_us): cpu_memory_usage += _cpu_memory_usage(mem_record[0]) cuda_memory_usage += _cuda_memory_usage(mem_record[0]) mem_record[1] = True is_async = kineto_event.is_async() or ( kineto_event.start_thread_id() != kineto_event.end_thread_id()) fe = FunctionEvent( id=kineto_event.correlation_id(), name=_rewrite_name(name=kineto_event.name(), with_wildcard=True), trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False), thread=kineto_event.start_thread_id(), start_us=rel_start_us, end_us=rel_end_us, fwd_thread=kineto_event.fwd_thread_id(), input_shapes=kineto_event.shapes(), stack=[ entry for entry in kineto_event.stack() if _filter_stack_entry(entry) ], scope=kineto_event.scope(), cpu_memory_usage=cpu_memory_usage, cuda_memory_usage=cuda_memory_usage, is_async=is_async, sequence_nr=kineto_event.sequence_nr(), device_type=kineto_event.device_type(), device_index=kineto_event.device_index(), flops=kineto_event.flops(), ) max_evt_id = fe.id if fe.id > max_evt_id else max_evt_id if fe.device_type == DeviceType.CPU and not fe.is_async: # Check if we have CUDA time as a fallback cuda_time = kineto_event.cuda_elapsed_us() if cuda_time > 0: fe.append_kernel(fe.name, fe.device_index, cuda_time) fe.is_legacy = True function_events.append(fe) corr_id = kineto_event.linked_correlation_id() if corr_id > 0: if corr_id not in cuda_corr_map: cuda_corr_map[corr_id] = [] cuda_corr_map[corr_id].append(fe) # associate CUDA kernels and CUDA runtime (CPU) with CPU events for fe in function_events: if (fe.device_type == DeviceType.CPU and not fe.is_async and fe.id in cuda_corr_map): for f_evt in cuda_corr_map[fe.id]: if f_evt.device_type == DeviceType.CUDA: fe.append_kernel( f_evt.name, f_evt.device_index, f_evt.time_range.end - f_evt.time_range.start) elif f_evt.device_type == DeviceType.CPU: # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated # with the 'thread' of the corresponding linked PyTorch event to properly track # parents and children f_evt.thread = fe.thread # output top-level memory events for mem_record in mem_records: if not mem_record[1]: rel_start_us = mem_record[0].start_us() - trace_start_us max_evt_id += 1 fe = FunctionEvent( id=max_evt_id, name=MEMORY_EVENT_NAME, trace_name=None, # not outputting in the trace thread=mem_record[0].start_thread_id(), start_us=rel_start_us, end_us=rel_start_us, # no duration fwd_thread=mem_record[0].start_thread_id(), input_shapes=[], stack=[], scope=0, # RecordScope::FUNCTION cpu_memory_usage=_cpu_memory_usage(mem_record[0]), cuda_memory_usage=_cuda_memory_usage(mem_record[0]), is_async=False, sequence_nr=-1, device_type=DeviceType.CPU, device_index=0, ) function_events.append(fe) function_events.sort( key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events
def parse_nvprof_trace(path): import sqlite3 conn = sqlite3.connect(path) conn.row_factory = sqlite3.Row # Parse strings table strings = {} for r in conn.execute("SELECT _id_ as id, value FROM StringTable"): strings[r["id"]] = torch._C._demangle(r["value"]) # First, find all functions and create FunctionEvents for them marker_query = """ SELECT start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time FROM CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end ON start.id = end.id WHERE start.name != 0 AND end.name = 0 """ functions = [] functions_map = {} unique = EnforceUnique() for row in conn.execute(marker_query): unique.see(row['marker_id']) evt = FunctionEvent( id=row['marker_id'], node_id= 0, # missing a node_id when calling FunctionEvent. This is just to ensure # that pytorch doesn't crash when creating a FunctionEvent() object name=strings[row['name']], start_us=row['start_time'], end_us=row['end_time'], thread=0) # TODO: find in sqlite database functions.append(evt) functions_map[evt.id] = evt # Now, correlate all kernels with FunctionEvents kernel_query = """ SELECT start.id AS marker_id, start.name, start.timestamp, end.timestamp, runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end, kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name FROM CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end ON start.id = end.id INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime ON (start.timestamp < runtime.start AND runtime.end < end.timestamp) INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel ON kernel.correlationId = runtime.correlationId """ unique = EnforceUnique() for row in conn.execute(kernel_query): unique.see(row['marker_id'], row['runtime_id']) # 211 is cudaKernelLaunch for cuda >= 9.2 assert (row['cbid'] == 211) evt = functions_map[row['marker_id']] evt.append_kernel(row['kernel_name'], 0, row['kernel_end'] - row['kernel_start']) functions.sort(key=lambda evt: evt.time_range.start) return functions
def _parse_legacy_records(thread_records): def _get_record_key(record): """ Returns a tuple to be used by _parse_legacy_records for correlating start and end records. """ return (record.handle(), record.node_id()) next_id = 0 start_record = None functions = [] record_stack = [] # '__start_profile' is not guaranteed to be first, so we must find it here for record in itertools.chain(*thread_records): name = record.name() if start_record is None and name == '__start_profile': start_record = record assert start_record is not None and not start_record.is_remote() for thread_record_list in thread_records: # accumulated memory allocations per handle cpu_memory_allocs = {} cuda_memory_allocs = {} # ranges per handle range_starts = {} filtered_handles = set() prev_record = None for record in thread_record_list: record_key = _get_record_key(record) if (_filter_name(record.name()) or record_key in filtered_handles): filtered_handles.add(record_key) continue if record.kind() == 'push': # workaround to reduce double logging from operator # wrappers and redispatch if prev_record is not None: duplicate = (prev_record.name() == record.name() and prev_record.kind() == record.kind() and prev_record.node_id() == record.node_id()) if duplicate: filtered_handles.add(record_key) continue range_starts[record_key] = record cpu_memory_allocs[record_key] = 0 cuda_memory_allocs[record_key] = 0 elif record.kind() == 'pop': assert ( record_key in range_starts ), """Expected record with key {} to exist in range_starts. This means that the pop event did not have a corresponding push.""".format( record_key) start = range_starts[record_key] cpu_memory_usage = cpu_memory_allocs[record_key] cuda_memory_usage = cuda_memory_allocs[record_key] is_async = start.is_async() or (start.thread_id() != record.thread_id()) is_remote_event = record.is_remote() start_flops = start.flops() fe = FunctionEvent( id=record.handle(), node_id=record.node_id(), name=_rewrite_name(name=start.name(), with_wildcard=True), trace_name=_rewrite_name(name=start.name(), with_wildcard=False), thread=start.thread_id(), start_us=start_record.cpu_elapsed_us(start), end_us=start_record.cpu_elapsed_us(record), fwd_thread=start.fwd_thread_id(), input_shapes=start.shapes(), stack=[ entry for entry in start.stack() if _filter_stack_entry(entry) ], scope=start.scope(), cpu_memory_usage=cpu_memory_usage, cuda_memory_usage=cuda_memory_usage, is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), device_type=DeviceType.CPU, is_legacy=True, flops=start_flops, ) # note: async events have only cpu total time if not is_async and start.has_cuda(): duration = start.cuda_elapsed_us(record) if duration > 0: fe.append_kernel(start.name(), start.device(), duration) functions.append(fe) del range_starts[record_key] del cpu_memory_allocs[record_key] del cuda_memory_allocs[record_key] elif record.kind() == 'memory_alloc': num_open_handles_cpu = len(cpu_memory_allocs) num_open_handles_cuda = len(cuda_memory_allocs) assert num_open_handles_cpu == num_open_handles_cuda for handle in cpu_memory_allocs.keys(): cpu_memory_allocs[handle] += record.cpu_memory_usage() for handle in cuda_memory_allocs.keys(): cuda_memory_allocs[handle] += record.cuda_memory_usage() if num_open_handles_cpu == 0: # output event as a top-level memory event fe = FunctionEvent( id=0, name=MEMORY_EVENT_NAME, trace_name=None, thread=0, start_us=0, end_us=0, stack=[], cpu_memory_usage=record.cpu_memory_usage(), cuda_memory_usage=record.cuda_memory_usage(), is_legacy=True, ) functions.append(fe) prev_record = record # Sort functions by start time then by end time ascending. # This ensures that--in the case of nested events which # have the same start time (which may happen due to the # granularity of the given clock tick)--we always show # the outermost nested call first. This adds stability # in how FunctionEvents appear functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return functions