Example #1
def createFunctionEventForMemoryEvents(evt):
    # Nested helper: reads trace_start_us, max_evt_id, _cpu_memory_usage and
    # _cuda_memory_usage from its enclosing scope (cf. the parser in Example #3).
    rel_start_us = evt.start_us() - trace_start_us
    fe = FunctionEvent(
        id=max_evt_id,
        name=evt.name(),
        trace_name=None,  # not outputting in the trace
        thread=evt.start_thread_id(),
        start_us=rel_start_us,
        end_us=rel_start_us,  # no duration
        fwd_thread=evt.start_thread_id(),
        input_shapes=[],
        stack=[],
        scope=0,  # RecordScope::FUNCTION
        cpu_memory_usage=_cpu_memory_usage(evt),
        cuda_memory_usage=_cuda_memory_usage(evt),
        is_async=False,
        sequence_nr=-1,
        device_type=DeviceType.CPU,
        device_index=0,
    )
    return fe
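
A small usage sketch, assuming the mem_records list of [event, consumed-flag] pairs, the max_evt_id counter, and the function_events list from the parser in Example #3:

# Hypothetical driver: emit one zero-duration FunctionEvent per memory record
# that was not folded into any op-level event.
for mem_record in mem_records:
    if not mem_record[1]:  # consumed flag still False
        max_evt_id += 1
        function_events.append(createFunctionEventForMemoryEvents(mem_record[0]))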
Example #2
def parse_nvprof_trace(path):
    import sqlite3
    conn = sqlite3.connect(path)
    conn.row_factory = sqlite3.Row

    # Parse strings table
    strings = {}
    for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
        strings[r["id"]] = torch._C._demangle(r["value"])

    # First, find all functions and create FunctionEvents for them
    marker_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
        ON start.id = end.id
    WHERE
        start.name != 0 AND end.name = 0
    """
    functions = []
    functions_map = {}
    unique = EnforceUnique()
    for row in conn.execute(marker_query):
        unique.see(row['marker_id'])
        evt = FunctionEvent(
            id=row['marker_id'],
            # node_id is not available in this trace; pass 0 so that pytorch
            # does not crash when creating the FunctionEvent() object.
            node_id=0,
            name=strings[row['name']],
            start_us=row['start_time'],
            end_us=row['end_time'],
            thread=0)  # TODO: find in sqlite database
        functions.append(evt)
        functions_map[evt.id] = evt

    # Now, correlate all kernels with FunctionEvents
    kernel_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp, end.timestamp,
        runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
        kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start
        INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
            ON start.id = end.id
        INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
            ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
        INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
            ON kernel.correlationId = runtime.correlationId
    """
    unique = EnforceUnique()
    for row in conn.execute(kernel_query):
        unique.see(row['marker_id'], row['runtime_id'])
        # 211 is the CUPTI cbid for cudaLaunchKernel (CUDA >= 9.2)
        assert row['cbid'] == 211
        evt = functions_map[row['marker_id']]
        evt.append_kernel(row['kernel_name'], 0,
                          row['kernel_end'] - row['kernel_start'])

    functions.sort(key=lambda evt: evt.time_range.start)
    return functions
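
The EnforceUnique helper used above is not part of the snippet; a minimal sketch of the behavior it needs to provide (raising on a repeated key) could look like this:

class EnforceUnique:
    """Raises an error if the same key is seen more than once."""

    def __init__(self):
        self.seen = set()

    def see(self, *key):
        # key may be a single marker id or a tuple such as (marker_id, runtime_id)
        if key in self.seen:
            raise RuntimeError('duplicate key: ' + str(key))
        self.seen.add(key)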
Example #3
    def _parse_kineto_results(self, result):
        # result.events() has most of the events - PyTorch op-level and device-level events

        trace_start_us = result.trace_start_us()
        # [memory event, consumed flag] pairs; the flag is set once the record
        # has been attributed to an enclosing CPU op event below.
        mem_records = [[evt, False] for evt in result.events()
                       if evt.name() == MEMORY_EVENT_NAME]
        mem_records_acc = MemRecordsAcc(mem_records)

        def _cpu_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
                else 0

        def _cuda_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
                else 0

        # Create and return FunctionEvent list
        function_events = []
        cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
        max_evt_id = 0
        for kineto_event in result.events():
            if _filter_name(kineto_event.name()):
                continue
            rel_start_us = kineto_event.start_us() - trace_start_us
            rel_end_us = rel_start_us + kineto_event.duration_us()
            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()

            cpu_memory_usage = 0
            cuda_memory_usage = 0
            if kineto_event.device_type() == DeviceType.CPU:
                # find the corresponding memory allocation events
                for mem_record in mem_records_acc.in_interval(
                        kineto_event.start_us(), abs_end_us):
                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
                    mem_record[1] = True

            is_async = kineto_event.is_async() or (
                kineto_event.start_thread_id() != kineto_event.end_thread_id())

            fe = FunctionEvent(
                id=kineto_event.correlation_id(),
                name=_rewrite_name(name=kineto_event.name(),
                                   with_wildcard=True),
                trace_name=_rewrite_name(name=kineto_event.name(),
                                         with_wildcard=False),
                thread=kineto_event.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_end_us,
                fwd_thread=kineto_event.fwd_thread_id(),
                input_shapes=kineto_event.shapes(),
                stack=[
                    entry for entry in kineto_event.stack()
                    if _filter_stack_entry(entry)
                ],
                scope=kineto_event.scope(),
                cpu_memory_usage=cpu_memory_usage,
                cuda_memory_usage=cuda_memory_usage,
                is_async=is_async,
                sequence_nr=kineto_event.sequence_nr(),
                device_type=kineto_event.device_type(),
                device_index=kineto_event.device_index(),
                flops=kineto_event.flops(),
            )
            max_evt_id = fe.id if fe.id > max_evt_id else max_evt_id
            if fe.device_type == DeviceType.CPU and not fe.is_async:
                # Check if we have CUDA time as a fallback
                cuda_time = kineto_event.cuda_elapsed_us()
                if cuda_time > 0:
                    fe.append_kernel(fe.name, fe.device_index, cuda_time)
                    fe.is_legacy = True
            function_events.append(fe)
            corr_id = kineto_event.linked_correlation_id()
            if corr_id > 0:
                if corr_id not in cuda_corr_map:
                    cuda_corr_map[corr_id] = []
                cuda_corr_map[corr_id].append(fe)

        # associate CUDA kernels and CUDA runtime (CPU) with CPU events
        for fe in function_events:
            if (fe.device_type == DeviceType.CPU and not fe.is_async
                    and fe.id in cuda_corr_map):
                for f_evt in cuda_corr_map[fe.id]:
                    if f_evt.device_type == DeviceType.CUDA:
                        fe.append_kernel(
                            f_evt.name, f_evt.device_index,
                            f_evt.time_range.end - f_evt.time_range.start)
                    elif f_evt.device_type == DeviceType.CPU:
                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated
                        # with the 'thread' of the corresponding linked PyTorch event to properly track
                        # parents and children
                        f_evt.thread = fe.thread

        # output top-level memory events
        for mem_record in mem_records:
            if not mem_record[1]:
                rel_start_us = mem_record[0].start_us() - trace_start_us
                max_evt_id += 1
                fe = FunctionEvent(
                    id=max_evt_id,
                    name=MEMORY_EVENT_NAME,
                    trace_name=None,  # not outputting in the trace
                    thread=mem_record[0].start_thread_id(),
                    start_us=rel_start_us,
                    end_us=rel_start_us,  # no duration
                    fwd_thread=mem_record[0].start_thread_id(),
                    input_shapes=[],
                    stack=[],
                    scope=0,  # RecordScope::FUNCTION
                    cpu_memory_usage=_cpu_memory_usage(mem_record[0]),
                    cuda_memory_usage=_cuda_memory_usage(mem_record[0]),
                    is_async=False,
                    sequence_nr=-1,
                    device_type=DeviceType.CPU,
                    device_index=0,
                )
                function_events.append(fe)

        function_events.sort(
            key=lambda evt: [evt.time_range.start, -evt.time_range.end])
        return function_events
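
MemRecordsAcc, used above to find memory records that fall inside an op's time interval, is also not shown in the snippet; a minimal sketch based on bisecting over sorted start timestamps:

import bisect

class MemRecordsAcc:
    """Lookup structure for memory records by time interval."""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_uses = []
        self._indices = []
        if len(mem_records) > 0:
            tmp = sorted([(r[0].start_us(), i) for i, r in enumerate(mem_records)])
            self._start_uses, self._indices = zip(*tmp)

    def in_interval(self, start_us, end_us):
        # yield every record whose start timestamp lies within [start_us, end_us]
        start_idx = bisect.bisect_left(self._start_uses, start_us)
        end_idx = bisect.bisect_right(self._start_uses, end_us)
        for i in range(start_idx, end_idx):
            yield self._mem_records[self._indices[i]]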
Example #4
def _parse_legacy_records(thread_records):
    def _get_record_key(record):
        """
        Returns a tuple to be used by _parse_legacy_records for correlating start and
        end records.
        """
        return (record.handle(), record.node_id())

    next_id = 0
    start_record = None
    functions = []
    record_stack = []

    # '__start_profile' is not guaranteed to be first, so we must find it here
    for record in itertools.chain(*thread_records):
        name = record.name()
        if start_record is None and name == '__start_profile':
            start_record = record

    assert start_record is not None and not start_record.is_remote()

    for thread_record_list in thread_records:
        # accumulated memory allocations per handle
        cpu_memory_allocs = {}
        cuda_memory_allocs = {}
        # ranges per handle
        range_starts = {}

        filtered_handles = set()
        prev_record = None
        for record in thread_record_list:
            record_key = _get_record_key(record)
            if (_filter_name(record.name()) or record_key in filtered_handles):
                filtered_handles.add(record_key)
                continue

            if record.kind() == 'push':
                # workaround to reduce double logging from operator
                # wrappers and redispatch
                if prev_record is not None:
                    duplicate = (prev_record.name() == record.name()
                                 and prev_record.kind() == record.kind()
                                 and prev_record.node_id() == record.node_id())
                    if duplicate:
                        filtered_handles.add(record_key)
                        continue

                range_starts[record_key] = record
                cpu_memory_allocs[record_key] = 0
                cuda_memory_allocs[record_key] = 0
            elif record.kind() == 'pop':
                assert record_key in range_starts, (
                    "Expected record with key {} to exist in range_starts. "
                    "This means that the pop event did not have a corresponding push."
                    .format(record_key))

                start = range_starts[record_key]

                cpu_memory_usage = cpu_memory_allocs[record_key]
                cuda_memory_usage = cuda_memory_allocs[record_key]
                is_async = start.is_async() or (start.thread_id() !=
                                                record.thread_id())
                is_remote_event = record.is_remote()
                start_flops = start.flops()

                fe = FunctionEvent(
                    id=record.handle(),
                    node_id=record.node_id(),
                    name=_rewrite_name(name=start.name(), with_wildcard=True),
                    trace_name=_rewrite_name(name=start.name(),
                                             with_wildcard=False),
                    thread=start.thread_id(),
                    start_us=start_record.cpu_elapsed_us(start),
                    end_us=start_record.cpu_elapsed_us(record),
                    fwd_thread=start.fwd_thread_id(),
                    input_shapes=start.shapes(),
                    stack=[
                        entry for entry in start.stack()
                        if _filter_stack_entry(entry)
                    ],
                    scope=start.scope(),
                    cpu_memory_usage=cpu_memory_usage,
                    cuda_memory_usage=cuda_memory_usage,
                    is_async=is_async,
                    is_remote=is_remote_event,
                    sequence_nr=start.sequence_nr(),
                    device_type=DeviceType.CPU,
                    is_legacy=True,
                    flops=start_flops,
                )
                # note: async events have only cpu total time
                if not is_async and start.has_cuda():
                    duration = start.cuda_elapsed_us(record)
                    if duration > 0:
                        fe.append_kernel(start.name(), start.device(),
                                         duration)
                functions.append(fe)
                del range_starts[record_key]
                del cpu_memory_allocs[record_key]
                del cuda_memory_allocs[record_key]
            elif record.kind() == 'memory_alloc':
                num_open_handles_cpu = len(cpu_memory_allocs)
                num_open_handles_cuda = len(cuda_memory_allocs)
                assert num_open_handles_cpu == num_open_handles_cuda
                for handle in cpu_memory_allocs.keys():
                    cpu_memory_allocs[handle] += record.cpu_memory_usage()
                for handle in cuda_memory_allocs.keys():
                    cuda_memory_allocs[handle] += record.cuda_memory_usage()
                if num_open_handles_cpu == 0:
                    # output event as a top-level memory event
                    fe = FunctionEvent(
                        id=0,
                        name=MEMORY_EVENT_NAME,
                        trace_name=None,
                        thread=0,
                        start_us=0,
                        end_us=0,
                        stack=[],
                        cpu_memory_usage=record.cpu_memory_usage(),
                        cuda_memory_usage=record.cuda_memory_usage(),
                        is_legacy=True,
                    )
                    functions.append(fe)
            prev_record = record

    # Sort functions by start time ascending, then by end time descending.
    # For nested events that share the same start time (which may happen due
    # to the granularity of the given clock tick), this shows the outermost
    # nested call first, which keeps the ordering of FunctionEvents stable.
    functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
    return functions
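
As a quick illustration of the sort key [start, -end] used at the end of Examples #3 and #4, with made-up (start, end) pairs rather than a real trace:

# The two ranges starting at 10 are nested; sorting by [start, -end] puts the
# outer one first.
ranges = [(10, 12), (5, 8), (10, 20)]
ranges.sort(key=lambda tr: [tr[0], -tr[1]])
print(ranges)  # [(5, 8), (10, 20), (10, 12)]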