def _preprocess_logs(self, logs, op_type_list, _nvtx_meta_data_dict): """ get the op_type counted in the profile. get the scheduling list that needs to be analyse. """ flag_nvtx_time_start = False nvtx_time_start_step = 0 for i in range(len(logs)): line = api_param.parse_string(logs[i]) if flag_nvtx_time_start: infos = line.strip().split() if not infos: continue nvtx_range_type = infos[-1] if nvtx_range_type in ['pybind_imperative_func', 'compute']: op_type = infos[-2] if op_type not in op_type_list and '_grad' not in op_type: op_type_list.append(op_type) _nvtx_meta_data_dict[op_type + ' pybind_imperative_func'] = None _nvtx_meta_data_dict[op_type] = None _nvtx_meta_data_dict[op_type + ' compute'] = None _nvtx_meta_data_dict[op_type + '_grad'] = None _nvtx_meta_data_dict[op_type + '_grad compute'] = None if not flag_nvtx_time_start and 'NVTX Push-Pop Range Statistics:' in line: flag_nvtx_time_start = True nvtx_time_start_step = i return nvtx_time_start_step
def __init__(self, case_detail, compare_framework): self.case_name = case_detail["name"] self.op_type = parse_op_type(self.case_name) self.disabled = case_detail.get("disabled", False) self.compare_framework = compare_framework if case_detail.get("parameters", None): parameters = api_param.parse_string(case_detail["parameters"]) parameters = parameters.replace(" ", "") parameters = parameters.replace("\n", " ") else: parameters = "--" self.parameters = parameters self.gpu_forward = {} self.gpu_backward = {} self.gpu_backward_forward = {} self.cpu_forward = {} self.cpu_backward = {} self.cpu_backward_forward = {} for device in ["gpu", "cpu"]: for direction in ["forward", "backward", "backward_forward"]: attr_name = device + "_" + direction result = getattr(self, attr_name) paddle_total, paddle_gpu_time, paddle_gflops, paddle_gbs = self._get_case_value( case_detail, "paddle", device, "speed", direction) result["paddle"] = { "total": paddle_total, "gpu_time": paddle_gpu_time, "gflops": paddle_gflops, "gbs": paddle_gbs } competitor_total, competitor_gpu_time, competitor_gflops, competitor_gbs = self._get_case_value( case_detail, self.compare_framework, device, "speed", direction) result[self.compare_framework] = { "total": competitor_total, "gpu_time": competitor_gpu_time, "gflops": competitor_gflops, "gbs": competitor_gbs } total_result, total_ratio = _compare(paddle_total, competitor_total) gpu_time_result, gpu_time_ratio = _compare(paddle_gpu_time, competitor_gpu_time) result["compare"] = { "total": total_result, "total_ratio": total_ratio, "gpu_time": gpu_time_result, "gpu_time_ratio": gpu_time_ratio } accuracy, difference = self._get_case_value( case_detail, "paddle", device, "accuracy", direction) result["accuracy"] = str(accuracy) result["difference"] = str(difference)
def _parse_logs(self, logs): kernel_line_from = None kernel_line_to = None memcpy_line_from = None memcpy_line_to = None for i in range(len(logs)): line = api_param.parse_string(logs[i]) if "CUDA Kernel Statistics:" in line: kernel_line_from = i for j in range(i + 2, len(logs)): if logs[j] == "": kernel_line_to = j break if "CUDA Memory Operation Statistics (by time):" in line: memcpy_line_from = i for j in range(i + 2, len(logs)): if logs[j] == "": memcpy_line_to = j break parse_status = False kernel_gpu_time = 0.0 if kernel_line_from is not None and kernel_line_to is not None: for i in range(kernel_line_from, kernel_line_to): print(logs[i]) print("") parse_status = True kernel_gpu_time = self._parse_gpu_time(logs[kernel_line_from + 4]) memcpy_gpu_time = 0.0 if memcpy_line_from is not None and memcpy_line_to is not None: for i in range(memcpy_line_from, memcpy_line_to): print(logs[i]) print("") parse_status = True memcpy_gpu_time = self._parse_gpu_time(logs[memcpy_line_from + 4]) total_gpu_time = kernel_gpu_time + memcpy_gpu_time if total_gpu_time != 0.0: print( "total gpu_time: {:.4f} ms (kernel: {:.4f} ms ({:.2f}%); memcpy: {:.4f} ms ({:.2f}%))" .format(total_gpu_time, kernel_gpu_time, kernel_gpu_time * 100 / total_gpu_time, memcpy_gpu_time, memcpy_gpu_time * 100 / total_gpu_time)) else: print( "total gpu_time: {:.4f} ms (kernel: {:.4f} ms; memcpy: {:.4f} ms)" .format(total_gpu_time, kernel_gpu_time, memcpy_gpu_time)) print("") return parse_status, total_gpu_time
def _parse_nvprof_logs(logs): line_from = None line_to = None total_gpu_time = 0.0 for i in range(len(logs)): line = api_param.parse_string(logs[i]) if "GPU activities:" in line: line_from = i - 1 if line_from is not None and "API calls:" in line: line_to = i - 1 if line_from is not None and line_to is not None: for i in range(line_from, line_to): print(logs[i]) print("") return True, _parse_gpu_time(logs[line_from + 1]) else: return False, 0.0
def __init__(self, case_detail): self.case_name = case_detail["name"] self.op_type = parse_op_type(self.case_name) if case_detail.get("parameters", None): parameters = api_param.parse_string(case_detail["parameters"]) parameters = parameters.replace(" ", "") parameters = parameters.replace("\n", " ") else: parameters = "--" self.parameters = parameters self.gpu_forward = {} self.gpu_backward = {} self.cpu_forward = {} self.cpu_backward = {} for device in ["gpu", "cpu"]: for direction in ["forward", "backward"]: attr_name = device + "_" + direction result = getattr(self, attr_name) paddle_total, paddle_gpu_time = self._get_case_value( case_detail, "paddle", device, "speed", direction) result["paddle"] = { "total": paddle_total, "gpu_time": paddle_gpu_time } tf_total, tf_gpu_time = self._get_case_value( case_detail, "tensorflow", device, "speed", direction) result["tensorflow"] = { "total": tf_total, "gpu_time": tf_gpu_time } result["compare"] = { "total": _compare(paddle_total, tf_total), "gpu_time": _compare(paddle_gpu_time, tf_gpu_time) } accuracy, difference = self._get_case_value( case_detail, "paddle", device, "accuracy", direction) result["accuracy"] = str(accuracy) result["difference"] = str(difference)
def _parse_logs(self, logs, profile_start_step, profile_end_step, sync_interval): """ parse logs to analyse performance. """ parse_status = False total_step_time = 0.0 total_op_call_time_per_step = 0.0 # num step of using profile num_step = profile_end_step - profile_start_step # Profile data in start_step and end_step may be not correct, # so we need to select some reliable data. Number of reliable # step data is step_count. step_count = 0 op_type_list = [] # scheduling time: # op_type pybind_imperative_func (imperative_avg_time) # op_type (fwd_trace_op_avg_time) # op_type compute (fwd_op_compute_avg_time) # op_type_grad (bwd_trace_op_avg_time) # op_type_grad compute (bwd_op_compute_avg_time) _nvtx_meta_data_dict = {} scheduling_time_dict = {} # preprocess logs to get op_type appeared in logs and # initialize data in _nvtx_meta_data_dict to None. nvtx_time_start_step = self._preprocess_logs(logs, op_type_list, _nvtx_meta_data_dict) # parse report to get meta scheduling time for i in range(nvtx_time_start_step, len(logs)): line = api_param.parse_string(logs[i]) infos = line.strip().split() if not infos: continue nvtx_range_type = infos[-1] if nvtx_range_type in ['pybind_imperative_func', 'compute']: nvtx_range_type = infos[-2] + ' ' + nvtx_range_type # step time if nvtx_range_type.isdigit( ) and int(nvtx_range_type) > profile_start_step and int( nvtx_range_type ) < profile_end_step - 1 and int(nvtx_range_type) % sync_interval: step_count += 1 step_time = self._to_float(infos[1]) * 1E-6 total_step_time += step_time # nvtx time if nvtx_range_type in _nvtx_meta_data_dict: avg_time = self._calculate_avg_time_per_op(infos) _nvtx_meta_data_dict[nvtx_range_type] = round(avg_time, 6) if '_grad' in nvtx_range_type and 'compute' not in nvtx_range_type or 'pybind_imperative_func' in nvtx_range_type: total_op_call_time_per_step += self._calculate_avg_time_per_step( infos, num_step) # analyse scheduling time scheduling_time_dict['step_time'] = round( total_step_time / step_count, 6) if step_count != 0 else None scheduling_time_dict['op_call_time_per_step'] = round( total_op_call_time_per_step, 6) scheduling_time_dict[ 'python_call_time'] = self._calculate_scheduling_time( scheduling_time_dict['step_time'], scheduling_time_dict['op_call_time_per_step']) for op_type in op_type_list: scheduling_time_dict[ op_type] = self._get_scheduling_time_from_meta_data( op_type, _nvtx_meta_data_dict) parse_status = True return parse_status, op_type_list, scheduling_time_dict