def test_execution_graph_start_stop(self):
    use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
    # Create a temp file to save execution graph data.
    fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
    fp.close()
    expected_loop_events = 0
    eg = ExecutionGraphObserver()
    eg.register_callback(fp.name)
    for idx in range(10):
        if idx == 3:
            eg.start()
        elif idx == 5:
            eg.stop()
        elif idx == 8:
            eg.start()
        elif idx == 9:
            eg.stop()
            eg.unregister_callback()
        if eg._execution_graph_running:
            expected_loop_events += 1
        with record_function(f"## LOOP {idx} ##"):
            self.payload(use_cuda=use_cuda)

    assert fp.name == eg.get_output_file_path()
    nodes = self.get_execution_graph_root(fp.name)
    loop_count = 0
    found_root_node = False
    for n in nodes:
        assert "name" in n
        if "[pytorch|profiler|execution_graph|process]" in n["name"]:
            found_root_node = True
        if n["name"].startswith("## LOOP "):
            loop_count += 1
    assert found_root_node
    assert loop_count == expected_loop_events
def test_execution_graph_no_capture(self):
    fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
    fp.close()
    eg = ExecutionGraphObserver()
    eg.register_callback(fp.name)
    eg.unregister_callback()

    assert fp.name == eg.get_output_file_path()
    nodes = self.get_execution_graph_root(fp.name)
    found_root_node = False
    for n in nodes:
        assert "name" in n
        if "[pytorch|profiler|execution_graph|process]" in n["name"]:
            found_root_node = True
    assert found_root_node
def test_execution_graph_with_kineto(self):
    trace_called_num = 0

    def trace_handler(p):
        nonlocal trace_called_num
        trace_called_num += 1

    use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
    # Create a temp file to save execution graph data.
    fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
    fp.close()
    expected_loop_events = 0
    eg = ExecutionGraphObserver()
    eg.register_callback(fp.name)
    with profile(
        activities=supported_activities(),
        schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2),
        on_trace_ready=trace_handler,
    ) as p:
        eg.start()
        for idx in range(10):
            expected_loop_events += 1
            with record_function(f"## LOOP {idx} ##"):
                self.payload(use_cuda=use_cuda)
            p.step()
        eg.stop()
        eg.unregister_callback()

    assert trace_called_num == 2
    assert fp.name == eg.get_output_file_path()
    nodes = self.get_execution_graph_root(fp.name)
    loop_count = 0
    found_root_node = False
    for n in nodes:
        assert "name" in n
        if "[pytorch|profiler|execution_graph|process]" in n["name"]:
            found_root_node = True
        if n["name"].startswith("## LOOP "):
            loop_count += 1
    assert found_root_node
    assert loop_count == expected_loop_events
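# The three tests above call self.get_execution_graph_root(...), which is not part of
# this excerpt. Below is a minimal sketch of such a helper, assuming the execution
# graph file is JSON with a top-level "nodes" list; the real helper may validate more
# of the file structure.
def get_execution_graph_root(self, output_file_name):
    import json  # local import to keep this sketch self-contained

    # Load the saved execution graph and return its list of nodes.
    with open(output_file_name) as f:
        eg_graph = json.load(f)
    assert "nodes" in eg_graph
    return eg_graph["nodes"]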
def main():
    parser = argparse.ArgumentParser(description="PyTorch Microbenchmarks")
    parser.add_argument("-c", "--config", type=str, help="The benchmark config file.")
    parser.add_argument(
        "-w", "--warmup", type=int, default=1, help="Number of warm up iterations."
    )
    parser.add_argument(
        "-i", "--iteration", type=int, default=1, help="Number of benchmark iterations."
    )
    parser.add_argument(
        "-b", "--backward", action="store_true", help="Include backward pass."
    )
    parser.add_argument(
        "-d", "--device", type=str, default="cpu", help="Target device for benchmark."
    )
    parser.add_argument(
        "-o",
        "--output-prefix",
        type=str,
        default="benchmark_result",
        help="File name prefix to write benchmark results.",
    )
    parser.add_argument(
        "-r",
        "--resume-id",
        type=str,
        default=None,
        help="Define a resume op_run_id to continue benchmark, skip all previous configs.",
    )
    parser.add_argument(
        "-s",
        "--stop_id",
        type=str,
        default=None,
        help="Define a stop op_run_id (exclusive) to stop benchmark, skip remaining configs.",
    )
    parser.add_argument(
        "-a",
        "--append",
        action="store_true",
        help="Append to output file, rather than overwrite.",
    )
    parser.add_argument(
        "--cuda-l2-cache",
        default="off",
        nargs="?",
        choices=["on", "off"],
        help="Set option for CUDA GPU L2 cache between iterations in discrete mode.",
    )
    parser.add_argument(
        "--ncu", action="store_true", help="Run NSight Compute to collect metrics."
    )
    parser.add_argument(
        "--ncu-bin",
        type=str,
        default=None,
        help="Path to the NSight Compute (ncu) binary.",
    )
    parser.add_argument(
        "--ncu-args-file",
        type=str,
        default=None,
        help="NSight Compute extra command line options (metrics etc.).",
    )
    parser.add_argument(
        "--ncu-warmup",
        type=int,
        default=None,
        help="NSight Compute number of warmup runs.",
    )
    parser.add_argument(
        "--ncu-iteration",
        type=int,
        default=None,
        help="NSight Compute number of measured iteration runs.",
    )
    parser.add_argument(
        "--nsys", action="store_true", help="Run NSight Systems to collect metrics."
    )
    parser.add_argument(
        "--nsys-bin",
        type=str,
        default=None,
        help="Path to the NSight Systems (nsys) binary.",
    )
    parser.add_argument(
        "--nsys-args-file",
        type=str,
        default=None,
        help="NSight Systems extra command line options (metrics etc.).",
    )
    parser.add_argument(
        "--nsys-warmup",
        type=int,
        default=None,
        help="NSight Systems number of warmup runs.",
    )
    parser.add_argument(
        "--nsys-iteration",
        type=int,
        default=None,
        help="NSight Systems number of measured iteration runs.",
    )
    parser.add_argument(
        "--run-batch-size",
        type=int,
        default=50,
        help="Batch run input size (number of input configs to run in one launch), used by both NCU and NSYS.",
    )
    parser.add_argument(
        "--batch-cuda-device",
        type=int,
        default=1,
        help="CUDA GPU device ID to run batch job.",
    )
    parser.add_argument(
        "--batch-cmd",
        type=str,
        default=None,
        help="Run batch job command.",
    )
    parser.add_argument(
        "--exec-mode",
        type=str,
        default="discrete",
        nargs="?",
        choices=["discrete", "continuous", "continuous_events"],
        help="Set execution mode of the operators (discrete, continuous, continuous_events). Default=discrete",
    )
    parser.add_argument(
        "-p",
        "--profile",
        action="store_true",
        help="Enable profiler and tracing.",
    )
    parser.add_argument(
        "--eg",
        action="store_true",
        help="Collect execution graph.",
    )
    parser.add_argument(
        "-l", "--log-level", default="INFO", help="Log output verbosity."
    )
    parser.add_argument("--version", action="store_true", help="Print version.")

    args = parser.parse_args()

    logger = init_logging(getattr(logging, args.log_level.upper(), logging.INFO))

    if args.version:
        logger.info(f"PARAM train compute version: {__version__}")
        return
    elif not args.config:
        parser.print_usage()
        return

    # Load PyTorch implementations for data generator and operators.
    load_modules(lib_pytorch)

    # Load PyTorch operator workloads.
    load_modules(workloads_pytorch)

    run_options = get_benchmark_options()
    run_options["warmup"] = args.warmup
    run_options["iteration"] = args.iteration
    run_options["device"] = args.device
    run_options["cuda_l2_cache"] = args.cuda_l2_cache == "on"
    run_options["resume_op_run_id"] = args.resume_id
    run_options["stop_op_run_id"] = args.stop_id
    run_options["run_batch_size"] = args.run_batch_size
    run_options["batch_cuda_device"] = args.batch_cuda_device

    if args.backward:
        run_options["pass_type"] = ExecutionPass.BACKWARD
    else:
        run_options["pass_type"] = ExecutionPass.FORWARD

    run_options["op_exec_mode"] = OpExecutionMode(args.exec_mode)
    run_options["run_ncu"] = args.ncu
    run_options["run_nsys"] = args.nsys

    pid = os.getpid()
    start_time = datetime.now()
    timestamp = int(datetime.timestamp(start_time))

    out_file_prefix = f"{args.output_prefix}_{pid}_{timestamp}"
    out_file_name = f"{out_file_prefix}.json"

    write_option = "a" if args.append else "w"

    if args.batch_cmd:
        run_options["batch_cmd"] = args.batch_cmd

    if args.ncu_bin:
        run_options["ncu_bin"] = args.ncu_bin
    if args.ncu_warmup:
        run_options["ncu_warmup"] = args.ncu_warmup
    if args.ncu_iteration:
        run_options["ncu_iteration"] = args.ncu_iteration
    if args.ncu_args_file:
        with open(args.ncu_args_file, "r") as ncu_file:
            run_options["ncu_args"] = ncu_file.read().strip()

    if args.nsys_bin:
        run_options["nsys_bin"] = args.nsys_bin
    if args.nsys_warmup:
        run_options["nsys_warmup"] = args.nsys_warmup
    if args.nsys_iteration:
        run_options["nsys_iteration"] = args.nsys_iteration
    if args.nsys_args_file:
        with open(args.nsys_args_file, "r") as nsys_file:
            run_options["nsys_args"] = nsys_file.read().strip()

    run_options["cmd_args"] = args.__dict__

    with open(out_file_name, write_option) as out_file:
        run_options["out_file_prefix"] = args.output_prefix
        run_options["out_stream"] = out_file

        benchmark_setup = {
            "run_options": run_options,
            "sys_info": get_sys_info(),
            "start_time": start_time.isoformat(timespec="seconds"),
        }
        print(json.dumps(benchmark_setup, default=str), file=out_file)

        bench_config = BenchmarkConfig(run_options)
        bench_config.load_json_file(args.config)
        benchmark = make_default_benchmark(bench_config)

        use_cuda = False
        if run_options["device"].startswith("cuda"):
            use_cuda = True

        eg = None
        if args.eg:
            eg_file = f"{out_file_prefix}_eg.json"
            eg = ExecutionGraphObserver()
            eg.register_callback(eg_file)
            eg.start()

        with torch.autograd.profiler.profile(
            args.profile, use_cuda=use_cuda, use_kineto=True, record_shapes=False
        ) as prof:
            with record_function(f"[param|{run_options['device']}]"):
                benchmark.run()

        if eg:
            eg.stop()
            eg.unregister_callback()
            logger.info(f"execution graph: {eg_file}")

        print(
            json.dumps({"finish_time": datetime.now().isoformat(timespec="seconds")}),
            file=out_file,
        )
        if args.profile and prof:
            trace_file = f"{out_file_prefix}_trace.json"
            logger.info(f"trace: {trace_file}")
            prof.export_chrome_trace(trace_file)
            print(json.dumps({"trace_file": trace_file}), file=out_file)

    logger.info(f"benchmark result: {out_file_name}")
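# A possible entry point and invocation, assuming this module is run directly as a
# script (the file name below is illustrative). All flags come from the parser defined
# in main() above, e.g. run the ops in config.json on GPU with 5 warmup and 100
# measured iterations while collecting a Kineto trace (-p) and an execution graph (--eg):
#
#   python run_benchmark.py -c config.json -d cuda -w 5 -i 100 -p --eg
if __name__ == "__main__":
    main()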
def benchTime(self):
    self.preprocess_graph()
    print("Start execution: ")
    time.sleep(10)
    total_time = 0.0
    event_1 = torch.cuda.Event(enable_timing=True)
    event_2 = torch.cuda.Event(enable_timing=True)

    eg_file = "/tmp/replay_eg.json"
    eg = ExecutionGraphObserver()
    eg.register_callback(eg_file)

    if self.profile_replay:
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            record_shapes=True,
            # schedule=torch.profiler.schedule(
            #     skip_first=10,
            #     wait=10,
            #     warmup=10,
            #     active=10,
            # ),
            on_trace_ready=trace_handler,
            # profile_memory=True,
        ) as prof:
            for iter in range(self.numWarmupIters + self.numIters):
                if iter == self.numWarmupIters:
                    eg.start()
                if iter == self.numWarmupIters + 1:
                    eg.stop()
                    eg.unregister_callback()
                event_1.record()
                for node in self.sorted_nodes:
                    self.run_op(node)
                event_2.record()
                torch.cuda.synchronize()
                if iter >= self.numWarmupIters:
                    total_time += event_1.elapsed_time(event_2)
                # Commented out for now since it introduces additional cudaMalloc.
                # self.reset_registry()
                prof.step()
                # print(iter, torch.cuda.memory_allocated(self.cuda))
            # print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=20))
    else:
        for iter in range(self.numWarmupIters + self.numIters):
            event_1.record()
            for node in self.sorted_nodes:
                self.run_op(node)
            event_2.record()
            torch.cuda.synchronize()
            if iter >= self.numWarmupIters:
                total_time += event_1.elapsed_time(event_2)
            # Commented out for now since it introduces additional cudaMalloc.
            # self.reset_registry()

    if self.profile_memory:
        print("Allocated GPU memory(B):")
        for node in dict(
            sorted(self.op_allocated_mem.items(), key=lambda item: item[1], reverse=True)[:100]
        ):
            print(node.id, self.op_allocated_mem[node])
        print("Reserved GPU memory(B):")
        for node in dict(
            sorted(self.op_reserved_mem.items(), key=lambda item: item[1], reverse=True)[:100]
        ):
            print(node.id, self.op_reserved_mem[node])
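# benchTime() above passes a trace_handler callback to on_trace_ready that is not shown
# in this excerpt. Below is a minimal sketch of such a handler, following the pattern in
# the torch.profiler documentation; the output path is hypothetical and the real handler
# may also print a summary table.
def trace_handler(prof):
    # Export the Kineto trace collected for the completed profiling step.
    prof.export_chrome_trace(f"/tmp/replay_trace_{prof.step_num}.json")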