def wrap_forward(*args, **kwargs):
    with tprofiler.profile(
        use_cuda=self.use_cuda,
        profile_memory=self.profile_memory,
    ) as prof:
        res = _forward(*args, **kwargs)
    event_list = prof.function_events
    event_list.populate_cpu_children()
    # each profile call should be contained in its own list
    self.profile_events[path].append(event_list)
    return res
def _start_warmup(self):
    self.profiler = prof.profile(
        use_cuda=(ProfilerActivity.CUDA in self.activities),
        use_cpu=(ProfilerActivity.CPU in self.activities),
        record_shapes=self.record_shapes,
        with_flops=self.with_flops,
        profile_memory=self.profile_memory,
        with_stack=self.with_stack,
        use_kineto=True,
    )
    self.profiler._prepare_kineto_trace()
def test():
    with profiler.profile(profile_memory=True, record_shapes=True) as prof:
        with profiler.record_function("model_inference"):
            model.eval()
            output = model(features, adj)

    loss_test = F.nll_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
def profile_one_step(func, nwarmup=3):
    for i in range(nwarmup):
        func()

    use_cuda = args.device == "cuda"
    with profiler.profile(record_shapes=True, use_cuda=use_cuda) as prof:
        func()

    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=30))
def eval(dataloader, model_str, model, device, loss, highest_accuracy,
         save_model, trace):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Switch model to evaluation mode
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            # Evaluate the model on the test input
            if trace:
                with profiler.profile(record_shapes=True, with_stack=True,
                                      profile_memory=True) as prof:
                    with profiler.record_function("model_inference"):
                        pred = model(X)
                print(prof.key_averages().table(sort_by="cpu_time_total",
                                                row_limit=1000))
                break
            else:
                pred = model(X)
                test_loss += loss(pred, y).to("cpu")
                correct += (pred.to("cpu").argmax(1) == y.to("cpu")).type(
                    torch.float).sum()

    if not trace:
        test_loss /= num_batches
        correct /= size
        if correct.item() > highest_accuracy:
            highest_accuracy = correct.item()
            print("current highest_accuracy: ", highest_accuracy)

            # save model
            if save_model:
                state_dict = collections.OrderedDict()
                for key in model.state_dict().keys():
                    state_dict[key] = model.state_dict()[key].to("cpu")
                checkpoint = get_checkpoint_folder(model_str, device)
                torch.save(state_dict, checkpoint)

        print(f"Test Error: \n Accuracy: {(100*correct.item()):>0.1f}%, "
              f"Avg loss: {test_loss.item():>8f} \n")
    return highest_accuracy
def test_source(self):
    """Checks that source code attribution works for eager, TS and autograd mode
    """
    # avoid automatic inlining
    prev_opt = torch._C._get_graph_executor_optimize()
    torch._C._set_graph_executor_optimize(False)

    @torch.jit.script
    def ts_method_2(x, y):
        return torch.matmul(x, y)

    @torch.jit.script
    def ts_method_1(x, y, z):
        a = x + z
        w = ts_method_2(x, y) + a
        return w.sum()

    class DummyModule(nn.Module):
        def __init__(self):
            super(DummyModule, self).__init__()
            self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False)

        def forward(self, x):
            return self.conv(x)

    mod = DummyModule()

    with profile(with_stack=True, use_kineto=kineto_available()) as p:
        x = torch.randn(10, 10, requires_grad=True)
        y = torch.randn(10, 10, requires_grad=True)
        z = x + y
        w = ts_method_1(x, y, z)
        v = 2 * w
        v.backward()
        a = torch.randn(2, 3, 2, 2, requires_grad=True)
        b = mod(a)
        c = b.sum()
        c.backward()

    print(p.key_averages(
        group_by_stack_n=5).table(
        sort_by="self_cpu_time_total", row_limit=-1))

    for e in p.function_events:
        if "aten::add" in e.name or "AddBackward" in e.name:
            self.assertTrue(any(["test_profiler" in entry for entry in e.stack]))
            self.assertTrue(any([(
                "test_source" in entry or
                "ts_method_1" in entry or
                "ts_method_2" in entry) for entry in e.stack]))

    torch._C._set_graph_executor_optimize(prev_opt)
def wrap_forward(*args, **kwargs):
    try:
        with torch_profiler.profile(
                use_cuda=self.use_cuda,
                profile_memory=self.profile_memory) as prof:
            res = _forward(*args, **kwargs)
    except TypeError:
        if self.profile_memory:
            warnings.warn(
                "`profile_memory` is unsupported in torch < 1.6",
                RuntimeWarning,
            )
            self.profile_memory = False
        with torch_profiler.profile(use_cuda=self.use_cuda) as prof:
            res = _forward(*args, **kwargs)

    event_list = prof.function_events
    event_list.populate_cpu_children()
    # each profile call should be contained in its own list
    self.trace_profile_events[path].append(event_list)
    return res
def test_flops(self):
    model = torch.nn.Sequential(
        nn.Conv2d(16, 33, 18),
        nn.ReLU(),
        nn.Linear(243, 243),
        nn.ReLU(),
    )
    inputs = torch.randn(40, 16, 18, 260)
    with profiler.profile(record_shapes=True, with_flops=True) as prof:
        model(inputs)
    profiler_output = prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=10)
    print(profiler_output)
    self.assertIn("FLOPS", profiler_output)
def prepare_trace(self):
    self.profiler = prof.profile(
        use_cuda=(ProfilerActivity.CUDA in self.activities),
        use_cpu=(ProfilerActivity.CPU in self.activities),
        record_shapes=self.record_shapes,
        with_flops=self.with_flops,
        profile_memory=self.profile_memory,
        with_stack=self.with_stack,
        with_modules=self.with_modules,
        use_kineto=True,
        experimental_config=self.experimental_config,
    )
    self.profiler._prepare_trace()
def checkTraceTVM(self, func, input_tensors=None, input_shapes=None,
                  size=100000, runs=100, verbose=False):
    # prepare inputs
    if input_tensors is None:
        if input_shapes is None:
            seed = torch.rand(size) / runs / 2
            input_tensors = (seed, seed, seed)
        else:
            input_tensors = []
            for shape in input_shapes:
                seed = torch.rand(*shape) / runs / 2
                input_tensors.append(seed)

    # jit the function
    trace_jit = torch.jit.trace(func, input_tensors)
    # specialize the graph with the inputs
    _ = trace_jit(*input_tensors)
    # timeit the perf
    jit_start = time.time()
    for _ in range(runs):
        outputs_jit = trace_jit(*input_tensors)
    jit_time = time.time() - jit_start

    # jit the function and lower to TVM
    torch_tvm.enable()
    trace_tvm = torch.jit.trace(func, input_tensors)
    tvm_unused = "TVM was not able to optimize this trace."
    assert "tvm::CompilationGroup" in str(
        trace_tvm.graph_for(*input_tensors)), tvm_unused
    # tvm compile the graph and ensure TVM is used
    with profile() as p:
        _ = trace_tvm(*input_tensors)
    assert "TVM" in [_.name for _ in p.function_events], tvm_unused
    torch_tvm.disable()
    # timeit the perf
    tvm_start = time.time()
    for _ in range(runs):
        outputs_tvm = trace_tvm(*input_tensors)
    tvm_time = time.time() - tvm_start

    if verbose:
        print("\noperator " + func.__name__ +
              ":\t{} runs of size {}".format(runs, size) +
              " \tjit time:{:.4f}s".format(jit_time) +
              "\ttvm time:{:.4f}s".format(tvm_time))
    self.assertEqual(outputs_jit, outputs_tvm)
def main():
    model = MyModule(500, 10).cuda()
    data = torch.rand(128, 500).cuda()
    mask = torch.rand((500, 500, 500), dtype=torch.double).cuda()

    # warm-up pass before profiling
    model(data, mask)

    with profiler.profile(with_stack=True, profile_memory=True) as prof:
        out, idx = model(data, mask)

    report = prof.key_averages(group_by_stack_n=5)
    report = report.table(sort_by='self_cpu_time_total', row_limit=5)
    print(report)
def wrap_forward(*args, **kwargs):
    try:
        with torch_profiler.profile(
            use_cuda=self.use_cuda, profile_memory=self.profile_memory
        ) as prof:
            res = _forward(*args, **kwargs)
    except TypeError:
        if self.profile_memory:
            warnings.warn(
                "`profile_memory` is unsupported in torch < 1.6",
                RuntimeWarning,
            )
            self.profile_memory = False
        with torch_profiler.profile(use_cuda=self.use_cuda) as prof:
            res = _forward(*args, **kwargs)

    event_list = prof.function_events
    # PyTorch up until version 1.7 exposes this method. From PyTorch 1.8 onwards,
    # it is called via EventList._build_tree at the end of the context manager.
    if hasattr(event_list, "populate_cpu_children"):
        event_list.populate_cpu_children()
    # each profile call should be contained in its own list
    self.trace_profile_events[path].append(event_list)
    return res
def analyzer():
    matrix = torch.randint(0, len(tokenizer) - 1, (args.batch, args.max_seq_length))
    all_input_ids = torch.tensor(matrix, dtype=torch.long).to(device)
    all_attention_mask = torch.tensor(matrix, dtype=torch.long).to(device)
    inputs = {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
    }
    with profiler.profile(profile_memory=True, record_shapes=True) as prof:
        model(**inputs)
    total_average = prof.total_average()
    return total_average.cpu_memory_usage
def _test_soft_histogram(gpu=False):
    image = skimage.io.imread(os.path.join(data_dir, "gray.png")).astype(
        np.float32) / 255.0
    image = image[:64, :64]
    nbins = 8

    image = Variable(th.from_numpy(image), requires_grad=True)
    if gpu:
        image = image.cuda()

    print("profiling")
    with profiler.profile() as prof:
        output = funcs.SoftHistogram.apply(image, nbins)
        loss = output.sum()
        loss.backward()
    print(prof)
def Pytorch_Profiler_Example():
    startt = time.time()
    import torch.autograd.profiler as profiler
    with profiler.profile(profile_memory=True, record_shapes=True, use_cuda=True) as prof:
        with profiler.record_function("model_inference"):
            for iis, (datax, labels) in enumerate(data_loader):
                res = loss(datax.to(DEVICE), labels.to(DEVICE))
                res.backward()
                if iis > 5:
                    break
    print(type(res))
    endt = time.time()
    print("DeltaT,", endt - startt)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=100))
def main():
    logging.captureWarnings(True)
    opts = AppSettings()
    opts = update_settings(opts)

    if opts.profile:
        try:
            with profiler.profile(record_shapes=True, use_cuda=True) as prof:
                eval_main(opts)
        finally:
            print('tracing...')
            print(prof.key_averages().table(
                sort_by='cpu_time_total', row_limit=16))
            prof.export_chrome_trace("/tmp/trace.json")
    else:
        eval_main(opts)
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('config', metavar='FILE', help='config file')
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    parser.add_argument('--pdb', action='store_true', help='pdb')
    parser.add_argument('--gpu', type=str, help='gpu ids', default=None)
    args, opts = parser.parse_known_args()

    configs.load(args.config, recursive=True)
    configs.update(opts)

    if configs.debug.pdb or args.pdb:
        pdb.set_trace()

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if configs.debug.set_seed:
        torch.manual_seed(configs.debug.seed)
        np.random.seed(configs.debug.seed)

    if configs.run.device == 'gpu':
        device = torch.device('cuda')
    elif configs.run.device == 'cpu':
        device = torch.device('cpu')
    else:
        raise ValueError(configs.run.device)

    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Profiling started: "{args.run_dir}".' + '\n' + f'{configs}')

    inputs = torch.tensor(torch.rand(configs.run.bsz, 1, 28, 28), device=device)

    model = builder.make_model()
    model.to(device)

    with profiler.profile(record_shapes=True) as prof:
        with profiler.record_function("model_inference"):
            for _ in range(3):
                time.sleep(0.5)
                model(inputs)
            # for _ in range(3):
            #     model(inputs)

    prof.export_chrome_trace("part1_static.json")
def measure_backward(self):
    self.prepare()
    with profiler.profile(use_cuda=self.use_cuda) as prof:
        self.run_backward()

    dur = 0.0
    threads = set([])
    if self.use_cuda:
        for evt in prof.function_events:
            threads.add(evt.thread)
            for k in evt.kernels:
                dur += k.interval.elapsed_us()
    else:
        for evt in prof.function_events:
            threads.add(evt.thread)
            dur += evt.cpu_interval.elapsed_us()
    # print(len(threads))
    return dur
def speed_memory_test(device=None, batch_size=1, repeats=100):
    """Run speed and memory tests."""

    torch.manual_seed(0)
    if device is None:
        device = torch.device('cpu')

    n = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    t = [[], [], [], []]
    m = [[], [], [], []]

    fcns = [sinkhorn,
            OptimalTransportLayer(approx_grad=True),
            OptimalTransportLayer(block_inverse=False),
            OptimalTransportLayer()]

    for ni in n:
        print("Profiling on {}-by-{} problem...".format(ni, ni))

        M_true = torch.randn((batch_size, ni, ni), dtype=torch.float)
        # M_true = torch.log(torch.rand((batch_size, ni, ni), dtype=torch.float))
        P_true = sinkhorn(M_true).to(device)
        M_init = torch.log(torch.rand_like(M_true)).to(device)

        # profile speed
        _, _, ti = learnM(fcns, M_init, None, None, P_true, repeats)
        for i in range(4):
            t[i].append(ti[i])

        # profile memory
        for i, f in enumerate(fcns):
            with profiler.profile(profile_memory=True) as prof:
                _ = learnM([f], M_init, None, None, P_true, 1)
            m[i].append(prof.total_average().cpu_memory_usage)

        print("...done")

    plt.figure()
    plt.plot(n, t[0], n, t[1], n, t[2], n, t[3])
    plt.xlabel('problem size')
    plt.ylabel('running time')
    plt.legend(['autograd', 'approx', 'implicit (full inv)', 'implicit (blk inv)'])
    plt.title('Running time on {} with batch size {}'.format(device, batch_size))

    plt.figure()
    plt.plot(n, m[0], n, m[1], n, m[2], n, m[3])
    plt.xlabel('problem size')
    plt.ylabel('memory usage')
    plt.legend(['autograd', 'approx', 'implicit (full inv)', 'implicit (blk inv)'])
    plt.title('Memory usage on {} with batch size {}'.format(device, batch_size))
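# Hypothetical driver for speed_memory_test above; the device, batch size and
# repeat count are illustrative assumptions, and plt.show() only displays the
# figures the function already builds.
if __name__ == '__main__':
    speed_memory_test(device=torch.device('cpu'), batch_size=1, repeats=10)
    plt.show()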
def profile(): print("Profiling model") vec_env = make_env(config.eval_scenarios, config.parallel_envs, name="profile") algo = make_algo(vec_env, config) # get a trace with profiler.profile(profile_memory=True, record_shapes=True, use_cuda=True) as prof: with profiler.record_function("train_step"): algo.learn(algo.batch_size) prof.export_chrome_trace("trace.json") print("done.")
def profile_model(device, input_size=10, num_units=10, num_segments=20,
                  dim_context=15, batch_size=4096, iterations=10,
                  dendritic_layer_class=AbsoluteMaxGatingDendriticLayer):
    """Create dendritic layer using the specified layer type, and profile it."""
    print("\n\n=============== " + dendritic_layer_class.__name__
          + " ================")
    use_cuda = device.type == "cuda"
    linear = torch.nn.Linear(input_size, num_units)
    dendrite_layer = dendritic_layer_class(module=linear,
                                           num_segments=num_segments,
                                           dim_context=dim_context,
                                           module_sparsity=0.7,
                                           dendrite_sparsity=0.9).to(device)
    dummy_tensor = torch.rand((batch_size, input_size), device=device)
    dummy_context = torch.rand((batch_size, dim_context), device=device)

    s = time.time()
    with profiler.profile(record_shapes=True, use_cuda=use_cuda) as prof:
        with profiler.record_function(dendritic_layer_class.__name__ + " inference"):
            res = dendrite_layer(dummy_tensor, dummy_context)
            for _ in range(iterations - 1):
                res += dendrite_layer(dummy_tensor, dummy_context)
    wall_clock = time.time() - s
    print("Wall clock:", wall_clock)

    if device.type == "cuda":
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    else:
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

    if res.sum() == 0:
        # Just to make Python think we need res
        print(res.sum())

    return wall_clock
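# Hypothetical driver for profile_model above; the batch sizes are illustrative
# assumptions, and AbsoluteMaxGatingDendriticLayer is assumed to come from the
# same module that defines the snippet.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
profile_model(device,
              batch_size=4096 if device.type == "cuda" else 512,
              iterations=10,
              dendritic_layer_class=AbsoluteMaxGatingDendriticLayer)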
def test_export_stacks(self):
    with profile(with_stack=True, use_kineto=kineto_available()) as p:
        x = torch.randn(10, 10)
        y = torch.randn(10, 10)
        z = torch.mm(x, y)
        z = z + y

    with tempfile.NamedTemporaryFile(mode="w+") as f:
        p.export_stacks(f.name)
        lines = f.readlines()
        assert len(lines) > 0, "Empty stacks file"
        for line in lines:
            is_int = False
            try:
                assert int(line.split(" ")[-1]) > 0, "Invalid stacks record"
                is_int = True
            except ValueError:
                pass
            assert is_int, "Invalid stacks record"
def __call__(self):
    self.reset()
    for i in range(self.burn_iters):
        self.run()

    start = time.time()
    with profiler.profile() as prof:
        for i in range(self.iters):
            start1 = time.time()
            self.run()
            # th.cuda.synchronize()
            end1 = time.time()
            runtime1 = (end1 - start1) * 1000.0
            # print "iter {}: {:.2f}ms".format(i, runtime1)
    end = time.time()
    # print prof

    runtime = (end - start) * 1000.0 / self.iters
    return BenchmarkResult(self.name(), runtime, self.cuda)
def profile(self):
    import torch.autograd.profiler as profiler
    # record_shapes is required for the group_by_input_shape table below
    with profiler.profile(use_cuda=True, record_shapes=True) as prof:
        self.model.to(self.device)
        self.model.train()
        for batch_idx, (data, rgb, target) in enumerate(tqdm(self.data_loader)):
            if batch_idx == 30:
                break
            self.optimizer.zero_grad()
            data, rgb, target = (data.to(self.device, non_blocking=True),
                                 rgb.to(self.device, non_blocking=True),
                                 target.to(self.device, non_blocking=True))
            output = self.model(data, rgb)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
    print(prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=10))
def timeit_graph(batch_size, model, device_name, n_epochs=1000, n_stations=2,
                 half_precision=False):
    LOGGER.info("Starting measurement")
    LOGGER.info(f"No. CUDA devices: {torch.cuda.device_count()}")
    device = torch.device(device_name)
    use_cuda = device_name == 'cuda'
    model = GraphNet_v1().to(device)
    if half_precision:
        model = model.half()
    model.eval()
    N = 900
    E = 1300
    F = 5
    with profile(use_cuda=use_cuda) as prof:
        for _ in tqdm(range(n_epochs)):
            temp_X = torch.rand(batch_size, N, F)
            temp_Ri = torch.rand(batch_size, N, E)
            temp_Ro = temp_Ri
            if half_precision:
                temp_X = temp_X.half()
                temp_Ri = temp_Ri.half()
                temp_Ro = temp_Ro.half()
            # print(temp_lengths)
            graph = (temp_X, temp_Ri, temp_Ro)
            preds = model(graph)
    table = prof.key_averages().table()
    print(table)
    result = 'Speed:', round(
        (batch_size * n_epochs) / float(str(table).split('\n')[-2].split(' ')[-1].strip('s')), 3), 'elements/s'
    LOGGER.info(table)
    LOGGER.info(result)
    print(result)
def profile_for_batch_size(G: nn.Module, cfg: DictConfig, batch_size: int):
    z = torch.randn(batch_size, G.z_dim, device=cfg.device)
    c = None
    times = []

    for i in tqdm(range(cfg.num_warmup_iters), desc='Warming up'):
        torch.cuda.synchronize()
        fake_img = G(z, c).contiguous()
        y = fake_img[0, 0, 0, 0].item()  # sync
        torch.cuda.synchronize()

    time.sleep(1)
    torch.cuda.reset_peak_memory_stats()

    with profiler.profile(record_shapes=True, use_cuda=True) as prof:
        for i in tqdm(range(cfg.num_profile_iters), desc='Profiling'):
            torch.cuda.synchronize()
            start_time = time.time()
            with profiler.record_function("forward"):
                fake_img = G(z, c).contiguous()
                y = fake_img[0, 0, 0, 0].item()  # sync
            torch.cuda.synchronize()
            times.append(time.time() - start_time)
            torch.cuda.empty_cache()

    num_imgs_processed = len(times) * batch_size
    total_time_spent = np.sum(times)
    bandwidth = num_imgs_processed / total_time_spent
    summary = prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)

    print(f'[Batch size: {batch_size}] Mean: {np.mean(times):.05f}s/it. Std: {np.std(times):.05f}s')
    print(f'[Batch size: {batch_size}] Imgs/sec: {bandwidth:.03f}')
    print(f'[Batch size: {batch_size}] Max mem: {torch.cuda.max_memory_allocated(cfg.device) / 2**30:<6.2f} gb')

    return bandwidth, summary
def main(argv):
    if FLAGS.bindsnet:
        import bindsnet_lif
        run_benchmark(bindsnet_lif.lif_feed_forward_benchmark, "BindsNET_lif")
    if FLAGS.genn:
        import genn_lif
        run_benchmark(genn_lif.lif_feed_forward_benchmark, "GeNN_lif")
    if FLAGS.norse:
        import norse_lif
        if FLAGS.profile:
            import torch.autograd.profiler as profiler
            with profiler.profile(
                profile_memory=True, use_cuda=(FLAGS.device == "cuda")
            ) as prof:
                run_benchmark(norse_lif.lif_feed_forward_benchmark, "Norse_lif")
            prof.export_chrome_trace("trace.json")
        else:
            run_benchmark(norse_lif.lif_feed_forward_benchmark, "Norse_lif")
def do_cuda_timing(f, inp, context=None, n_loops=100):
    '''
    Get timings of cuda modules.

    Note `self_cpu_time_total` is returned, but from experiments this appears
    to be similar/same to the total CUDA time.

    f :       function to profile, typically an nn.Module
    inp :     required input to f
    context : optional additional input into f, used for Decoder-style modules
    '''
    f.cuda()
    args = (inp.cuda(),)
    if exists(context):
        args += (context.cuda(),)
    with profiler.profile(record_shapes=False, use_cuda=True) as prof:
        with profiler.record_function("model_inference"):
            with torch.no_grad():
                for _ in range(n_loops):
                    f(*args)
                    torch.cuda.synchronize()

    res = round((prof.key_averages().self_cpu_time_total / 1000) / n_loops, 3)
    print(f'{res}ms')
    return res
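# Minimal usage sketch for do_cuda_timing above; the module and tensor shapes
# are illustrative assumptions, the snippet's own `exists` helper (a None
# check) is assumed to be in scope, and a CUDA device is required.
layer = torch.nn.Linear(512, 512)
sample = torch.randn(8, 512)
avg_ms = do_cuda_timing(layer, sample, n_loops=10)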
def test_mem_leak(self):
    """Checks that there's no memory leak when using profiler with CUDA
    """
    t = torch.rand(1, 1).cuda()
    p = psutil.Process()
    last_rss = collections.deque(maxlen=5)
    for outer_idx in range(10):
        with profile(use_cuda=True):
            for _ in range(1024):
                t = torch.mm(t, t)

        gc.collect()
        torch.cuda.empty_cache()
        last_rss.append(p.memory_info().rss)

    max_diff = -1
    for idx in range(1, len(last_rss)):
        max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])
    # with CUDA events leaking the increase in memory was ~7 MB,
    # using much smaller threshold but not zero to reduce flakiness
    self.assertTrue(max_diff < 100 * 1024)
def test_kineto_profiler_api(self):
    called_num = [0]

    with profile(use_cuda=True, use_kineto=True):
        self.payload()

    def trace_handler(p):
        print(p.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1))
        # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
        called_num[0] += 1

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(
            wait=1,
            warmup=1,
            active=2),
        on_trace_ready=trace_handler
    ) as p:
        for idx in range(8):
            self.payload()
            p.step()

    self.assertEqual(called_num[0], 2)

    # case without schedule
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA]
    ) as p:
        self.payload()
        self.payload()
    print(p.key_averages().table(
        sort_by="self_cuda_time_total", row_limit=-1))
def run_prof(use_cuda=False):
    with profiler.profile(use_cuda=use_cuda) as prof:
        exec(code, globs, None)
    return prof
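# Minimal usage sketch for run_prof above; it assumes `code` and `globs` are
# module-level names (a compiled statement and its globals dict), as implied by
# the exec() call inside run_prof.
globs = {"torch": torch}
code = compile("y = torch.mm(torch.randn(64, 64), torch.randn(64, 64))", "<prof>", "exec")
prof = run_prof(use_cuda=False)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))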