Example #1
def wrap_forward(*args, **kwargs):
    with tprofiler.profile(
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
    ) as prof:
        res = _forward(*args, **kwargs)
    event_list = prof.function_events
    event_list.populate_cpu_children()
    # each profile call should be contained in its own list
    self.profile_events[path].append(event_list)
    return res
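Example #1 closes over `_forward`, `path`, and `self` from an enclosing scope. A minimal self-contained sketch of the same monkey-patching pattern (the names `attach_profiler` and `profile_events` are illustrative, `use_cuda` is the legacy flag these snippets target, and the version-sensitive `populate_cpu_children` call is omitted; see Example #12 for the version guard):

import torch
import torch.autograd.profiler as tprofiler

profile_events = {}

def attach_profiler(module, path, use_cuda=False, profile_memory=False):
    _forward = module.forward

    def wrap_forward(*args, **kwargs):
        with tprofiler.profile(use_cuda=use_cuda,
                               profile_memory=profile_memory) as prof:
            res = _forward(*args, **kwargs)
        # each profile call gets its own list entry, as in Example #1
        profile_events.setdefault(path, []).append(prof.function_events)
        return res

    module.forward = wrap_forward

linear = torch.nn.Linear(4, 4)
attach_profiler(linear, "linear")
linear(torch.randn(2, 4))
print(profile_events["linear"][0].table(row_limit=5))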
Example #2
def _start_warmup(self):
    self.profiler = prof.profile(
        use_cuda=(ProfilerActivity.CUDA in self.activities),
        use_cpu=(ProfilerActivity.CPU in self.activities),
        record_shapes=self.record_shapes,
        with_flops=self.with_flops,
        profile_memory=self.profile_memory,
        with_stack=self.with_stack,
        use_kineto=True,
    )
    self.profiler._prepare_kineto_trace()
Example #3
def test():
    with profiler.profile(profile_memory=True, record_shapes=True) as prof:
        with profiler.record_function("model_inference"):
            model.eval()
            output = model(features, adj)
            loss_test = F.nll_loss(output[idx_test], labels[idx_test])
            acc_test = accuracy(output[idx_test], labels[idx_test])
            print("Test set results:", "loss= {:.4f}".format(loss_test.item()),
                  "accuracy= {:.4f}".format(acc_test.item()))
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
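Example #3 depends on `model`, `features`, `adj`, `idx_test`, `labels`, and `accuracy` defined elsewhere in its repo (a GCN evaluation). A self-contained sketch of the same pattern with a stand-in model (all names below are placeholders, not the original GCN):

import torch
import torch.nn.functional as F
import torch.autograd.profiler as profiler

model = torch.nn.Linear(16, 4)  # stand-in for the GCN
features = torch.randn(8, 16)
labels = torch.randint(0, 4, (8,))

with profiler.profile(profile_memory=True, record_shapes=True) as prof:
    with profiler.record_function("model_inference"):
        model.eval()
        output = model(features)
        loss_test = F.nll_loss(F.log_softmax(output, dim=1), labels)
        print("Test set results:", "loss= {:.4f}".format(loss_test.item()))

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))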
Example #4
def profile_one_step(func, nwarmup=3):
    for i in range(nwarmup):
        func()

    use_cuda = args.device == "cuda"

    with profiler.profile(record_shapes=True, use_cuda=use_cuda) as prof:
        func()

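    # group_by_input_shape=True below relies on record_shapes=True in profile()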
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=30))
Example #5
def eval(dataloader, model_str, model, device, loss, highest_accuracy,
         save_model, trace):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Switch model to evaluation mode
    model.eval()

    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            # Evaluate the model on the test input
            if trace:
                with profiler.profile(record_shapes=True,
                                      with_stack=True,
                                      profile_memory=True) as prof:
                    with profiler.record_function("model_inference"):
                        pred = model(X)
                print(prof.key_averages().table(sort_by="cpu_time_total",
                                                row_limit=1000))
                break
            else:
                pred = model(X)

            test_loss += loss(pred, y).to("cpu")
            correct += (pred.to("cpu").argmax(1) == y.to("cpu")).type(
                torch.float).sum()

    if not trace:
        test_loss /= num_batches
        correct /= size

        if correct.item() > highest_accuracy:
            highest_accuracy = correct.item()
            print("current highest_accuracy: ", highest_accuracy)

            # save model
            if save_model:
                state_dict = collections.OrderedDict()
                for key in model.state_dict().keys():
                    state_dict[key] = model.state_dict()[key].to("cpu")
                checkpoint = get_checkpoint_folder(model_str, device)
                torch.save(state_dict, checkpoint)

        print(
            f"Test Error: \n Accuracy: {(100*correct.item()):>0.1f}%, Avg loss: {test_loss.item():>8f} \n"
        )

    return highest_accuracy
Example #6
    def test_source(self):
        """Checks that source code attribution works for eager, TS and autograd mode
        """
        # avoid automatic inlining
        prev_opt = torch._C._get_graph_executor_optimize()
        torch._C._set_graph_executor_optimize(False)

        @torch.jit.script
        def ts_method_2(x, y):
            return torch.matmul(x, y)

        @torch.jit.script
        def ts_method_1(x, y, z):
            a = x + z
            w = ts_method_2(x, y) + a
            return w.sum()

        class DummyModule(nn.Module):
            def __init__(self):
                super(DummyModule, self).__init__()
                self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False)

            def forward(self, x):
                return self.conv(x)

        mod = DummyModule()

        with profile(with_stack=True, use_kineto=kineto_available()) as p:
            x = torch.randn(10, 10, requires_grad=True)
            y = torch.randn(10, 10, requires_grad=True)
            z = x + y
            w = ts_method_1(x, y, z)
            v = 2 * w
            v.backward()
            a = torch.randn(2, 3, 2, 2, requires_grad=True)
            b = mod(a)
            c = b.sum()
            c.backward()

        print(p.key_averages(
            group_by_stack_n=5).table(
            sort_by="self_cpu_time_total", row_limit=-1))

        for e in p.function_events:
            if "aten::add" in e.name or "AddBackward" in e.name:
                self.assertTrue(any(["test_profiler" in entry for entry in e.stack]))
                self.assertTrue(any([(
                    "test_source" in entry or
                    "ts_method_1" in entry or
                    "ts_method_2" in entry) for entry in e.stack]))

        torch._C._set_graph_executor_optimize(prev_opt)
Example #7
            def wrap_forward(*args, **kwargs):
                try:
                    with torch_profiler.profile(
                            use_cuda=self.use_cuda,
                            profile_memory=self.profile_memory) as prof:
                        res = _forward(*args, **kwargs)
                except TypeError:
                    if self.profile_memory:
                        warnings.warn(
                            "`profile_memory` is unsupported in torch < 1.6",
                            RuntimeWarning,
                        )
                        self.profile_memory = False
                    with torch_profiler.profile(
                            use_cuda=self.use_cuda) as prof:
                        res = _forward(*args, **kwargs)

                event_list = prof.function_events
                event_list.populate_cpu_children()
                # each profile call should be contained in its own list
                self.trace_profile_events[path].append(event_list)
                return res
Example #8
def test_flops(self):
    model = torch.nn.Sequential(
        nn.Conv2d(16, 33, 18),
        nn.ReLU(),
        nn.Linear(243, 243),
        nn.ReLU(),
    )
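    # Conv2d(16, 33, kernel_size=18) maps the (40, 16, 18, 260) input below to
    # (40, 33, 1, 243), so the trailing dimension matches Linear(243, 243)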
    inputs = torch.randn(40, 16, 18, 260)
    with profiler.profile(record_shapes=True, with_flops=True) as prof:
        model(inputs)
    profiler_output = prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=10)
    print(profiler_output)
    self.assertIn("FLOPS", profiler_output)
Example #9
def prepare_trace(self):
    self.profiler = prof.profile(
        use_cuda=(ProfilerActivity.CUDA in self.activities),
        use_cpu=(ProfilerActivity.CPU in self.activities),
        record_shapes=self.record_shapes,
        with_flops=self.with_flops,
        profile_memory=self.profile_memory,
        with_stack=self.with_stack,
        with_modules=self.with_modules,
        use_kineto=True,
        experimental_config=self.experimental_config,
    )
    self.profiler._prepare_trace()
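Examples #2 and #9 are internals of `torch.profiler.profile` itself: the public wrapper builds the legacy `prof.profile` object with these flags and drives it through its warmup and trace phases. For reference, the public entry point looks roughly like this (a sketch assuming a kineto-enabled build):

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU],
             record_shapes=True,
             profile_memory=True,
             with_stack=True) as prof:
    torch.mm(torch.randn(64, 64), torch.randn(64, 64))

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))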
Example #10
    def checkTraceTVM(self,
                      func,
                      input_tensors=None,
                      input_shapes=None,
                      size=100000,
                      runs=100,
                      verbose=False):
        # prepare inputs
        if input_tensors is None:
            if input_shapes is None:
                seed = torch.rand(size) / runs / 2
                input_tensors = (seed, seed, seed)
            else:
                input_tensors = []
                for shape in input_shapes:
                    seed = torch.rand(*shape) / runs / 2
                    input_tensors.append(seed)

        # jit the function
        trace_jit = torch.jit.trace(func, input_tensors)
        # specialize the graph with the inputs
        _ = trace_jit(*input_tensors)
        # timeit the perf
        jit_start = time.time()
        for _ in range(runs):
            outputs_jit = trace_jit(*input_tensors)
        jit_time = time.time() - jit_start

        # jit the function and lower to TVM
        torch_tvm.enable()
        trace_tvm = torch.jit.trace(func, input_tensors)
        tvm_unused = "TVM was not able to optimize this trace."
        assert "tvm::CompilationGroup" in str(
            trace_tvm.graph_for(*input_tensors)), tvm_unused
        # tvm compile the graph and ensure TVM is used
        with profile() as p:
            _ = trace_tvm(*input_tensors)
        assert "TVM" in [_.name for _ in p.function_events], tvm_unused
        torch_tvm.disable()
        # timeit the perf
        tvm_start = time.time()
        for _ in range(runs):
            outputs_tvm = trace_tvm(*input_tensors)
        tvm_time = time.time() - tvm_start

        if verbose:
            print("\noperator " + func.__name__ +
                  ":\t{} runs of size {}".format(runs, size) +
                  " \tjit time:{:.4f}s".format(jit_time) +
                  "\ttvm time:{:.4f}s".format(tvm_time))
        self.assertEqual(outputs_jit, outputs_tvm)
Example #11
def main():
    model = MyModule(500, 10).cuda()
    data = torch.rand(128, 500).cuda()
    mask = torch.rand((500, 500, 500), dtype=torch.double).cuda()

    model(data, mask)

    with profiler.profile(with_stack=True, profile_memory=True) as prof:
        out, idx = model(data, mask)

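    # group_by_stack_n=5 averages events by their top five stack frames;
    # it needs with_stack=True in the profile() call above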
    report = prof.key_averages(group_by_stack_n=5)
    report = report.table(sort_by='self_cpu_time_total', row_limit=5)

    print(report)
Example #12
            def wrap_forward(*args, **kwargs):
                try:
                    with torch_profiler.profile(
                        use_cuda=self.use_cuda, profile_memory=self.profile_memory
                    ) as prof:
                        res = _forward(*args, **kwargs)
                except TypeError:
                    if self.profile_memory:
                        warnings.warn(
                            "`profile_memory` is unsupported in torch < 1.6",
                            RuntimeWarning,
                        )
                        self.profile_memory = False
                    with torch_profiler.profile(use_cuda=self.use_cuda) as prof:
                        res = _forward(*args, **kwargs)

                event_list = prof.function_events
                # PyTorch up until version 1.7 exposes this method. From PyTorch 1.8 onwards, 
                # it is called via EventList._build_tree at the end of the context manager.
                if hasattr(event_list, "populate_cpu_children"):
                    event_list.populate_cpu_children()
                # each profile call should be contained in its own list
                self.trace_profile_events[path].append(event_list)
                return res
Example #13
File: test.py Project: khu-dev/KoELECTRA
def analyzer():
    matrix = torch.randint(0,
                           len(tokenizer) - 1,
                           (args.batch, args.max_seq_length))
    all_input_ids = matrix.to(device=device, dtype=torch.long)
    all_attention_mask = matrix.to(device=device, dtype=torch.long)
    inputs = {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
    }
    with profiler.profile(profile_memory=True, record_shapes=True) as prof:
        model(**inputs)

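    # total_average() folds every event into one FunctionEventAvg;
    # its cpu_memory_usage field is reported in bytes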
    total_average = prof.total_average()

    return total_average.cpu_memory_usage
Example #14
def _test_soft_histogram(gpu=False):
    image = skimage.io.imread(os.path.join(data_dir, "gray.png")).astype(
        np.float32) / 255.0
    image = image[:64, :64]
    nbins = 8

    image = Variable(th.from_numpy(image), requires_grad=True)
    if gpu:
        image = image.cuda()

    print("profiling")
    with profiler.profile() as prof:
        output = funcs.SoftHistogram.apply(image, nbins)
        loss = output.sum()
        loss.backward()
    print(prof)
Example #15
def Pytorch_Profiler_Example():
    startt = time.time()
    import torch.autograd.profiler as profiler
    with profiler.profile(profile_memory=True,
                          record_shapes=True,
                          use_cuda=True) as prof:
        with profiler.record_function("model_inference"):
            for iis, (datax, labels) in enumerate(data_loader):
                res = loss(datax.to(DEVICE), labels.to(DEVICE))
                res.backward()
                if iis > 5:
                    break
    print(type(res))
    endt = time.time()
    print("DeltaT,", endt - startt)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=100))
Example #16
def main():
    logging.captureWarnings(True)
    opts = AppSettings()
    opts = update_settings(opts)
    if opts.profile:
        try:
            with profiler.profile(record_shapes=True, use_cuda=True) as prof:
                eval_main(opts)
        finally:
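            # runs even if eval_main raises, so the trace is still reported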
            print('tracing...')
            print(prof.key_averages().table(
                sort_by='cpu_time_total',
                row_limit=16))
            prof.export_chrome_trace("/tmp/trace.json")
    else:
        eval_main(opts)
Example #17
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('config', metavar='FILE', help='config file')
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    parser.add_argument('--pdb', action='store_true', help='pdb')
    parser.add_argument('--gpu', type=str, help='gpu ids', default=None)
    args, opts = parser.parse_known_args()

    configs.load(args.config, recursive=True)
    configs.update(opts)

    if configs.debug.pdb or args.pdb:
        pdb.set_trace()

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if configs.debug.set_seed:
        torch.manual_seed(configs.debug.seed)
        np.random.seed(configs.debug.seed)

    if configs.run.device == 'gpu':
        device = torch.device('cuda')
    elif configs.run.device == 'cpu':
        device = torch.device('cpu')
    else:
        raise ValueError(configs.run.device)

    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Profiling started: "{args.run_dir}".' + '\n' + f'{configs}')

    inputs = torch.rand(configs.run.bsz, 1, 28, 28, device=device)

    model = builder.make_model()
    model.to(device)

    with profiler.profile(record_shapes=True) as prof:
        with profiler.record_function("model_inference"):
            for _ in range(3):
                time.sleep(0.5)
                model(inputs)

    # for _ in range(3):
    #     model(inputs)

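    # the exported JSON can be opened in chrome://tracing or Perfetto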
    prof.export_chrome_trace("part1_static.json")
Example #18
	def measure_backward(self):        
		self.prepare()
		with profiler.profile(use_cuda=self.use_cuda) as prof:
			self.run_backward()
		dur = 0.0
		threads = set([])
		if self.use_cuda:
			for evt in prof.function_events:
				threads.add(evt.thread)
				for k in evt.kernels:
					dur += k.interval.elapsed_us()
		else:
			for evt in prof.function_events:
				threads.add(evt.thread)
				dur += evt.cpu_interval.elapsed_us()
		# print(len(threads))
		return dur
Example #19
def speed_memory_test(device=None, batch_size=1, repeats=100):
    """Run speed and memory tests."""

    torch.manual_seed(0)
    if device is None:
        device = torch.device('cpu')

    n = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    t = [[], [], [], []]
    m = [[], [], [], []]

    fcns = [sinkhorn, OptimalTransportLayer(approx_grad=True), OptimalTransportLayer(block_inverse=False), OptimalTransportLayer()]
    for ni in n:
        print("Profiling on {}-by-{} problem...".format(ni, ni))
        M_true = torch.randn((batch_size, ni, ni), dtype=torch.float)
        #M_true = torch.log(torch.rand((batch_size, ni, ni), dtype=torch.float))
        P_true = sinkhorn(M_true).to(device)

        M_init = torch.log(torch.rand_like(M_true)).to(device)

        # profile speed
        _, _, ti = learnM(fcns, M_init, None, None, P_true, repeats)
        for i in range(4):
            t[i].append(ti[i])

        # profile memory
        for i, f in enumerate(fcns):
            with profiler.profile(profile_memory=True) as prof:
                _ = learnM([f], M_init, None, None, P_true, 1)
            m[i].append(prof.total_average().cpu_memory_usage)

    print("...done")

    plt.figure()
    plt.plot(n, t[0], n, t[1], n, t[2], n, t[3])
    plt.xlabel('problem size')
    plt.ylabel('running time')
    plt.legend(['autograd', 'approx', 'implicit (full inv)', 'implicit (blk inv)'])
    plt.title('Running time on {} with batch size {}'.format(device, batch_size))

    plt.figure()
    plt.plot(n, m[0], n, m[1], n, m[2], n, m[3])
    plt.xlabel('problem size')
    plt.ylabel('memory usage')
    plt.legend(['autograd', 'approx', 'implicit (full inv)', 'implicit (blk inv)'])
    plt.title('Memory usage on {} with batch size {}'.format(device, batch_size))
Example #20
def profile():

    print("Profiling model")

    vec_env = make_env(config.eval_scenarios,
                       config.parallel_envs,
                       name="profile")
    algo = make_algo(vec_env, config)
    # get a trace
    with profiler.profile(profile_memory=True,
                          record_shapes=True,
                          use_cuda=True) as prof:
        with profiler.record_function("train_step"):
            algo.learn(algo.batch_size)
    prof.export_chrome_trace("trace.json")

    print("done.")
Example #21
def profile_model(device,
                  input_size=10,
                  num_units=10,
                  num_segments=20,
                  dim_context=15,
                  batch_size=4096,
                  iterations=10,
                  dendritic_layer_class=AbsoluteMaxGatingDendriticLayer):
    """Create dendritic layer using the specified layer type, and profile it."""

    print("\n\n=============== " + dendritic_layer_class.__name__ +
          " ================")
    use_cuda = device.type == "cuda"
    linear = torch.nn.Linear(input_size, num_units)
    dendrite_layer = dendritic_layer_class(module=linear,
                                           num_segments=num_segments,
                                           dim_context=dim_context,
                                           module_sparsity=0.7,
                                           dendrite_sparsity=0.9).to(device)

    dummy_tensor = torch.rand((batch_size, input_size), device=device)
    dummy_context = torch.rand((batch_size, dim_context), device=device)

    s = time.time()
    with profiler.profile(record_shapes=True, use_cuda=use_cuda) as prof:
        with profiler.record_function(dendritic_layer_class.__name__ +
                                      " inference"):
            res = dendrite_layer(dummy_tensor, dummy_context)
            for _ in range(iterations - 1):
                res += dendrite_layer(dummy_tensor, dummy_context)
    wall_clock = time.time() - s
    print("Wall clock:", wall_clock)

    if device.type == "cuda":
        print(prof.key_averages().table(sort_by="cuda_time_total",
                                        row_limit=10))
    else:
        print(prof.key_averages().table(sort_by="cpu_time_total",
                                        row_limit=10))

    if res.sum() == 0:  # Just to make Python think we need res
        print(res.sum())

    return wall_clock
Example #22
    def test_export_stacks(self):
        with profile(with_stack=True, use_kineto=kineto_available()) as p:
            x = torch.randn(10, 10)
            y = torch.randn(10, 10)
            z = torch.mm(x, y)
            z = z + y

        with tempfile.NamedTemporaryFile(mode="w+") as f:
            p.export_stacks(f.name)
            lines = f.readlines()
            assert len(lines) > 0, "Empty stacks file"
            for line in lines:
                is_int = False
                try:
                    assert int(line.split(" ")[-1]) > 0, "Invalid stacks record"
                    is_int = True
                except ValueError:
                    pass
                assert is_int, "Invalid stacks record"
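The folded format checked above is one stack per line: semicolon-separated frames, a space, then an integer metric value, which is what FlameGraph-style tools consume. A minimal sketch (the metric name below is the documented default; the flamegraph.pl step is an external tool, not part of PyTorch):

import torch
from torch.autograd.profiler import profile

with profile(with_stack=True) as p:
    torch.mm(torch.randn(10, 10), torch.randn(10, 10))
p.export_stacks("profiler_stacks.txt", "self_cpu_time_total")
# then, e.g.: flamegraph.pl --title "CPU time" profiler_stacks.txt > flame.svg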
Example #23
  def __call__(self):
    self.reset()
    for i in range(self.burn_iters):
      self.run()

    start = time.time()
    with profiler.profile() as prof:
      for i in range(self.iters):
        start1 = time.time()
        self.run()
        # th.cuda.synchronize()
        end1 = time.time()
        runtime1 = (end1-start1)*1000.0
        # print "iter {}: {:.2f}ms".format(i, runtime1)
      end = time.time()
    # print prof

    runtime = (end-start)*1000.0/self.iters

    return BenchmarkResult(self.name(), runtime, self.cuda)
Example #24
    def profile(self):
        import torch.autograd.profiler as profiler
        with profiler.profile(use_cuda=True) as prof:
            self.model.to(self.device)
            self.model.train()
    
            for batch_idx, (data, rgb, target) in enumerate(tqdm(self.data_loader)):
                if batch_idx == 30:
                    break
                    
                self.optimizer.zero_grad()
                data = data.to(self.device, non_blocking=True)
                rgb = rgb.to(self.device, non_blocking=True)
                target = target.to(self.device, non_blocking=True)
        
                output = self.model(data, rgb)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()

        print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
Example #25
def timeit_graph(batch_size,
                 model,
                 device_name,
                 n_epochs=1000,
                 n_stations=2,
                 half_precision=False):
    LOGGER.info("Starting measurement")
    LOGGER.info(f"No. CUDA devices: {torch.cuda.device_count()}")
    device = torch.device(device_name)
    use_cuda = device_name == 'cuda'
    model = GraphNet_v1().to(device)
    if half_precision:
        model = model.half()
    model.eval()
    N = 900
    E = 1300
    F = 5
    with profile(use_cuda=use_cuda) as prof:
        for _ in tqdm(range(n_epochs)):
            temp_X = torch.rand(batch_size, N, F)
            temp_Ri = torch.rand(batch_size, N, E)
            temp_Ro = temp_Ri

            if half_precision:
                temp_X = temp_X.half()
                temp_Ri = temp_Ri.half()
                temp_Ro = temp_Ro.half()
            #print(temp_lengths)
            graph = (temp_X, temp_Ri, temp_Ro)
            preds = model(graph)

    table = prof.key_averages().table()
    print(table)
    result = 'Speed:', round(
        (batch_size * n_epochs) /
        float(str(table).split('\n')[-2].split(' ')[-1].strip('s')),
        3), 'elements/s'
    LOGGER.info(table)
    LOGGER.info(result)
    print(result)
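Scraping the rendered table for the total time, as Example #25 does, is brittle across PyTorch versions; the averaged event list exposes the total directly (Example #28 below uses the same property). A sketch, continuing from the variables of Example #25:

total_us = prof.key_averages().self_cpu_time_total  # microseconds
speed = round((batch_size * n_epochs) / (total_us / 1e6), 3)
print('Speed:', speed, 'elements/s')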
Example #26
File: profile.py Project: LordHui/inr-gan
def profile_for_batch_size(G: nn.Module, cfg: DictConfig, batch_size: int):
    z = torch.randn(batch_size, G.z_dim, device=cfg.device)
    c = None
    times = []

    for i in tqdm(range(cfg.num_warmup_iters), desc='Warming up'):
        torch.cuda.synchronize()
        fake_img = G(z, c).contiguous()
        y = fake_img[0, 0, 0, 0].item()  # sync
        torch.cuda.synchronize()

    time.sleep(1)

    torch.cuda.reset_peak_memory_stats()

    with profiler.profile(record_shapes=True, use_cuda=True) as prof:
        for i in tqdm(range(cfg.num_profile_iters), desc='Profiling'):
            torch.cuda.synchronize()
            start_time = time.time()
            with profiler.record_function("forward"):
                fake_img = G(z, c).contiguous()
                y = fake_img[0, 0, 0, 0].item()  # sync
            torch.cuda.synchronize()
            times.append(time.time() - start_time)

    torch.cuda.empty_cache()
    num_imgs_processed = len(times) * batch_size
    total_time_spent = np.sum(times)
    bandwidth = num_imgs_processed / total_time_spent
    summary = prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)

    print(
        f'[Batch size: {batch_size}] Mean: {np.mean(times):.05f}s/it. Std: {np.std(times):.05f}s'
    )
    print(f'[Batch size: {batch_size}] Imgs/sec: {bandwidth:.03f}')
    print(
        f'[Batch size: {batch_size}] Max mem: {torch.cuda.max_memory_allocated(cfg.device) / 2**30:<6.2f} gb'
    )

    return bandwidth, summary
Example #27
def main(argv):
    if FLAGS.bindsnet:
        import bindsnet_lif

        run_benchmark(bindsnet_lif.lif_feed_forward_benchmark, "BindsNET_lif")
    if FLAGS.genn:
        import genn_lif

        run_benchmark(genn_lif.lif_feed_forward_benchmark, "GeNN_lif")
    if FLAGS.norse:
        import norse_lif

        if FLAGS.profile:
            import torch.autograd.profiler as profiler

            with profiler.profile(profile_memory=True,
                                  use_cuda=(FLAGS.device == "cuda")) as prof:
                run_benchmark(norse_lif.lif_feed_forward_benchmark,
                              "Norse_lif")
            prof.export_chrome_trace("trace.json")
        else:
            run_benchmark(norse_lif.lif_feed_forward_benchmark, "Norse_lif")
Example #28
def do_cuda_timing(f, inp, context=None, n_loops=100):
    '''
        Get timings of cuda modules. Note `self_cpu_time_total` is returned, but
        from experiments this appears to be similar/same to the total CUDA time

        f :  function to profile, typically an nn.Module
        inp : required input to f
        context : optional additional input into f, used for Decoder-style modules
    '''
    f.cuda()
    args = (inp.cuda(), )
    if exists(context): args += (context.cuda(), )
    with profiler.profile(record_shapes=False, use_cuda=True) as prof:
        with profiler.record_function("model_inference"):
            with torch.no_grad():
                for _ in range(n_loops):
                    f(*args)
                    torch.cuda.synchronize()
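                    # synchronizing per call makes CPU wall time include kernel
                    # execution, which is why self_cpu_time_total approximates
                    # the total CUDA time (as the docstring notes)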

    res = round((prof.key_averages().self_cpu_time_total / 1000) / n_loops, 3)
    print(f'{res}ms')
    return res
Example #29
    def test_mem_leak(self):
        """Checks that there's no memory leak when using profiler with CUDA
        """
        t = torch.rand(1, 1).cuda()
        p = psutil.Process()
        last_rss = collections.deque(maxlen=5)
        for outer_idx in range(10):
            with profile(use_cuda=True):
                for _ in range(1024):
                    t = torch.mm(t, t)

            gc.collect()
            torch.cuda.empty_cache()
            last_rss.append(p.memory_info().rss)

        max_diff = -1
        for idx in range(1, len(last_rss)):
            max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])

        # with CUDA events leaking the increase in memory was ~7 MB,
        # using much smaller threshold but not zero to reduce flakiness
        self.assertTrue(max_diff < 100 * 1024)
Example #30
    def test_kineto_profiler_api(self):
        called_num = [0]

        with profile(use_cuda=True, use_kineto=True):
            self.payload()

        def trace_handler(p):
            print(p.key_averages().table(
                sort_by="self_cuda_time_total", row_limit=-1))
            # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
            called_num[0] += 1

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
            schedule=torch.profiler.schedule(
                wait=1,
                warmup=1,
                active=2),
            on_trace_ready=trace_handler
        ) as p:
            for idx in range(8):
                self.payload()
                p.step()

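        # one schedule cycle is wait(1) + warmup(1) + active(2) = 4 steps,
        # so 8 steps fire on_trace_ready twice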
        self.assertEqual(called_num[0], 2)

        # case without a schedule
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA]
        ) as p:
            self.payload()
            self.payload()
        print(p.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1))
Example #31
def run_prof(use_cuda=False):
    with profiler.profile(use_cuda=use_cuda) as prof:
        exec(code, globs, None)
    return prof