Example #1
    def test_kineto(self):
        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        with _profile(use_cuda=use_cuda, use_kineto=True):
            self.payload(use_cuda=use_cuda)

        # rerun to avoid initial start overhead
        with _profile(use_cuda=use_cuda, use_kineto=True) as p:
            self.payload(use_cuda=use_cuda)
        output = p.key_averages().table(sort_by="self_cuda_time_total"
                                        if use_cuda else "self_cpu_time_total",
                                        row_limit=-1)
        # print(output)
        found_gemm = False
        found_memcpy = False
        found_mm = False
        for e in p.function_events:
            if "aten::mm" in e.name:
                found_mm = True
            if "gemm" in e.name:
                found_gemm = True
            if "Memcpy" in e.name or "memcpy" in e.name:
                found_memcpy = True
        if use_cuda:
            self.assertTrue(found_gemm)
            self.assertTrue(found_memcpy)
        else:
            self.assertTrue(found_mm)
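
This test uses the internal legacy entry point _profile; the same GEMM/memcpy check can be sketched against the public torch.profiler API. A minimal standalone sketch (my own illustration, not part of the test suite):

import torch
from torch.profiler import ProfilerActivity, profile, supported_activities

use_cuda = ProfilerActivity.CUDA in supported_activities()
activities = [ProfilerActivity.CPU] + (
    [ProfilerActivity.CUDA] if use_cuda else [])

with profile(activities=activities) as p:
    device = "cuda" if use_cuda else "cpu"
    x = torch.rand(64, 64, device=device)
    torch.mm(x, x)

# On CUDA builds the matmul shows up as a "gemm" kernel; on CPU as aten::mm.
print(p.key_averages().table(
    sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
    row_limit=10))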
Example #2
    def _test_profiler_tracing(self, use_kineto):
        with _profile(use_kineto=use_kineto) as prof:
            t1, t2 = torch.ones(1), torch.ones(1)
            torch.add(t1, t2)

        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            # read the trace and expect valid json
            # if the JSON generated by export_chrome_trace is not valid, this will throw and fail the test.
            with io.open(fname, 'r') as f:
                json.load(f)

        # test empty trace
        with _profile(use_kineto=use_kineto) as prof:
            pass
        # saving an empty trace
        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)

        # Same test but for cuda.
        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        if not use_cuda:
            return

        device = torch.device("cuda:0")
        with _profile(use_cuda=True, use_kineto=use_kineto) as prof:
            t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device)
            torch.add(t1, t2)

        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            # Now validate the json
            with io.open(fname, 'r') as f:
                json.load(f)
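
TemporaryFileName is a helper from the PyTorch test utilities; outside the harness the same JSON validity check works with the standard library. A minimal sketch, assuming a default (CPU) profile:

import json
import os
import tempfile

import torch
from torch.profiler import profile

with profile() as prof:
    torch.add(torch.ones(1), torch.ones(1))

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "trace.json")
    prof.export_chrome_trace(path)
    with open(path) as f:
        trace = json.load(f)  # raises if the exported trace is not valid JSON
assert "traceEvents" in trace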
Example #3
    def test_profiler_tracing(self):
        with _profile(use_kineto=kineto_available()) as prof:
            t1, t2 = torch.ones(1), torch.ones(1)
            torch.add(t1, t2)

        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            # read the trace and expect valid json
            # if the JSON generated by export_chrome_trace is not valid, this will throw and fail the test.
            with io.open(fname, 'r') as f:
                json.load(f)

        # Same test but for cuda.
        if not torch.cuda.is_available():
            return

        device = torch.device("cuda:0")
        with _profile(use_cuda=True, use_kineto=kineto_available()) as prof:
            t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device)
            torch.add(t1, t2)

        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            # Now validate the json
            with io.open(fname, 'r') as f:
                json.load(f)
Example #4
    def test_tensorboard_trace_handler(self):
        with _profile(use_cuda=True, use_kineto=True):
            self.payload()

        with TemporaryDirectoryName() as dname:
            with profile(
                    activities=[
                        torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.CUDA
                    ],
                    schedule=torch.profiler.schedule(wait=1,
                                                     warmup=1,
                                                     active=2,
                                                     repeat=3),
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        dname)) as p:
                for _ in range(18):
                    self.payload()
                    p.step()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
                file_num += 1
            self.assertEqual(file_num, 3)
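
The parts checks encode tensorboard_trace_handler's naming scheme, which in the PyTorch versions these tests target is "{worker_name}.{millisecond timestamp}.pt.trace.json" with worker_name defaulting to "{hostname}_{pid}"; file_num == 3 follows because schedule(wait=1, warmup=1, active=2, repeat=3) completes exactly three 4-step cycles within the 18 steps, emitting one file per active window. A hypothetical checker for the pattern (is_trace_file is illustration only, not a PyTorch API):

import re

def is_trace_file(name: str, gzipped: bool = False) -> bool:
    # Matches "<worker>.<millisecond timestamp>.pt.trace.json[.gz]".
    suffix = r"\.pt\.trace\.json" + (r"\.gz" if gzipped else "") + "$"
    return re.search(r"\.\d+" + suffix, name) is not None

assert is_trace_file("myhost_1234.1700000000000.pt.trace.json")
assert is_trace_file("myhost_1234.1700000000000.pt.trace.json.gz", gzipped=True)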
Example #5
    def test_mem_leak(self):
        """Checks that there's no memory leak when using profiler with CUDA
        """
        t = torch.rand(1, 1).cuda()
        p = psutil.Process()
        last_rss = collections.deque(maxlen=5)
        for outer_idx in range(10):
            with _profile(use_cuda=True):
                for _ in range(1024):
                    t = torch.mm(t, t)

            gc.collect()
            torch.cuda.empty_cache()
            last_rss.append(p.memory_info().rss)

        # with CUDA events leaking the increase in memory was ~7 MB between
        # profiler invocations above
        is_increasing = all(
            last_rss[idx] > last_rss[idx - 1]
            for idx in range(1, len(last_rss)))
        max_diff = max(
            last_rss[idx] - last_rss[idx - 1]
            for idx in range(1, len(last_rss)))
        self.assertTrue(not (is_increasing and max_diff > 100 * 1024),
                        msg='memory usage is increasing, {}'.format(last_rss))
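
The heuristic distilled: report a leak only when RSS grew between every one of the last five samples and at least one step exceeded 100 KB, which tolerates ordinary allocator noise. As a standalone helper (my own restatement, not part of the test suite):

def looks_like_leak(rss_samples, min_step=100 * 1024):
    # Leak only if RSS strictly increased between every consecutive pair of
    # samples AND at least one of those increases exceeded min_step bytes.
    diffs = [b - a for a, b in zip(rss_samples, rss_samples[1:])]
    return all(d > 0 for d in diffs) and max(diffs) > min_step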
Example #6
    def _record_function_with_param(self):
        u = torch.randn(3, 4, 5, requires_grad=True)
        with _profile(with_stack=True, use_kineto=kineto_available(),
                      record_shapes=True) as prof:
            with record_function("## TEST 1 ##", "1, 2, 3"):
                rf_handle = _record_function_with_args_enter(
                    "## TEST 2 ##", 1, False, 2.5, [u, u], "hello", u)
                _record_function_with_args_exit(rf_handle)
        return prof
Example #7
    def test_kineto_profiler_api(self):
        called_num = [0]

        with _profile(use_cuda=True, use_kineto=True):
            self.payload()

        def trace_handler(p):
            print(p.key_averages().table(sort_by="self_cuda_time_total",
                                         row_limit=-1))
            # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
            called_num[0] += 1

        with profile(activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
        ],
                     schedule=torch.profiler.schedule(wait=1,
                                                      warmup=1,
                                                      active=2),
                     on_trace_ready=trace_handler) as p:
            for _ in range(8):
                self.payload()
                p.step()

        self.assertEqual(called_num[0], 2)

        # case without schedule
        with profile(activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
        ]) as p:
            self.payload()
            self.payload()
        print(p.key_averages().table(sort_by="self_cuda_time_total",
                                     row_limit=-1))
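
The assertion called_num[0] == 2 is schedule arithmetic: one cycle is wait + warmup + active = 4 steps, repeat defaults to 0 (cycle indefinitely), and on_trace_ready fires once per completed active window, so 8 steps yield exactly 2 calls. A minimal sketch isolating that behavior:

import torch
from torch.profiler import ProfilerActivity, profile, schedule

calls = [0]

def handler(prof):
    calls[0] += 1  # invoked once per completed "active" window

with profile(activities=[ProfilerActivity.CPU],
             schedule=schedule(wait=1, warmup=1, active=2),
             on_trace_ready=handler) as p:
    for _ in range(8):
        torch.mm(torch.rand(8, 8), torch.rand(8, 8))
        p.step()

assert calls[0] == 2  # 8 steps / (1 + 1 + 2) steps per cycle = 2 windows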
Example #8
    def test_flops(self):
        model = torch.nn.Sequential(
            nn.Conv2d(16, 33, 18),
            nn.ReLU(),
            nn.Linear(243, 243),
            nn.ReLU(),
        )
        inputs = torch.randn(40, 16, 18, 260)
        with _profile(record_shapes=True,
                      with_flops=True,
                      use_kineto=kineto_available()) as prof:
            model(inputs)
        profiler_output = prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=10)
        self.assertIn("FLOPS", profiler_output)

        if not (kineto_available() and torch.cuda.is_available()):
            return

        with profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA
                ],
                record_shapes=True,
                with_flops=True,
        ) as kineto_profiler:
            model(inputs)
        profiler_output = kineto_profiler.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1)
        self.assertIn("FLOPS", profiler_output)
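
For a rough sanity check of the reported numbers: with the conventional count of 2 FLOPs per multiply-accumulate (an assumption about the reporting convention, not something this test pins down), Conv2d(16, 33, 18) maps the (40, 16, 18, 260) input to (40, 33, 1, 243), so the Linear layer sees 40 * 33 * 1 = 1320 rows of 243 features:

linear_flops = 2 * (40 * 33 * 1) * 243 * 243
print(linear_flops)  # 155889360, i.e. about 1.6e8 FLOPs for the Linear alone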
Example #9
    def test_tensorboard_trace_handler(self):
        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        with _profile(use_cuda=use_cuda, use_kineto=True):
            self.payload(use_cuda=use_cuda)

        with TemporaryDirectoryName() as dname:
            with profile(
                    activities=[torch.profiler.ProfilerActivity.CPU] +
                ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
                    schedule=torch.profiler.schedule(wait=1,
                                                     warmup=1,
                                                     active=2,
                                                     repeat=3),
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        dname)) as p:
                for _ in range(18):
                    self.payload(use_cuda=use_cuda)
                    p.step()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
                file_num += 1
            self.assertEqual(file_num, 3)

        # test case for gzip file format
        with TemporaryDirectoryName() as dname:
            p = profile(
                activities=[torch.profiler.ProfilerActivity.CPU] +
                ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
                schedule=torch.profiler.schedule(wait=1,
                                                 warmup=1,
                                                 active=2,
                                                 repeat=3),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    dname, use_gzip=True))
            p.start()
            for _ in range(18):
                self.payload(use_cuda=use_cuda)
                p.step()
            p.stop()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-5].isdigit() and int(parts[-5]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-4:], ['pt', 'trace', 'json', 'gz'])
                file_num += 1
            self.assertEqual(file_num, 3)
Example #10
    def test_kineto(self):
        with _profile(use_cuda=True, use_kineto=True):
            self.payload()

        # rerun to avoid initial start overhead
        with _profile(use_cuda=True, use_kineto=True) as p:
            self.payload()
        print(p.key_averages().table(sort_by="self_cuda_time_total",
                                     row_limit=-1))
        found_gemm = False
        found_memcpy = False
        for e in p.function_events:
            if "gemm" in e.name:
                found_gemm = True
            if "Memcpy" in e.name or "memcpy" in e.name:
                found_memcpy = True
        self.assertTrue(found_gemm)
        self.assertTrue(found_memcpy)
Example #11
    def test_source(self):
        """Checks that source code attribution works for eager, TS and autograd mode
        """
        # avoid automatic inlining
        prev_opt = torch._C._get_graph_executor_optimize()
        torch._C._set_graph_executor_optimize(False)

        @torch.jit.script
        def ts_method_2(x, y):
            return torch.matmul(x, y)

        @torch.jit.script
        def ts_method_1(x, y, z):
            a = x + z
            w = ts_method_2(x, y) + a
            return w.sum()

        class DummyModule(nn.Module):
            def __init__(self):
                super(DummyModule, self).__init__()
                self.conv = torch.nn.Conv2d(3,
                                            2,
                                            kernel_size=1,
                                            stride=2,
                                            padding=3,
                                            bias=False)

            def forward(self, x):
                return self.conv(x)

        mod = DummyModule()

        with _profile(with_stack=True, use_kineto=kineto_available()) as p:
            x = torch.randn(10, 10, requires_grad=True)
            y = torch.randn(10, 10, requires_grad=True)
            z = x + y
            w = ts_method_1(x, y, z)
            v = 2 * w
            v.backward()
            a = torch.randn(2, 3, 2, 2, requires_grad=True)
            b = mod(a)
            c = b.sum()
            c.backward()

        print(
            p.key_averages(group_by_stack_n=5).table(
                sort_by="self_cpu_time_total", row_limit=-1))

        for e in p.function_events:
            if "aten::add" in e.name or "AddBackward" in e.name:
                self.assertTrue(
                    any(["test_profiler" in entry for entry in e.stack]))
                self.assertTrue(
                    any([("test_source" in entry or "ts_method_1" in entry
                          or "ts_method_2" in entry) for entry in e.stack]))

        torch._C._set_graph_executor_optimize(prev_opt)
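
The substring checks against e.stack work because with_stack=True attaches a list of frame strings to each FunctionEvent, roughly "path/to/file.py(lineno): function_name" (the exact formatting is an implementation detail). A quick way to inspect them, reusing the _profile and kineto_available helpers imported by this test file:

with _profile(with_stack=True, use_kineto=kineto_available()) as p:
    torch.add(torch.rand(2), torch.rand(2))

for evt in p.function_events:
    if evt.stack:
        print(evt.name, "<-", evt.stack[0])
        break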
Example #12
    def run_profiler(tensor_creation_fn):
        # collecting allocs / deallocs
        with _profile(profile_memory=True, record_shapes=True,
                      use_kineto=kineto_available()) as prof:
            x = None
            with record_function("test_user_scope_alloc"):
                x = tensor_creation_fn()
            with record_function("test_user_scope_dealloc"):
                del x
        return prof.key_averages(group_by_input_shape=True)
Example #13
    def test_flops(self):
        model = torch.nn.Sequential(
            nn.Conv2d(16, 33, 18),
            nn.ReLU(),
            nn.Linear(243, 243),
            nn.ReLU(),
        )
        inputs = torch.randn(40, 16, 18, 260)
        with _profile(record_shapes=True, with_flops=True) as prof:
            model(inputs)
        profiler_output = prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=10)
        print(profiler_output)
        self.assertIn("FLOPS", profiler_output)
Example #14
    def test_datapipe_with_record_function_fork(self):
        with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof:
            input_dp = dp.iter.IterableWrapper(range(10))
            dp1, dp2, dp3 = input_dp.fork(num_instances=3)
            output1 = list(dp1)
        has_iter = False
        has_child = False
        for e in prof.function_events:
            if has_iter and has_child:
                break

            if not has_iter and e.name == "enumerate(DataPipe)#IterableWrapperIterDataPipe":
                has_iter = True
            if not has_child and e.name == "enumerate(DataPipe)#_ChildDataPipe":
                has_child = True
        self.assertTrue(has_iter)
        self.assertTrue(has_child)
Example #15
    def test_kineto_profiler_api(self):
        called_num = [0]

        use_cuda = torch.cuda.is_available()
        with _profile(use_cuda=use_cuda, use_kineto=True):
            self.payload(use_cuda=use_cuda)

        def trace_handler(p):
            output = p.key_averages().table(
                sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total", row_limit=-1)
            # print(output)
            # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
            called_num[0] += 1

        with profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU
            ] + ([
                torch.profiler.ProfilerActivity.CUDA
            ] if use_cuda else []),
            schedule=torch.profiler.schedule(
                wait=1,
                warmup=1,
                active=2),
            on_trace_ready=trace_handler
        ) as p:
            for _ in range(8):
                self.payload(use_cuda=use_cuda)
                p.step()

        self.assertEqual(called_num[0], 2)

        # case without schedule
        with profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU
            ] + ([
                torch.profiler.ProfilerActivity.CUDA
            ] if use_cuda else []),
        ) as p:
            self.payload(use_cuda=use_cuda)
            self.payload(use_cuda=use_cuda)
        output = p.key_averages().table(
            sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total", row_limit=-1)
Example #16
    def test_profiler_fwd_bwd_link(self):
        with _profile(use_kineto=True) as prof:
            t1 = torch.ones(1, requires_grad=True)
            t2 = torch.ones(1, requires_grad=True)
            z = torch.add(t1, t2)
            y = torch.ones(1)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
            loss.backward()
        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            with io.open(fname, 'r') as f:
                j = json.load(f)
                events = j["traceEvents"]
                ts_to_name = {}
                flow_s_to_ts = {}
                flow_f_to_ts = {}
                for e in events:
                    if e["ph"] == "X":
                        ts_to_name[e["ts"]] = e["name"]
                    if (e.get("cat") == "forward_backward"
                            and e.get("name") == "fwd_bwd"):
                        if e["ph"] == "s":
                            flow_s_to_ts[e["id"]] = e["ts"]
                        elif e["ph"] == "f":
                            flow_f_to_ts[e["id"]] = e["ts"]
                self.assertEqual(len(flow_s_to_ts), 2)
                self.assertEqual(len(flow_f_to_ts), 2)
                for flow_id in (1, 2):
                    self.assertIn(flow_id, flow_s_to_ts)
                    self.assertIn(flow_id, flow_f_to_ts)
                s_ts_1 = flow_s_to_ts[1]
                f_ts_1 = flow_f_to_ts[1]
                s_ts_2 = flow_s_to_ts[2]
                f_ts_2 = flow_f_to_ts[2]
                self.assertTrue(
                    all(ts in ts_to_name
                        for ts in [s_ts_1, f_ts_1, s_ts_2, f_ts_2]))
                self.assertEqual(ts_to_name[s_ts_1],
                                 "aten::binary_cross_entropy_with_logits")
                self.assertEqual(ts_to_name[s_ts_2], "aten::add")
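
In the Chrome trace format a flow arrow is a pair of events that share an "id", with ph == "s" marking the start and ph == "f" the finish; the profiler tags forward/backward links with cat == "forward_backward" and name == "fwd_bwd", which is exactly what the two dictionaries above collect. A hypothetical helper (illustration only) that pairs them up:

def pair_fwd_bwd_flows(events):
    # Returns {flow_id: (start_ts, finish_ts)} for fwd_bwd flow arrows.
    starts, finishes = {}, {}
    for e in events:
        if e.get("cat") == "forward_backward" and e.get("name") == "fwd_bwd":
            (starts if e["ph"] == "s" else finishes)[e["id"]] = e["ts"]
    return {i: (starts[i], finishes[i]) for i in starts if i in finishes}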
Example #17
    def test_datapipe_with_record_function(self):
        with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof:
            input_dp1 = dp.iter.IterableWrapper(range(4))
            input_dp2 = dp.iter.IterableWrapper(range(4, 8))
            input_dp3 = dp.iter.IterableWrapper(range(8, 12))
            output_dp = input_dp1.mux(input_dp2, input_dp3)
            output = list(output_dp)

        has_iter = False
        has_mux = False
        for e in prof.function_events:
            if has_iter and has_mux:
                break

            if not has_iter and e.name == "enumerate(DataPipe)#IterableWrapperIterDataPipe":
                has_iter = True
            if not has_mux and e.name == "enumerate(DataPipe)#MultiplexerIterDataPipe":
                has_mux = True
        self.assertTrue(has_iter)
        self.assertTrue(has_mux)
Example #18
    def test_export_stacks(self):
        with _profile(with_stack=True, use_kineto=kineto_available()) as p:
            x = torch.randn(10, 10)
            y = torch.randn(10, 10)
            z = torch.mm(x, y)
            z = z + y

        with TemporaryFileName(mode="w+") as fname:
            p.export_stacks(fname)
            with io.open(fname, 'r') as f:
                lines = f.readlines()
            assert len(lines) > 0, "Empty stacks file"
            for line in lines:
                # Each record must end with a positive integer metric value.
                try:
                    value = int(line.split(" ")[-1])
                except ValueError:
                    raise AssertionError("Invalid stacks record: " + line)
                assert value > 0, "Invalid stacks record"
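
Each line written by export_stacks is a collapsed stack record, "frame;frame;...;frame <value>", which is why the last space-separated token must parse as a positive integer. The format is directly consumable by FlameGraph-style tools; a minimal sketch reusing this file's _profile helper:

with _profile(with_stack=True, use_kineto=kineto_available()) as p:
    torch.mm(torch.rand(16, 16), torch.rand(16, 16))

p.export_stacks("profiler_stacks.txt", "self_cpu_time_total")
# then, e.g., with Brendan Gregg's FlameGraph scripts:
#   flamegraph.pl --countname us profiler_stacks.txt > flame.svg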
Example #19
    def test_high_level_trace(self):
        """Checks that python side high level events are recorded.
        """
        class RepeatedDataset(torch.utils.data.Dataset):
            def __init__(self, N, D_in, D_out):
                self.N = N
                self.x = torch.randn(N, D_in)
                self.y = torch.randn(N, D_out)

            def __len__(self):
                return self.N

            def __getitem__(self, idx):
                return self.x, self.y

        class TwoLayerNet(torch.nn.Module):
            def __init__(self, D_in, H, D_out):
                super(TwoLayerNet, self).__init__()
                self.linear1 = torch.nn.Linear(D_in, H)
                self.linear2 = torch.nn.Linear(H, D_out)

            def forward(self, x):
                h_relu = self.linear1(x).clamp(min=0)
                y_pred = self.linear2(h_relu)
                return y_pred

        class CustomSGD(torch.optim.SGD):
            def __init__(self, *args, **kwargs):
                super(CustomSGD, self).__init__(*args, **kwargs)

        def train():
            for _, data in enumerate(dataloader):
                x, y = data[0], data[1]
                y_pred = model(x)
                loss = criterion(y_pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        N, D_in, H, D_out = 8, 10, 5, 2
        model = TwoLayerNet(D_in, H, D_out)
        criterion = torch.nn.MSELoss(reduction='sum')
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
        ds = RepeatedDataset(N, D_in, D_out)
        dataloader = torch.utils.data.DataLoader(ds, batch_size=1)

        try:
            train()
        except Exception:
            self.fail("Expected no exception without profiling.")

        # Create multiple instances, expect each func is hooked only one time.
        # Nested wrappers(repeated patching) will make following test fail.
        optimizer_duplicate = torch.optim.SGD(model.parameters(), lr=1e-4)
        dataloader_duplicate = torch.utils.data.DataLoader(ds, batch_size=1)

        def judge(expected_event_count, prof):
            actual_event_count = {}
            for e in prof.function_events:
                if "#" in e.name:
                    key = e.name
                    if key in expected_event_count.keys():
                        actual_event_count[key] = (
                            actual_event_count.setdefault(key, 0) + 1)
            for key, count in expected_event_count.items():
                self.assertTrue((key in actual_event_count.keys())
                                and (count == actual_event_count[key]))

        with _profile(use_kineto=kineto_available()) as prof:
            train()
        expected_event_count = {
            # "+1" because the final iteration will enter __next__ but skip the loop body.
            "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__":
            (N + 1),
            "Optimizer.step#SGD.step": N,
            "Optimizer.zero_grad#SGD.zero_grad": N
        }
        judge(expected_event_count, prof)

        # Test on pickle/unpickle. Expect to work in multi-processing.
        optimizer = pickle.loads(pickle.dumps(optimizer))
        with _profile(use_kineto=kineto_available()) as prof:
            train()
        judge(expected_event_count, prof)

        # Test on customized optimizer.
        optimizer = CustomSGD(model.parameters(), lr=1e-4)
        with _profile(use_kineto=kineto_available()) as prof:
            train()
        expected_event_count = {
            "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__":
            (N + 1),
            "Optimizer.step#CustomSGD.step": N,
            "Optimizer.zero_grad#CustomSGD.zero_grad": N
        }
        judge(expected_event_count, prof)
Example #20
    def test_memory_profiler(self):
        def run_profiler(tensor_creation_fn):
            # collecting allocs / deallocs
            with _profile(profile_memory=True,
                          record_shapes=True,
                          use_kineto=kineto_available()) as prof:
                x = None
                with record_function("test_user_scope_alloc"):
                    x = tensor_creation_fn()
                with record_function("test_user_scope_dealloc"):
                    del x
            return prof.key_averages(group_by_input_shape=True)

        def check_metrics(stats, metric, allocs=None, deallocs=None):
            stat_metrics = {}
            for stat in stats:
                stat_metrics[stat.key] = getattr(stat, metric)
            if allocs is not None:
                for alloc_fn in allocs:
                    self.assertTrue(alloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[alloc_fn] > 0)
            if deallocs is not None:
                for dealloc_fn in deallocs:
                    self.assertTrue(dealloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[dealloc_fn] < 0)

        def create_cpu_tensor():
            return torch.rand(10, 10)

        def create_cuda_tensor():
            return torch.rand(10, 10).cuda()

        def create_mkldnn_tensor():
            return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()

        stats = run_profiler(create_cpu_tensor)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=[
                          "aten::empty",
                          "aten::rand",
                          "test_user_scope_alloc",
                      ],
                      deallocs=[
                          "test_user_scope_dealloc",
                      ])

        if kineto_available():
            with TemporaryFileName(mode="w+") as fname:
                with profile(profile_memory=True) as prof:
                    x = None
                    with record_function("test_user_scope_alloc"):
                        x = create_cpu_tensor()
                    with record_function("test_user_scope_dealloc"):
                        del x
                prof.export_chrome_trace(fname)
                with io.open(fname, 'r') as f:
                    trace = json.load(f)
                    assert "traceEvents" in trace
                    events = trace["traceEvents"]
                    found_memory_events = False
                    for evt in events:
                        assert "name" in evt
                        if evt["name"] == "[memory]":
                            found_memory_events = True
                            assert "args" in evt
                            assert "Device Type" in evt["args"]
                            assert "Device Id" in evt["args"]
                            assert "Bytes" in evt["args"]
                    assert found_memory_events

        if torch.cuda.is_available():
            create_cuda_tensor()
            stats = run_profiler(create_cuda_tensor)
            check_metrics(stats,
                          "cuda_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::to",
                              "aten::empty_strided",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "aten::rand",
                              "aten::empty",
                          ])

        if torch._C.has_mkldnn:
            create_mkldnn_tensor()
            stats = run_profiler(create_mkldnn_tensor)
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::rand",
                              "aten::empty",
                              "aten::to_mkldnn",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])

        # check top-level memory events
        with _profile(profile_memory=True,
                      use_kineto=kineto_available()) as prof:
            x = torch.rand(10, 10)
            del x
            if torch.cuda.is_available():
                y = torch.rand(10, 10).cuda()
                del y
            gc.collect()
        stats = prof.key_averages(group_by_input_shape=True)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=["aten::rand", "aten::empty"],
                      deallocs=["[memory]"])
        if torch.cuda.is_available():
            check_metrics(stats, "cuda_memory_usage", deallocs=["[memory]"])
Example #21
    def test_memory_profiler(self):
        def run_profiler(tensor_creation_fn):
            # collecting allocs / deallocs
            with _profile(profile_memory=True,
                          record_shapes=True,
                          use_kineto=kineto_available()) as prof:
                x = None
                with record_function("test_user_scope_alloc"):
                    x = tensor_creation_fn()
                with record_function("test_user_scope_dealloc"):
                    del x
            return prof.key_averages(group_by_input_shape=True)

        def check_metrics(stats, metric, allocs=None, deallocs=None):
            stat_metrics = {}
            for stat in stats:
                stat_metrics[stat.key] = getattr(stat, metric)
            if allocs is not None:
                for alloc_fn in allocs:
                    self.assertTrue(alloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[alloc_fn] > 0)
            if deallocs is not None:
                for dealloc_fn in deallocs:
                    self.assertTrue(dealloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[dealloc_fn] < 0)

        def create_cpu_tensor():
            return torch.rand(10, 10)

        def create_cuda_tensor():
            return torch.rand(10, 10).cuda()

        def create_mkldnn_tensor():
            return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()

        stats = run_profiler(create_cpu_tensor)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=[
                          "aten::empty",
                          "aten::rand",
                          "test_user_scope_alloc",
                      ],
                      deallocs=[
                          "test_user_scope_dealloc",
                      ])

        if torch.cuda.is_available():
            create_cuda_tensor()
            stats = run_profiler(create_cuda_tensor)
            check_metrics(stats,
                          "cuda_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::to",
                              "aten::empty_strided",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "aten::rand",
                              "aten::empty",
                          ])

        if torch._C.has_mkldnn:
            create_mkldnn_tensor()
            stats = run_profiler(create_mkldnn_tensor)
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::rand",
                              "aten::empty",
                              "aten::to_mkldnn",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])

        # check top-level memory events
        with _profile(profile_memory=True,
                      use_kineto=kineto_available()) as prof:
            x = torch.rand(10, 10)
            del x
            if torch.cuda.is_available():
                y = torch.rand(10, 10).cuda()
                del y
            gc.collect()
        stats = prof.key_averages(group_by_input_shape=True)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=["aten::rand", "aten::empty"],
                      deallocs=["[memory]"])
        if torch.cuda.is_available():
            check_metrics(stats, "cuda_memory_usage", deallocs=["[memory]"])
Example #22
    def test_source(self):
        """Checks that source code attribution works for eager, TS and autograd mode
        """
        # avoid automatic inlining
        prev_opt = torch._C._get_graph_executor_optimize()
        torch._C._set_graph_executor_optimize(False)

        @torch.jit.script
        def ts_method_2(x, y):
            return torch.matmul(x, y)

        @torch.jit.script
        def ts_method_1(x, y, z):
            a = x + z
            w = ts_method_2(x, y) + a
            return w.sum()

        class DummyModule(nn.Module):
            def __init__(self):
                super(DummyModule, self).__init__()
                self.conv = torch.nn.Conv2d(3,
                                            2,
                                            kernel_size=1,
                                            stride=2,
                                            padding=3,
                                            bias=False)

            def forward(self, x):
                return self.conv(x)

        mod = DummyModule()

        def call_module(x):
            return mod(x)

        with _profile(with_stack=True, use_kineto=kineto_available()) as p:
            x = torch.randn(10, 10, requires_grad=True)
            y = torch.randn(10, 10, requires_grad=True)
            z = x + y
            w = ts_method_1(x, y, z)
            v = 2 * w
            v.backward()
            a = torch.randn(2, 3, 2, 2, requires_grad=True)
            b = call_module(a)
            c = b.sum()
            c.backward()

        for e in p.function_events:
            if "aten::add" in e.name or "AddBackward" in e.name:
                self.assertTrue(
                    any(["test_profiler" in entry for entry in e.stack]))
                self.assertTrue(
                    any([("test_source" in entry or "ts_method_1" in entry
                          or "ts_method_2" in entry) for entry in e.stack]))

        # TODO: https://github.com/pytorch/kineto/issues/617
        if kineto_available() and not IS_WINDOWS:
            with TemporaryFileName(mode="w+") as fname:
                p.export_chrome_trace(fname)
                with io.open(fname, 'r') as f:
                    events = json.load(f)["traceEvents"]

                def extract(pattern: str):
                    matches = [
                        e for e in events if re.search(pattern, e["name"])
                    ]
                    self.assertEqual(len(matches), 1,
                                     repr([e["name"] for e in matches]))
                    return matches[0]

                module_event = extract(r"DummyModule_0")
                wrapper_event = extract(r"call_module")
                self.assertEqual(module_event["args"]["Python parent id"],
                                 wrapper_event["args"]["Python id"])

        torch._C._set_graph_executor_optimize(prev_opt)