def run_benchmark(bsz, mean_i, mean_j, var, autograd, writer):
    RAND_INTS = [(int(random.gauss(mean_j, var)), int(random.gauss(mean_i, var)))
                 for _ in range(bsz)]
    src_ = nestedtensor.nested_tensor(
        [torch.randn(NDIM * i * j).float().reshape(NDIM, i, j)
         for (i, j) in RAND_INTS],
        device=DEVICE, dtype=torch.float)
    src = []
    for i, s in enumerate(src_):
        src.append(i * len(s) + s)
    detr_nt_src = DETRNestedTensor.from_tensor_list(src)
    # Fraction of padding in the dense mask, rounded down to one decimal place.
    sparsity = int(detr_nt_src.decompose()[1].float().mean().item() * 10) / 10

    def gen_t_loop_mha(src):
        detr_nt_src = DETRNestedTensor.from_tensor_list(src)
        src, mask = detr_nt_src.decompose()
        src = src.flatten(2).permute(2, 0, 1).contiguous()
        mask = mask.flatten(1).contiguous()
        if autograd:
            src.requires_grad_()

        def te():
            # Forward + backward when autograd is measured, forward only otherwise.
            if autograd:
                MODEL(src, src, src, key_padding_mask=mask,
                      need_weights=False)[0].sum().backward()
            else:
                MODEL(src, src, src, key_padding_mask=mask, need_weights=False)
        return te

    def gen_nt_mha(src):
        src = nestedtensor.nested_tensor(
            [t.flatten(1).permute(1, 0) for t in src],
            device=DEVICE, dtype=torch.float, requires_grad=True)

        def nt():
            if autograd:
                MODEL(src, src, src, need_weights=False)[0].sum().backward()
            else:
                MODEL(src, src, src, need_weights=False)
        return nt

    result_t = {**utils.benchmark_fn(gen_t_loop_mha(src), 5.0, cuda=True),
                "bsz": bsz, "sparsity": sparsity, "autograd": autograd,
                "var": var, "mean_i": mean_i, "mean_j": mean_j}
    result_t["numel"] = sum(x.numel() for x in src_)
    result_t["numel_div_avg_us"] = result_t["numel"] / result_t["avg_us"]
    result_t["avg_ns_div_numel"] = result_t["avg_us"] / result_t["numel"] * 1000
    writer.writerow(result_t)

    result_nt = {**utils.benchmark_fn(gen_nt_mha(src), 5.0, cuda=True),
                 "bsz": bsz, "sparsity": 0.0, "autograd": autograd,
                 "var": var, "mean_i": mean_i, "mean_j": mean_j}
    result_nt["numel"] = sum(x.numel() for x in src_)
    result_nt["numel_div_avg_us"] = result_nt["numel"] / result_nt["avg_us"]
    result_nt["avg_ns_div_numel"] = result_nt["avg_us"] / result_nt["numel"] * 1000
    writer.writerow(result_nt)
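# `utils.benchmark_fn` is not shown in this file. A minimal sketch of the
# interface the results above rely on (hypothetical; the real helper may
# differ): run the closure repeatedly for `run_time` seconds, synchronizing
# CUDA when `cuda=True` so asynchronous kernel launches are actually counted,
# and return a dict with at least "avg_us" and "std_us".
import statistics
import time

import torch


def benchmark_fn_sketch(fn, run_time=5.0, cuda=False, warmup=1.0):
    end = time.perf_counter() + warmup
    while time.perf_counter() < end:
        fn()  # warm up caches, allocator, CUDA context
    if cuda:
        torch.cuda.synchronize()
    times_us = []
    end = time.perf_counter() + run_time
    while time.perf_counter() < end:
        start = time.perf_counter()
        fn()
        if cuda:
            # Wait for outstanding kernels before stopping the clock.
            torch.cuda.synchronize()
        times_us.append((time.perf_counter() - start) * 1e6)
    return {"avg_us": statistics.mean(times_us),
            "std_us": statistics.stdev(times_us) if len(times_us) > 1 else 0.0}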
def run(self):
    params = itertools.product(
        self.args.cuda,
        self.args.N,
        self.args.C,
        self.args.H,
        self.args.W,
        self.args.seed,
    )
    if self.args.V:
        var_params = [(v, v) for v in self.args.V]
    else:
        # Materialize the product: an exhausted iterator would leave every
        # grid point after the first without variance parameters.
        var_params = list(itertools.product(self.args.HV, self.args.WV))
    # Cross every base grid point with every variance pair, then flatten.
    params = [[p + v for v in var_params] for p in params]
    params = sum(params, [])
    writer = None
    i = 0
    for cuda, n, c, h, w, seed, h_var, w_var in params:
        # Generate inputs before iterating over layers so that every layer
        # sees the same input.
        self.inputs, self.targets = self.get_input(
            cuda, n, c, h, w, h_var, w_var, seed)
        benchmarks = [(layer, self.get_benchmark(c, layer, cuda))
                      for layer in self.args.layers]
        for layer, benchmark in benchmarks:
            result = utils.benchmark_fn(
                benchmark, run_time=self.args.run_time, warmup=self.args.warmup)
            result["#"] = str(i) + "/" + str(len(benchmarks) * len(params))
            result["N"] = n
            result["C"] = c
            result["H"] = h
            result["W"] = w
            result["h_var"] = h_var
            result["w_var"] = w_var
            result["seed"] = seed
            result["avg_us"] = int(result["avg_us"])
            result["std_us"] = int(result["std_us"])
            result["name"] = layer
            result["cuda"] = cuda
            result["numel"] = sum(x.numel() for x in self.inputs)
            if writer is None and self.args.csv_log:
                writer = csv.DictWriter(open(self.args.csv_log, 'w'),
                                        fieldnames=result.keys())
                writer.writeheader()
            if writer is not None:
                writer.writerow(result)
            print(",".join(str((str(key), result[key]))
                           for key in sorted(result.keys())))
            i += 1
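# The sweep above crosses the base grid with every (h_var, w_var) pair and
# then flattens the nested lists. The same trick in isolation:
import itertools

base = list(itertools.product([True], [2, 4]))  # e.g. (cuda, N)
var_params = [(0.1, 0.1), (0.5, 0.5)]           # (h_var, w_var)
flat = sum([[p + v for v in var_params] for p in base], [])
assert flat == [(True, 2, 0.1, 0.1), (True, 2, 0.5, 0.5),
                (True, 4, 0.1, 0.1), (True, 4, 0.5, 0.5)]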
def gen_nt_unbind():
    nested_tensor = nestedtensor.nested_tensor(
        [torch.rand(i, 25) for i in RAND_INTS])

    def nt():
        nested_tensor.unbind()
    return nt


def gen_ant_unbind():
    nested_tensor = nestedtensor.as_nested_tensor(
        [torch.rand(i, 25) for i in RAND_INTS])

    def ant():
        nested_tensor.unbind()
    return ant


def gen_nt_unbind_2():
    nested_tensor = nestedtensor.nested_tensor(
        [[torch.rand(i, 25) for i in RAND_INTS] for _ in range(100)])

    def nt_2():
        [t.unbind() for t in nested_tensor.unbind()]
    return nt_2


def gen_ant_unbind_2():
    nested_tensor = nestedtensor.as_nested_tensor(
        [[torch.rand(i, 25) for i in RAND_INTS] for _ in range(100)])

    def ant_2():
        [t.unbind() for t in nested_tensor.unbind()]
    return ant_2


if __name__ == "__main__":
    print(utils.benchmark_fn(gen_nt_unbind()))
    print(utils.benchmark_fn(gen_ant_unbind()))
    print(utils.benchmark_fn(gen_nt_unbind_2()))
    print(utils.benchmark_fn(gen_ant_unbind_2()))
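# What the nt/ant pairs above compare: as used in this prototype,
# `nestedtensor.nested_tensor` copies its inputs while
# `nestedtensor.as_nested_tensor` tries to avoid the copy, mirroring the
# `torch.tensor` / `torch.as_tensor` split. A sketch under that assumption:
ts = [torch.rand(3, 25), torch.rand(7, 25)]
nt = nestedtensor.nested_tensor(ts)       # owns copies of the inputs
ant = nestedtensor.as_nested_tensor(ts)   # may alias the inputs when possible
assert len(nt.unbind()) == len(ant.unbind()) == 2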
def gen_t_cos():
    # Upper bound: one contiguous buffer, a single kernel launch.
    tensor = torch.cat([torch.rand(i, 2560).reshape(-1) for i in RAND_INTS])
    tensor = tensor.cuda()

    def t():
        tensor.cos_()
    return t


def gen_t_loop_cos():
    tensors = [torch.rand(i, 2560).cuda() for i in RAND_INTS]

    def t_loop():
        for t in tensors:
            t.cos_()
    return t_loop


def gen_nt_cos():
    nested_tensor = nestedtensor.nested_tensor(
        [torch.rand(i, 2560).cuda() for i in RAND_INTS])

    def nt():
        nested_tensor.cos_()
    return nt


if __name__ == "__main__":
    print(utils.benchmark_fn(gen_t_cos()))
    print(utils.benchmark_fn(gen_t_loop_cos()))
    print(utils.benchmark_fn(gen_nt_cos()))
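# The three variants bracket the expected range: the flat buffer is a single
# kernel launch, the Python loop pays len(RAND_INTS) launches, and the nested
# op sits in between. To spot-check one variant without utils, CUDA events
# give launch-aware timings (a sketch; assumes a CUDA device is available):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
x = torch.rand(1000, 2560, device='cuda')
start.record()
x.cos_()
end.record()
torch.cuda.synchronize()  # elapsed_time is only valid once both events complete
print('cos_ took %.3f ms' % start.elapsed_time(end))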
# Performance tanks hard for lots of small tensors, as expected.
RAND_INTS = [random.randint(10, 30) for _ in range(2000)]
OUTDIM = 256

TENSORS0 = [torch.rand(i, OUTDIM).cuda() for i in RAND_INTS]


def gen_t_matmul():
    nt0 = nestedtensor.nested_tensor(
        TENSORS0, device=torch.device('cuda'), dtype=torch.float)
    data, _ = nt0.to_tensor_mask()
    t1 = torch.randn(OUTDIM, 512).cuda()

    def t():
        torch.matmul(data, t1)
    return t


@torch.inference_mode()
def gen_nt_matmul():
    nt0 = nestedtensor.nested_tensor(
        TENSORS0, device=torch.device('cuda'), dtype=torch.float)
    t1 = torch.randn(OUTDIM, 512).cuda()

    def nt():
        torch.matmul(nt0, t1)
    return nt


if __name__ == "__main__":
    print(utils.benchmark_fn(gen_t_matmul()))
    print(utils.benchmark_fn(gen_nt_matmul()))
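# What `to_tensor_mask()` hands to the dense matmul, illustrated with plain
# torch (a sketch; the real API returns an analogous (padded data, mask) pair,
# and mask conventions vary — here True marks valid entries):
a, b = torch.rand(2, 4), torch.rand(3, 4)
data = torch.zeros(2, 3, 4)  # padded to the max ragged length
data[0, :2], data[1, :3] = a, b
mask = torch.tensor([[True, True, False],
                     [True, True, True]])
# matmul on `data` does wasted work on the padded rows, which is exactly the
# overhead gen_t_matmul measures against gen_nt_matmul.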
def gen_my_fun(scalar, tensor):
    @torch.jit.ignore
    def get_scalar() -> float:
        return scalar

    @torch.jit.ignore
    def get_tensor() -> torch.Tensor:
        return tensor

    @torch.jit.script
    def my_fun(x, y):
        x = x + get_scalar()
        x = x + get_tensor()
        y = y + x.abs()
        return y
    return my_fun


my_fun = gen_my_fun(3.0, torch.randn(1).to(device='cuda'))


def gen_jit():
    def _algorithm_jit():
        nestedtensor._C.jit_apply_function((n, n), my_fun)
    return _algorithm_jit


if __name__ == "__main__":
    # print(utils.benchmark_fn(alg, use_cprofile=True))
    # alg = gen_list_nested_tensor_construction()
    # print(utils.benchmark_fn(alg))
    alg1 = gen_current()
    print(utils.benchmark_fn(alg1))
    alg2 = gen_jit()
    print(utils.benchmark_fn(alg2))
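# Why the `torch.jit.ignore` wrappers exist: a scripted function cannot
# capture Python closure variables directly, but it can call back into an
# ignored (eager) function that does. The same pattern in isolation:
def make_adder(offset: float):
    @torch.jit.ignore
    def get_offset() -> float:
        return offset

    @torch.jit.script
    def add(x: torch.Tensor) -> torch.Tensor:
        return x + get_offset()
    return add


print(make_adder(2.0)(torch.ones(3)))  # tensor([3., 3., 3.])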
import random

import torch

import utils


def gen_list_nested_tensor_construction():
    tensors = [torch.rand(random.randint(500, 1500), 25600) for _ in range(20)]

    def _algorithm():
        torch._ListNestedTensor(tensors)
    return _algorithm


def gen_list_nested_tensor_unbind():
    nested_tensor = torch._ListNestedTensor(
        [torch.rand(random.randint(500, 1500), 25600) for _ in range(20)])

    def _algorithm():
        nested_tensor.unbind()
    return _algorithm


if __name__ == "__main__":
    # print(utils.benchmark_fn(alg, use_cprofile=True))
    # alg = gen_list_nested_tensor_construction()
    # print(utils.benchmark_fn(alg))
    alg = gen_list_nested_tensor_unbind()
    print(utils.benchmark_fn(alg))
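# A quick sanity check of the operation being timed (assumes the prototype
# build that provides the private `torch._ListNestedTensor` type used above):
nt = torch._ListNestedTensor([torch.rand(2, 3), torch.rand(4, 3)])
print([t.shape for t in nt.unbind()])
# expected: [torch.Size([2, 3]), torch.Size([4, 3])]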
return_layers = {'layer4': 'out'}
MODEL = torchvision.models._utils.IntermediateLayerGetter(
    backbone, return_layers=return_layers).cuda()


def gen_t_loop_segmentation():
    tensors = [torch.rand(1, 3, i, 256).cuda() for i in RAND_INTS]

    def t_loop():
        for t in tensors:
            MODEL(t)['out'].sum().backward()
    return t_loop


def gen_nt_segmentation():
    nested_tensor = nestedtensor.nested_tensor(
        [torch.rand(3, i, 256) for i in RAND_INTS],
        device=torch.device('cuda'), dtype=torch.float)

    def nt():
        MODEL(nested_tensor)['out'].sum().backward()
    return nt


if __name__ == "__main__":
    # print(utils.benchmark_fn(gen_t_loop_segmentation(), 10.0))
    print(utils.benchmark_fn(gen_nt_segmentation(), 2.0))
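# IntermediateLayerGetter rewires a backbone so the forward pass returns the
# named intermediate activations; here only 'layer4' is kept under the key
# 'out'. Standalone usage with a dense input (standard torchvision API):
resnet = torchvision.models.resnet50()
getter = torchvision.models._utils.IntermediateLayerGetter(
    resnet, return_layers={'layer4': 'out'})
feats = getter(torch.rand(1, 3, 224, 224))
print(feats['out'].shape)  # torch.Size([1, 2048, 7, 7])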
def gen_t_mul():
    # Upper bound: one contiguous buffer per operand, a single kernel launch.
    tensor1 = torch.cat([torch.rand(i, 2560).reshape(-1) for i in RAND_INTS]).cuda()
    tensor2 = torch.cat([torch.rand(i, 2560).reshape(-1) for i in RAND_INTS]).cuda()

    def t():
        tensor1.mul(tensor2)
    return t


def gen_t_loop_mul():
    tensors1 = [torch.rand(i, 2560).cuda() for i in RAND_INTS]
    tensors2 = [torch.rand(i, 2560).cuda() for i in RAND_INTS]

    def t_loop():
        for t1, t2 in zip(tensors1, tensors2):
            t1.mul(t2)
    return t_loop


def gen_nt_mul():
    nested_tensor1 = nestedtensor.nested_tensor(
        [torch.rand(i, 2560).cuda() for i in RAND_INTS])
    nested_tensor2 = nestedtensor.nested_tensor(
        [torch.rand(i, 2560).cuda() for i in RAND_INTS])

    def nt():
        nested_tensor1.mul(nested_tensor2)
    return nt


if __name__ == "__main__":
    print(utils.benchmark_fn(gen_t_mul()))
    print(utils.benchmark_fn(gen_t_loop_mul()))
    print(utils.benchmark_fn(gen_nt_mul()))
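# A correctness spot-check relating the nested op to the per-tensor loop
# (a sketch, using only the nestedtensor calls already exercised above):
a = [torch.rand(3, 4), torch.rand(5, 4)]
b = [torch.rand(3, 4), torch.rand(5, 4)]
out = nestedtensor.nested_tensor(a).mul(nestedtensor.nested_tensor(b))
for o, x, y in zip(out.unbind(), a, b):
    assert torch.allclose(o, x.mul(y))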