def subprocess_main(args):
    """Benchmark fuzzed unary-op inputs and append every measurement to a file.

    For each dtype string listed under ``args.pr`` in ``_DTYPES_TO_TEST``,
    ``_RUNS_PER_LOOP`` fuzzed inputs are drawn from ``unary.UnaryOpFuzzer``.
    The statement built by ``construct_stmt_and_label`` is timed with
    ``Timer.blocked_autorange``; each measurement is printed and pickled
    (in append mode) to ``args.DETAIL_result_file``.
    """
    seed = args.DETAIL_seed
    use_cuda = args.DETAIL_device == _GPU

    with open(args.DETAIL_result_file, "ab") as result_fh:
        for dtype_str in _DTYPES_TO_TEST[args.pr]:
            fuzzer = unary.UnaryOpFuzzer(
                seed=seed,
                dtype=_DTYPE_STR_TO_DTYPE[dtype_str],
                cuda=use_cuda,
            )
            run_index = 0
            for tensors, tensor_parameters, params in fuzzer.take(_RUNS_PER_LOOP):
                # The dtype is recorded in `params` before the statement is
                # built, so `construct_stmt_and_label` can see it.
                params["dtype_str"] = dtype_str
                stmt, label = construct_stmt_and_label(args.pr, params)

                description = f"[{run_index}, seed={seed}] ({dtype_str}), stmt = {stmt}"
                timer = Timer(
                    stmt=stmt,
                    globals=tensors,
                    label=label,
                    description=description,
                    env=args.DETAIL_env,
                )
                measurement = timer.blocked_autorange(min_run_time=_MIN_RUN_SEC)

                # Attach the fuzzer configuration so the result is self-describing.
                measurement.metadata = {
                    "tensor_parameters": tensor_parameters,
                    "params": params,
                }
                print(measurement)
                pickle.dump(measurement, result_fh)
                run_index += 1
def benchmark_filterer(
    dataset: Dataset,
    filterer: Hint[Filterer],
    filterer_kwargs: Optional[Mapping[str, Any]] = None,
) -> Iterable[Mapping[str, Any]]:
    """Yield timing records for constructing and applying a filterer.

    The first record (``operation="index"``) times instantiating the filterer
    on the training factory. One further record per non-training entry of
    ``dataset.factory_dict`` (``operation="inference"``) times a call of the
    filterer on that subset's mapped triples and reports the mean of the
    negated second element of the filterer's output as ``observed_error_rate``.
    """
    if not filterer_kwargs:
        filterer_kwargs = {}

    # Metadata that is merged into every yielded record.
    kwargs = dict(
        dataset=dataset.get_normalized_name(),
        filterer=filterer,
        **filterer_kwargs,
    )
    filterer_cls = filterer_resolver.lookup(filterer)

    tqdm.write(f'[{filterer_cls.__name__}] measure creation (=indexing) time')
    creation_namespace = {
        "filterer_cls": filterer_cls,
        "factory": dataset.training,
        "kwargs": filterer_kwargs,
    }
    creation_timer = TorchTimer(
        stmt="filterer_cls(triples_factory=factory, **kwargs)",
        globals=creation_namespace,
    )
    creation_measurement = creation_timer.blocked_autorange()
    yield dict(
        operation="index",
        subset="train",
        time=creation_measurement.median,
        num_triples=dataset.training.num_triples,
        **kwargs,
    )

    # From here on `filterer` is a concrete instance used for inference timing.
    filterer = filterer_resolver.make(
        filterer,
        pos_kwargs=filterer_kwargs,
        triples_factory=dataset.training,
    )
    for key, value in dataset.factory_dict.items():
        if key == 'training':
            continue

        tqdm.write(f'[{filterer}] measure inference time ({key})')
        inference_namespace = {
            "filterer": filterer,
            "mapped_triples": value.mapped_triples,
        }
        inference_timer = TorchTimer(
            stmt="filterer(mapped_triples)",
            globals=inference_namespace,
        )
        inference_measurement = inference_timer.blocked_autorange()

        # Correctness check: fraction of entries whose flag (output index 1) is False.
        flags = filterer(value.mapped_triples)[1]
        negated_mean = (~flags).float().mean()
        error_rate = float(negated_mean.item())

        yield dict(
            operation="inference",
            subset=key,
            time=inference_measurement.median,
            num_triples=value.num_triples,
            observed_error_rate=error_rate,
            **kwargs,
        )
def time_with_torch_timer(fn, args, string_id, kwargs=None):
    """Print forward and backward latency of ``fn`` measured with a torch ``Timer``.

    Args:
        fn: Callable invoked as ``fn(*args, **kwargs)``; its result must support
            ``.backward(grad)``.
        args: Positional arguments for ``fn``.
        string_id: Label used in the printed banners.
        kwargs: Optional keyword arguments for ``fn``. Defaults to no keyword
            arguments.

    The forward latency is the mean of 1000 timed runs scaled by ``10**6``.
    The backward latency is the forward+backward latency minus the forward
    latency. Nothing is returned; results are printed.
    """
    # A fresh dict per call: a literal `{}` default would be shared between calls.
    kwargs = {} if kwargs is None else kwargs

    print("################################################")
    print(f"#### Torch Timer for {string_id} starts #########")
    print("################################################")

    ref = fn(*args, **kwargs)
    gO = torch.rand_like(ref)

    # Only tensors carry a `.grad`; collect them once so that resetting the
    # gradients inside the timed statement is as cheap as possible.
    grad_targets = [x for x in args if isinstance(x, torch.Tensor)]
    env = {
        "args": args,
        "gO": gO,
        "kwargs": kwargs,
        "fn": fn,
        "grad_targets": grad_targets,
    }
    fn_call = "fn(*args, **kwargs)"

    # Measure end-to-end fwd time
    timer = Timer(stmt=fn_call, globals=env)
    fwd_latency = round(timer.timeit(1000).mean * 10**6, 3)
    print(f"Forward = {fwd_latency}")

    # Measure end-to-end fwd bwd.
    # The gradient reset must be a real multi-line statement: the previous
    # version interpolated a set literal, which evaluated to a no-op
    # expression and let gradients accumulate across every timed iteration.
    fwd_bwd_stmt = (
        "for x in grad_targets:\n"
        "    x.grad = None\n"
        f"fwd = {fn_call}\n"
        "fwd.backward(gO)"
    )
    timer = Timer(stmt=fwd_bwd_stmt, globals=env)
    fwd_bwd_latency = round(timer.timeit(1000).mean * 10**6, 3)

    bwd_latency = round(fwd_bwd_latency - fwd_latency, 3)
    print(f"Backward = {bwd_latency}")

    print("################################################")
    print(f"#### Torch Timer for {string_id} ends ###############")
    print("################################################\n\n\n\n")
def _subprocess_main(seed=0, num_threads=1, sub_label="N/A", result_file=None, env=None):
    """Benchmark ``torch.mm`` across sizes and dtypes and optionally pickle the results.

    The function first checks that the imported ``torch`` lives under
    ``$CONDA_PREFIX`` and raises ``ValueError`` otherwise. For every size ``n``
    it then times a matrix-matrix and a matrix-vector product in single and
    double precision with ``Timer.blocked_autorange``. When ``result_file`` is
    given, the list of measurements is pickled to it.
    """
    import torch
    from torch.utils.benchmark import Timer

    conda_prefix = os.getenv("CONDA_PREFIX")
    assert conda_prefix
    if not torch.__file__.startswith(conda_prefix):
        raise ValueError(
            f"PyTorch mismatch: `import torch` resolved to `{torch.__file__}`, "
            f"which is not in the correct conda env: {conda_prefix}")

    torch.manual_seed(seed)

    # Only the last path component of `env` is used as the measurement's env tag.
    env_tag = os.path.split(env or "")[1] or None
    sizes = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 7, 96, 150, 225]
    precisions = (("Single", torch.float32), ("Double", torch.float64))

    results = []
    for n in sizes:
        problems = (
            # Square MatMul
            ((n, n), (n, n), "(n x n) x (n x n)", "Matrix-Matrix Product"),
            # Matrix-Vector product
            ((n, n), (n, 1), "(n x n) x (n x 1)", "Matrix-Vector Product"),
        )
        for dtype_name, dtype in precisions:
            for x_shape, y_shape, shape_str, blas_type in problems:
                # Operands are drawn in the order x, y to keep the RNG stream stable.
                lhs = torch.rand(x_shape, dtype=dtype)
                rhs = torch.rand(y_shape, dtype=dtype)
                timer = Timer(
                    stmt="torch.mm(x, y)",
                    label=f"torch.mm {shape_str} {blas_type} ({dtype_name})",
                    sub_label=sub_label,
                    description=f"n = {n}",
                    env=env_tag,
                    globals={"x": lhs, "y": rhs},
                    num_threads=num_threads,
                )
                results.append(timer.blocked_autorange(min_run_time=MIN_RUN_TIME))

    if result_file is None:
        return
    with open(result_file, "wb") as f:
        pickle.dump(results, f)
def run(n, stmt, fuzzer_cls):
    """Compare ``stmt`` on float32 vs float64 fuzzed inputs and print a summary.

    Two fuzzers built from ``fuzzer_cls`` with the same seed produce ``n``
    matched inputs, one set in ``torch.float32`` and one in ``torch.float64``.
    Each pair is timed with ``Timer.blocked_autorange``. The results are sorted
    by relative difference of the median times, and the ten smallest and ten
    largest differences are printed with each tensor's shape, ``sparse_dim``
    and ``is_coalesced`` flag.

    Args:
        n: Number of fuzzed inputs to draw per dtype.
        stmt: Statement passed to ``Timer``; it runs with the fuzzed tensors
            as its globals.
        fuzzer_cls: Fuzzer class accepting ``seed`` and ``dtype`` and exposing
            ``take(n)``.
    """
    float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
    double_iter = fuzzer_cls(seed=0, dtype=torch.float64).take(n)
    raw_results = []
    for i, (float_values, double_values) in enumerate(zip(float_iter, double_iter)):
        float_tensors, float_tensor_params, float_params = float_values
        double_tensors, double_tensor_params, double_params = double_values

        # Both fuzzers share a seed, so the generated configurations must match.
        assert_dicts_equal(float_params, double_params)
        assert_dicts_equal(float_tensor_params["x"], double_tensor_params["x"])

        float_measurement, double_measurement = [
            Timer(
                stmt,
                globals=tensors,
            ).blocked_autorange(min_run_time=_MEASURE_TIME)
            for tensors in (float_tensors, double_tensors)
        ]

        descriptions = []
        for name in float_tensors:
            # Exact powers of two (other than 1) are shown as "2 ** k".
            shape_str = "(" + ", ".join([
                f"2 ** {int(np.log2(dim))}"
                if 2 ** int(np.log2(dim)) == dim and dim > 1
                else str(dim)
                for dim in float_tensors[name].shape
            ]) + ")"
            sparse_dim = float_tensor_params[name]["sparse_dim"]
            sparse_dim_str = str(sparse_dim)
            is_coalesced = float_tensor_params[name]["is_coalesced"]
            is_coalesced_str = "True" if is_coalesced else "False"
            descriptions.append((name, shape_str, sparse_dim_str, is_coalesced_str))
        raw_results.append((float_measurement, double_measurement, descriptions))

        print(f"\r{i + 1} / {n}", end="")
    print()

    parsed_results, name_len, shape_len, sparse_dim_len, is_coalesced_len = [], 0, 0, 0, 0
    for float_measurement, double_measurement, descriptions in raw_results:
        t_float = float_measurement.median * 1e6
        t_double = double_measurement.median * 1e6
        # Symmetric relative difference: |a - b| divided by the mean of a and b.
        rel_diff = abs(t_float - t_double) / (t_float + t_double) * 2
        parsed_results.append((t_float, t_double, rel_diff, descriptions))
        for name, shape, sparse_dim, is_coalesced in descriptions:
            name_len = max(name_len, len(name))
            shape_len = max(shape_len, len(shape))
            sparse_dim_len = max(sparse_dim_len, len(sparse_dim))
            is_coalesced_len = max(is_coalesced_len, len(is_coalesced))

    parsed_results.sort(key=lambda x: x[2])

    print(f"stmt: {stmt}")
    print(f" diff faster{'':>17}{' ' * name_len} ", end="")
    print(f"{'shape'.ljust(shape_len)}{'':>12}{'sparse_dim'.ljust(sparse_dim_len)}", end="")
    print(f" is_coalesced\n{'-' * 100}")
    for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
        for t_float, t_double, rel_diff, descriptions in results:
            # The second fuzzer is float64, so the faster side is labelled
            # "double" (the label used to read "int", which was misleading).
            time_str = [f"{rel_diff * 100:>4.1f}% {'double' if t_double < t_float else 'float':<20}"]
            # Only the first row of a multi-tensor entry carries the timing text.
            time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
            for t_str, (name, shape, sparse_dim, is_coalesced) in zip(time_str, descriptions):
                name = f"{name}:".ljust(name_len + 1)
                shape = shape.ljust(shape_len + 10)
                sparse_dim = sparse_dim.ljust(sparse_dim_len)
                print(f"{t_str} {name} {shape}| {sparse_dim} | {is_coalesced}")
        print(spacer)
def time_cuda(fn, inputs, test_runs):
    """Return the median time of ``fn(*inputs)`` in milliseconds.

    The call is measured with ``Timer.blocked_autorange``. ``test_runs`` is
    accepted for interface compatibility but is not used by this function.
    """
    namespace = {"fn": fn, "inputs": inputs}
    measurement = Timer(stmt="fn(*inputs)", globals=namespace).blocked_autorange()
    median_ms = measurement.median * 1000  # time in ms
    return median_ms
# Start from the eager workload; with --use_script it is replaced by a
# version traced on the example input.
workload = parallel_workload
if args.use_script:
    traced_workload = torch.jit.trace(workload, (input_x, ))
    workload = traced_workload

# `payload` is the callable that gets timed. When profiling is enabled, each
# call runs the workload inside an autograd profiler context, so the profiler
# is active during every timed call.
if profiling_enabled:
    def payload():
        x = None
        with torch.autograd.profiler.profile(
                use_cuda=args.with_cuda,
                with_stack=args.with_stack,
                use_kineto=args.use_kineto,
                use_cpu=not args.cuda_only) as prof:
            x = workload(input_x)
        return x
else:
    def payload():
        return workload(input_x)

# Time `payload()` using the stdlib default timer and print the measurement.
t = Timer(
    "payload()",
    globals={
        "payload": payload
    },
    timer=timeit.default_timer,
).blocked_autorange(min_run_time=args.timer_min_run_time)
print(t)
def run(n, stmt, fuzzer_cls):
    """Compare ``stmt`` on float32 vs int32 fuzzed inputs and print a summary.

    Two fuzzers built from ``fuzzer_cls`` with the same seed yield ``n``
    matched inputs. Every pair is timed with ``Timer.blocked_autorange``. The
    entries are ranked by the relative difference of their median times, and
    the ten closest and ten most divergent entries are printed together with
    each tensor's shape, dimension order and steps.
    """

    def _shape_text(shape):
        # Exact powers of two (other than 1) are rendered as "2 ** k".
        parts = []
        for extent in shape:
            exponent = int(np.log2(extent))
            if 2 ** exponent == extent and extent > 1:
                parts.append(f"2 ** {exponent}")
            else:
                parts.append(str(extent))
        return "(" + ", ".join(parts) + ")"

    float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
    int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)

    raw_results = []
    for count, (float_values, int_values) in enumerate(zip(float_iter, int_iter), start=1):
        float_tensors, float_tensor_params, float_params = float_values
        int_tensors, int_tensor_params, int_params = int_values

        # This benchmark assumes that the two fuzzers generate identically
        # sized and strided Tensors, since the same seed is used.
        assert_dicts_equal(float_params, int_params)
        assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])

        measurements = []
        for tensors in (float_tensors, int_tensors):
            timer = Timer(stmt, globals=tensors)
            measurements.append(timer.blocked_autorange(min_run_time=_MEASURE_TIME))
        float_measurement, int_measurement = measurements

        descriptions = []
        for name in float_tensors:
            tensor_params = float_tensor_params[name]
            shape_str = _shape_text(float_tensors[name].shape)

            # The order is shown only when it is not the identity permutation.
            order = tensor_params["order"]
            if all(order == np.arange(len(order))):
                order_str = ""
            else:
                order_str = str(tuple(order))

            # The steps are shown only when at least one step exceeds 1.
            steps = tensor_params["steps"]
            steps_str = str(steps) if sum(steps) > len(steps) else ""

            descriptions.append((name, shape_str, order_str, steps_str))
        raw_results.append((float_measurement, int_measurement, descriptions))

        print(f"\r{count} / {n}", end="")
    print()

    parsed_results = []
    name_len = shape_len = order_len = steps_len = 0
    for float_measurement, int_measurement, descriptions in raw_results:
        t_float = float_measurement.median * 1e6
        t_int = int_measurement.median * 1e6
        rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
        parsed_results.append((t_float, t_int, rel_diff, descriptions))

        # Track column widths for the table printed below.
        for name, shape, order, steps in descriptions:
            name_len = max(name_len, len(name))
            shape_len = max(shape_len, len(shape))
            order_len = max(order_len, len(order))
            steps_len = max(steps_len, len(steps))

    parsed_results.sort(key=lambda entry: entry[2])

    print(f"stmt: {stmt}")
    print(f" diff faster{'':>17}{' ' * name_len} ", end="")
    print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
    print(f" steps\n{'-' * 100}")

    sections = [(parsed_results[:10], "..."), (parsed_results[-10:], "")]
    for results, spacer in sections:
        for t_float, t_int, rel_diff, descriptions in results:
            lead = f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"
            # Rows after the first get a blank prefix of the same width.
            blank = "".ljust(len(lead))
            prefixes = [lead] + [blank for _ in descriptions[:-1]]
            for prefix, (name, shape, order, steps) in zip(prefixes, descriptions):
                name_col = f"{name}:".ljust(name_len + 1)
                shape_col = shape.ljust(shape_len + 10)
                order_col = order.ljust(order_len)
                print(f"{prefix} {name_col} {shape_col}| {order_col} | {steps}")
        print(spacer)
# transforms can give us different interesting quantities. # # functorch provides ``jacrev`` as a convenience function that performs # the vmap-vjp composition to compute jacobians. ``jacrev`` accepts an argnums # argument that says which argument we would like to compute Jacobians with # respect to. from functorch import jacrev ft_jacobian = jacrev(predict, argnums=2)(weight, bias, x) assert torch.allclose(ft_jacobian, jacobian) # Let's compare the performance of the two ways to compute jacobian. # The functorch version is much faster (and becomes even faster the more outputs # there are). In general, we expect that vectorization via ``vmap`` can help # eliminate overhead and give better utilization of your hardware. from torch.utils.benchmark import Timer without_vmap = Timer(stmt="compute_jac(xp)", globals=globals()) with_vmap = Timer(stmt="jacrev(predict, argnums=2)(weight, bias, x)", globals=globals()) print(without_vmap.timeit(500)) print(with_vmap.timeit(500)) # It's pretty easy to flip the problem around and say we want to compute # Jacobians of the parameters to our model (weight, bias) instead of the input. ft_jac_weight, ft_jac_bias = jacrev(predict, argnums=(0, 1))(weight, bias, x) ###################################################################### # reverse-mode Jacobian (jacrev) vs forward-mode Jacobian (jacfwd) # -------------------------------------------------------------------- # We offer two APIs to compute jacobians: jacrev and jacfwd: # - jacrev uses reverse-mode AD. As you saw above it is a composition of our # vjp and vmap transforms.
# A `Timer` serves as a task definition. # from torch.utils.benchmark import Timer timer = Timer( # The computation which will be run in a loop and timed. stmt="x * y", # `setup` will be run before calling the measurement loop, and is used to # populate any state which is needed by `stmt` setup=""" x = torch.ones((128,)) y = torch.ones((128,)) """, # Alternately, `globals` can be used to pass variables from the outer scope. # ------------------------------------------------------------------------- # globals={ # "x": torch.ones((128,)), # "y": torch.ones((128,)), # }, # Control the number of threads that PyTorch uses. (Default: 1) num_threads=1, ) ############################################################################### # 2. Wall time: `Timer.blocked_autorange(...)` # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
def main():
    """Command-line benchmark of an e3nn tensor product using a torch ``Timer``.

    Parses the command-line options, builds either an
    ``ElementwiseTensorProduct`` or a ``FullyConnectedTensorProduct`` from the
    requested irreps, optionally compiles it, and times ``-n`` forward (and,
    unless disabled, backward) passes on pre-generated random inputs. The
    settings, the tensor product, its instructions and the final measurement
    are printed.
    """
    parser = argparse.ArgumentParser(prog="tensor_product_benchmark")
    parser.add_argument("--jit", type=t_or_f, default=True)
    parser.add_argument("--irreps", type=str, default="8x0e + 8x1e + 8x2e + 8x3o")
    parser.add_argument("--irreps-in1", type=str, default=None)
    parser.add_argument("--irreps-in2", type=str, default=None)
    parser.add_argument("--irreps-out", type=str, default=None)
    parser.add_argument("--cuda", type=t_or_f, default=True)
    parser.add_argument("--backward", type=t_or_f, default=True)
    parser.add_argument("--opt-ein", type=t_or_f, default=True)
    parser.add_argument("--specialized-code", type=t_or_f, default=True)
    parser.add_argument("--elementwise", action='store_true')
    parser.add_argument("-n", type=int, default=1000)
    parser.add_argument("--batch", type=int, default=10)

    args = parser.parse_args()

    # Fall back to CPU when CUDA is unavailable, and record the effective choice.
    device = 'cuda' if (torch.cuda.is_available() and args.cuda) else 'cpu'
    args.cuda = device == 'cuda'

    print("======= Benchmark with settings: ======")
    for key, val in vars(args).items():
        print(f"{key:>18} : {val}")
    print("=" * 40)

    # Per-argument irreps override the shared --irreps default.
    irreps_in1 = Irreps(args.irreps_in1 if args.irreps_in1 else args.irreps)
    irreps_in2 = Irreps(args.irreps_in2 if args.irreps_in2 else args.irreps)
    irreps_out = Irreps(args.irreps_out if args.irreps_out else args.irreps)

    if args.elementwise:
        tp = ElementwiseTensorProduct(
            irreps_in1,
            irreps_in2,
            _specialized_code=args.specialized_code,
            _optimize_einsums=args.opt_ein)
        if args.backward:
            print(
                "Elementwise TP has no weights, cannot backward. Setting --backward False."
            )
            args.backward = False
    else:
        tp = FullyConnectedTensorProduct(
            irreps_in1,
            irreps_in2,
            irreps_out,
            _specialized_code=args.specialized_code,
            _optimize_einsums=args.opt_ein)
    tp = tp.to(device=device)
    assert len(tp.instructions) > 0, "Bad irreps, no instructions"
    print(f"Tensor product: {tp}")
    print("Instructions:")
    for ins in tp.instructions:
        print(f" {ins}")

    # `Timer.timeit(n)` runs a warm-up of about n // 100 iterations before the
    # timed loop; see
    # https://pytorch.org/docs/master/_modules/torch/utils/benchmark/utils/timer.html#Timer.timeit
    # The minimum warm-up is at least 1 and appears to be 2 in newer PyTorch
    # releases (confirm against the installed version). Provisioning for 2
    # keeps `next(inputs)` from raising StopIteration for small `-n`; a
    # surplus input is simply never consumed.
    warmup = max(args.n // 100, 2)
    inputs = iter([(irreps_in1.randn(args.batch, -1).to(device=device),
                    irreps_in2.randn(args.batch, -1).to(device=device))
                   for _ in range(args.n + warmup)])

    # compile
    if args.jit:
        tp = compile(tp)

    print("starting...")

    # tanh() forces it to realize the grad as a full size matrix rather than expanded (stride 0) ones
    t = Timer(
        stmt=("tp.zero_grad()\n"
              "out = tp(*next(inputs))\n" +
              ("out.tanh().sum().backward()\n" if args.backward else '')),
        globals={
            'tp': tp,
            'inputs': inputs
        })

    perloop = t.timeit(args.n)

    print()
    print(perloop)