def compare_optimize_resnet18_to_torchscript():
    results = []
    for i in range(20):
        test_input = torch.rand(1, 3, 224, 224).half().cuda()
        sub_label = f"[test {i}]"
        results.append(
            benchmark.Timer(
                stmt="meta_module_resnet18(test_input)",
                setup="from __main__ import meta_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by meta",
            ).blocked_autorange()
        )
        results.append(
            benchmark.Timer(
                stmt="jit_module_resnet18(test_input)",
                setup="from __main__ import jit_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by jit",
            ).blocked_autorange()
        )

    compare = benchmark.Compare(results)
    compare.print()
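# The two callables benchmarked above (`meta_module_resnet18`, `jit_module_resnet18`)
# are built elsewhere. A minimal sketch of how the TorchScript baseline could be
# produced, assuming a torchvision resnet18 run in half precision on CUDA; the
# meta-tuned module comes from a separate tuning pipeline and is not shown here.
import torch
import torchvision

resnet18 = torchvision.models.resnet18().half().cuda().eval()
example_input = torch.rand(1, 3, 224, 224).half().cuda()
with torch.no_grad():
    jit_module_resnet18 = torch.jit.trace(resnet18, example_input)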
def prof(dtype, op, nl, hidden_size_max):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('s', minval=1000, maxval=6000, distribution='uniform'),            # seq_length
            benchmark.FuzzedParameter('b', minval=1, maxval=64, distribution='uniform'),                 # batch_size
            benchmark.FuzzedParameter('i', minval=16, maxval=512, distribution='uniform'),               # input_size
            benchmark.FuzzedParameter('h', minval=16, maxval=hidden_size_max, distribution='uniform'),   # hidden_size
            benchmark.FuzzedParameter('n', minval=1, maxval=4, distribution='uniform'),                  # num_layers
        ],
        tensors=[
            benchmark.FuzzedTensor('x', size='sbi', min_elements=12, max_elements=10000000,
                                   cuda=True, dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42,
        constraints=[
            lambda params: params['i'] % 8 == 0,
            lambda params: params['h'] % 8 == 0,
        ])

    res = []
    for tensors, tensor_params, params in fuzzer.take(20):
        s = params['s']
        b = params['b']
        i = params['i']
        h = params['h']
        n = params['n']
        sub_label = f'x=({s}, {b}, {i}),'.ljust(20) + f'op=({i}, {h}, {n})'
        # sub_label = str(tensors['x'].size())
        if nl is None:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n})'
        else:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n}, nonlinearity="{nl}")'
        setup += f'.to(device="cuda", dtype={d_dtype[dtype]})'
        res.append(
            benchmark.Timer(stmt='rnn(x)',
                            setup=setup,
                            globals=tensors,
                            label=f"{op=}, nonlinearity='{nl}', {dtype=}",
                            sub_label=sub_label,
                            description=f'{torch.__version__}')
            .blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
    with open(f'{torch_git_ver}-{op}-{nl}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
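# Assumption: the `prof()` functions here index `d_dtype` with a string dtype name
# that is also used in the output file names. A minimal mapping consistent with
# that usage (hypothetical; the original definition is not shown):
d_dtype = {
    'float': torch.float32,
    'half': torch.half,
}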
def prof(dtype, op):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('n', minval=4, maxval=16, distribution='uniform'),
            benchmark.FuzzedParameter('c', minval=4, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('h', minval=8, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('w', minval=8, maxval=256, distribution='uniform'),
        ],
        tensors=[
            benchmark.FuzzedTensor('x', size='nchw', min_elements=12, max_elements=10000000,
                                   cuda=True, dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42)

    res = []
    for kernel_size in [2, 3, 5]:
        for tensors, tensor_params, params in fuzzer.take(20):
            sub_label = str(tensors['x'].size())
            res.append(
                benchmark.Timer(
                    stmt=f'torch.nn.functional.{op}(x, {kernel_size})',
                    setup='',
                    globals=tensors,
                    label=f'{op}, {dtype=}, {kernel_size=}',
                    sub_label=sub_label,
                    description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
    with open(f'{torch_git_ver}-{op}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
def main(): tasks = [ ("add", "add", "torch.add(x, y)"), ("add", "add (extra +0)", "torch.add(x, y + zero)"), ] serialized_results = [] repeats = 2 timers = [ benchmark_utils.Timer( stmt=stmt, globals={ "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns), "x": torch.ones((size, 4)), "y": torch.ones((1, 4)), "zero": torch.zeros(()), }, label=label, sub_label=sub_label, description=f"size: {size}", env=branch, num_threads=num_threads, ) for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)] for label, sub_label, stmt in tasks for size in [1, 10, 100, 1000, 10000, 50000] for num_threads in [1, 4] ] for i, timer in enumerate(timers * repeats): serialized_results.append( pickle.dumps(timer.blocked_autorange(min_run_time=0.05))) print(f"\r{i + 1} / {len(timers) * repeats}", end="") sys.stdout.flush() print() comparison = benchmark_utils.Compare( [pickle.loads(i) for i in serialized_results]) print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n") comparison.print() print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n") comparison.trim_significant_figures() comparison.colorize() comparison.print()
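# `FauxTorch` is defined elsewhere in this example. A minimal sketch, assuming it
# only needs to wrap the real torch module and inject artificial per-element
# overhead so the "my_branch" / "severe_regression" environments look slower:
import time

class FauxTorch:
    """Emulates a slower torch build by adding per-element overhead (sketch)."""

    def __init__(self, real_torch, extra_ns_per_element):
        self._torch = real_torch
        self._extra_ns_per_element = extra_ns_per_element

    def _overhead(self, result):
        # time.sleep itself costs tens of microseconds, so only fake overhead
        # for reasonably large results.
        numel = int(result.numel())
        if numel > 5000:
            time.sleep(numel * self._extra_ns_per_element * 1e-9)
        return result

    def add(self, *args, **kwargs):
        return self._overhead(self._torch.add(*args, **kwargs))

    def __getattr__(self, name):
        # Fall through to the real module for everything else.
        return getattr(self._torch, name)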
def main():
    tasks = [
        ("matmul", "x @ y", "torch.sparse.mm(x, y)"),
        ("matmul", "x @ y + 0", "torch.sparse.mm(x, y) + zero"),
    ]

    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
                "x": gen_sparse(size=size, density=density, dtype=torch.float32),
                "y": torch.rand(size, dtype=torch.float32),
                "zero": torch.zeros(()),
            },
            label=label,
            sub_label=sub_label,
            description=f"size: {size}",
            env=branch,
            num_threads=num_threads,
        )
        for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 10)]
        for label, sub_label, stmt in tasks
        for density in [0.05, 0.1]
        for size in [(8, 8), (32, 32), (64, 64), (128, 128)]
        for num_threads in [1, 4]
    ]

    for i, timer in enumerate(timers * repeats):
        serialized_results.append(pickle.dumps(
            timer.blocked_autorange(min_run_time=0.05)
        ))
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare([
        pickle.loads(i) for i in serialized_results
    ])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()
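# `gen_sparse` is not shown in this snippet. A minimal sketch, assuming it builds a
# random sparse COO tensor of the requested shape with roughly `density` of its
# entries populated:
def gen_sparse(size, density, dtype):
    dense = torch.rand(size, dtype=dtype)
    dense[torch.rand(size) >= density] = 0  # zero out ~(1 - density) of the entries
    return dense.to_sparse()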
def run_bench(model_names, bench_args):
    results = []
    for model_name in model_names:
        model_creator = MODELS[model_name]
        inputs, model = model_creator(bench_args)

        print("Benchmarking RecordFunction overhead for", model_name)
        print("Running warmup...", end=" ")
        sys.stdout.flush()
        for _ in range(bench_args.warmup):
            model(*inputs)
        print("finished")

        for num_threads in NUM_THREADS:
            for with_rec_fn in [True, False]:
                torch.autograd._enable_record_function(with_rec_fn)
                torch.autograd._clear_callbacks()
                if with_rec_fn:
                    torch.autograd._set_empty_test_observer(True, 0.0001)

                print("Running {} RecordFunction, num threads {} ...".format(
                    "with" if with_rec_fn else "without", num_threads), end=" ")
                sys.stdout.flush()

                timer = benchmark_utils.Timer(
                    stmt="model(*inputs)",
                    globals={"model": model, "inputs": inputs},
                    description=model_name,
                    label="Record function overhead",
                    sub_label=f"with{'' if with_rec_fn else 'out'}_rec_fn, num_threads {num_threads}",
                    num_threads=num_threads)
                result = timer.blocked_autorange(
                    min_run_time=bench_args.timer_min_run_time)
                print("finished")
                print(result)
                sys.stdout.flush()
                results.append(result)

    comparison = benchmark_utils.Compare(results)
    comparison.trim_significant_figures()
    comparison.highlight_warnings()
    comparison.print()
def run_lobpcg_comparison(label, generator, generator_settings, k=5, largest=True, tol=1e-5):
    label = '{} {} (k={}, largest={})'.format(args.format.upper(), label, k, largest)
    results = []

    for kwargs in generator_settings:
        # generate input matrix
        a_pt, a_sp = generator(**kwargs)

        # use same initial eigenvectors for both scipy and pytorch
        x_pt = torch.randn(a_pt.size(0), k)
        x_sp = x_pt.numpy()

        description = '{:.4e}'.format(a_pt.size(0))

        t1 = benchmark.Timer(
            stmt="torch.lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="import torch",
            globals=dict(a=a_pt, x=x_pt, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='torch_lobpcg',
            description=description,
        )
        t2 = benchmark.Timer(
            stmt="lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="from scipy.sparse.linalg import lobpcg",
            globals=dict(a=a_sp, x=x_sp, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='scipy_lobpcg',
            description=description,
        )
        results.append(t1.blocked_autorange(min_run_time=1.))
        results.append(t2.blocked_autorange(min_run_time=1.))

    compare = benchmark.Compare(results)
    compare.print()
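# Assumption: each `generator` returns the same operator twice, once as a torch
# tensor for torch.lobpcg and once as a scipy/numpy object for scipy's lobpcg.
# A minimal dense example (hypothetical; the original script builds operators
# according to `args.format`):
def dense_spd_generator(n):
    a = torch.randn(n, n)
    a = a @ a.t() + n * torch.eye(n)  # symmetric positive definite
    return a, a.numpy()

# run_lobpcg_comparison('dense random', dense_spd_generator, [dict(n=256), dict(n=1024)])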
def benchMark(sizes):
    results = []
    if len(sizes) == 0:
        print("Parameter 'sizes' has to have a minimum of 1 element")
        return

    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={'x': x},
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={'x': x},
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())

    compare = benchmark.Compare(results)
    compare.print()
    return compare
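# The two helpers imported from __main__ above are not shown in this snippet.
# Definitions matching the PyTorch benchmarking tutorial (the snippet presumably
# relies on these or something equivalent):
def batched_dot_mul_sum(a, b):
    """Computes batched dot by multiplying and summing."""
    return a.mul(b).sum(-1)

def batched_dot_bmm(a, b):
    """Computes batched dot by reducing to bmm."""
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten()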
                min_elements=12,
                max_elements=10000000,
                cuda=True,
                dtype=torch.half,
                max_allocation_bytes=1_000_000_000)
    ],
    seed=42)

res = []
for kernel_size in [2, 3, 5]:
    for tensors, tensor_params, params in fuzzer.take(20):
        sub_label = str(tensors['x'].size())
        res.append(
            benchmark.Timer(stmt=f'torch.nn.functional.max_pool3d(x, {kernel_size})',
                            setup='',
                            globals=tensors,
                            label=f'max_pool3d, {kernel_size=}',
                            sub_label=sub_label,
                            description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

torch_ver = str(torch.__version__)
torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
with open(f'{torch_git_ver}.pkl', 'wb') as f:
    pickle.dump(res, f)

compare = benchmark.Compare(res)
# compare.colorize()
compare.print()
def benchmark_multihead_attention(
    label="",
    attn_dtype=torch.uint8,
    key_padding_dtype=torch.uint8,
    add_bias_kv=False,
    add_zero_attn=False,
    static_kv=False,
    batch_size=20,
    embedding=EMB,
    seq_len=SEQ,
    num_heads=HEADS,
):
    results = []
    # device = torch.device("cuda")
    xformers_att_config = '{"name": "scaled_dot_product"}'

    attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len)
    key_padding_mask = _get_mask(to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len)

    q = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    k = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    v = torch.rand(seq_len, batch_size, embedding, requires_grad=True)

    _reset_seeds()

    original_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=None,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    xformers_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=xformers_att_config,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    fns = [
        original_bench_fw,
        xformers_bench_fw,
        original_bench_fw_bw,
        xformers_bench_fw_bw,
    ]

    for fn in fns:
        results.append(
            benchmark.Timer(
                stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)",
                globals={
                    "q": q,
                    "k": k,
                    "v": v,
                    "key_padding_mask": key_padding_mask,
                    "attn_mask": attn_mask,
                    "static_kv": static_kv,
                    "fn": fn,
                },
                label="multihead fw + bw",
                sub_label=f"{fn.__name__}",
                description=label,
            ).blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()
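# `_get_mask` and `_reset_seeds` are helpers defined elsewhere in the benchmark
# script. A minimal sketch of what they might do (hypothetical, not the original
# helpers):
def _get_mask(to_dtype, dim0, dim1):
    # Random 0/1 mask of shape (dim0, dim1), cast to the requested dtype.
    return torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype)

def _reset_seeds():
    torch.manual_seed(0)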
            description='mul/sum',
        ).blocked_autorange(min_run_time=1))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={'x': x},
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange(min_run_time=1))

compare = benchmark.Compare(results)
compare.print()

######################################################################
# .. code-block:: none
#    :caption: Output
#
#     [--------------- Batched dot ----------------]
#                       |  mul/sum  |   bmm
#     1 threads: -----------------------------------
#       [1, 1]          |     5.9   |   11.2
#       [1, 64]         |     6.4   |   11.4
#       [1, 1024]       |     6.7   |   14.2
#       [1, 10000]      |    10.2   |   23.7
#       [64, 1]         |     6.3   |   11.5
#       [64, 64]        |     8.6   |   15.4
if __name__ == '__main__':
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
    parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--samples', type=int, default=10)
    parser.add_argument('--probability_regular', type=float, default=1.0)
    parser.add_argument('-o', '--output', type=str)
    args = parser.parse_args()

    num_benchmarks = len(args.device) * len(args.bench)
    i = 0
    results = []
    for device in args.device:
        for bench in (BENCHMARK_MAP[b] for b in args.bench):
            results += run_benchmark(
                name=bench.name, function=bench.function, dtype=bench.dtype,
                seed=args.seed, device=device, samples=args.samples,
                probability_regular=args.probability_regular)
            i += 1
            print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')

    if args.output is not None:
        with open(args.output, 'w') as f:
            _output_csv(f, results)

    compare = benchmark.Compare(results)
    compare.trim_significant_figures()
    compare.colorize()
    compare.print()
    ).blocked_autorange(min_run_time=1))

all_res.append(
    benchmark.Timer(
        stmt='conv(inp)',
        globals={'inp': inp2, 'conv': eca},
        num_threads=num_threads,
        label=label2,
        sub_label="ECA",
        description='description',
    ).blocked_autorange(min_run_time=1))

all_res.append(
    benchmark.Timer(
        stmt='conv(inp)',
        globals={'inp': inp2, 'conv': eca9},
        num_threads=num_threads,
        label=label2,
        sub_label="ECA9",
        description='description',
    ).blocked_autorange(min_run_time=1))

## divide speed by batch size
all_res = [adjust_for_bs(i) for i in all_res]

compare = benchmark.Compare(all_res)
compare.print()
params = get_params_str(conv2_sep)
t22 = benchmark.Timer(
    stmt='conv_sep(inp)',
    globals={'inp': inp2, 'conv_sep': conv2_sep},
    num_threads=num_threads,
    label=label2,
    sub_label=f'Conv Sep. Params: {get_params_str(conv2_sep)}',
    description='description',
).blocked_autorange(min_run_time=1)

## divide speed by batch size
t0 = adjust_for_bs(t0)
t1 = adjust_for_bs(t1)
t2 = adjust_for_bs(t2)
t20 = adjust_for_bs(t20)
t21 = adjust_for_bs(t21)
t22 = adjust_for_bs(t22)

compare = benchmark.Compare([t0, t1, t2, t20, t21, t22])
compare.print()

# print(t0)
# print(dir(t0))
# print(t0.median)
# print(t1)
# print(t2)
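# `adjust_for_bs` is not defined in these snippets. A hypothetical sketch of the
# "divide speed by batch size" step: rebuild each Measurement so the reported time
# is per sample rather than per batch. `BATCH_SIZE` is an assumed name and value.
BATCH_SIZE = 64

def adjust_for_bs(m, batch_size=BATCH_SIZE):
    return benchmark.Measurement(
        number_per_run=m.number_per_run * batch_size,  # scales the mean time down by batch_size
        raw_times=m.raw_times,
        task_spec=m.task_spec,
        metadata=m.metadata,
    )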
def test_compare(self):
    # Simulate several approaches.
    costs = (
        # overhead_optimized_fn()
        (1e-6, 1e-9),

        # compute_optimized_fn()
        (3e-6, 5e-10),

        # special_case_fn()  [square inputs only]
        (1e-6, 4e-10),
    )

    sizes = (
        (16, 16),
        (16, 128),
        (128, 128),
        (4096, 1024),
        (2048, 2048),
    )

    # overhead_optimized_fn()
    class _MockTimer_0(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j)
            for i, j in sizes)

    class MockTimer_0(benchmark_utils.Timer):
        _timer_cls = _MockTimer_0

    # compute_optimized_fn()
    class _MockTimer_1(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j)
            for i, j in sizes)

    class MockTimer_1(benchmark_utils.Timer):
        _timer_cls = _MockTimer_1

    # special_case_fn()
    class _MockTimer_2(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j)
            for i, j in sizes if i == j)

    class MockTimer_2(benchmark_utils.Timer):
        _timer_cls = _MockTimer_2

    results = []
    for i, j in sizes:
        results.append(
            MockTimer_0(
                f"fn({i}, {j})",
                label="fn",
                description=f"({i}, {j})",
                sub_label="overhead_optimized",
            ).blocked_autorange(min_run_time=10))

        results.append(
            MockTimer_1(
                f"fn({i}, {j})",
                label="fn",
                description=f"({i}, {j})",
                sub_label="compute_optimized",
            ).blocked_autorange(min_run_time=10))

        if i == j:
            results.append(
                MockTimer_2(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="special_case (square)",
                ).blocked_autorange(min_run_time=10))

    def check_output(output: str, expected: str):
        # VSCode will strip trailing newlines from `expected`, so we have to match
        # this behavior when comparing output.
        output_str = "\n".join(
            i.rstrip() for i in output.strip().splitlines(keepends=False))
        self.assertEqual(output_str, textwrap.dedent(expected).strip())

    compare = benchmark_utils.Compare(results)

    check_output(
        str(compare),
        """
        [------------------------------------------------- fn ------------------------------------------------]
                                   |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
        1 threads: --------------------------------------------------------------------------------------------
              overhead_optimized     |    1.3     |     3.0     |     17.4     |     4174.4     |     4174.4
              compute_optimized      |    3.1     |     4.0     |     11.2     |     2099.3     |     2099.3
              special_case (square)  |    1.1     |             |      7.5     |                |     1674.7

        Times are in microseconds (us).""")

    compare.trim_significant_figures()
    check_output(
        str(compare),
        """
        [------------------------------------------------- fn ------------------------------------------------]
                                   |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
        1 threads: --------------------------------------------------------------------------------------------
              overhead_optimized     |     1      |     3.0     |      17      |      4200      |      4200
              compute_optimized      |     3      |     4.0     |      11      |      2100      |      2100
              special_case (square)  |     1      |             |       8      |                |      1700

        Times are in microseconds (us).""")

    compare.colorize()
    check_output(
        str(compare),
        """
        [------------------------------------------------- fn ------------------------------------------------]
                                   |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
        1 threads: --------------------------------------------------------------------------------------------
              overhead_optimized     |  1  |  \x1b[92m\x1b[1m 3.0 \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m 17 \x1b[0m\x1b[0m  |  4200  |  \x1b[2m\x1b[91m 4200 \x1b[0m\x1b[0m
              compute_optimized      |  \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m  |  4.0  |  11  |  \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m  |  2100
              special_case (square)  |  \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m  |   |  \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m  |   |  \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m

        Times are in microseconds (us)."""  # noqa
    )
def test_compare(self):
    # Simulate several approaches.
    costs = (
        # overhead_optimized_fn()
        (1e-6, 1e-9),

        # compute_optimized_fn()
        (3e-6, 5e-10),

        # special_case_fn()  [square inputs only]
        (1e-6, 4e-10),
    )

    sizes = (
        (16, 16),
        (16, 128),
        (128, 128),
        (4096, 1024),
        (2048, 2048),
    )

    # overhead_optimized_fn()
    class _MockTimer_0(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j)
            for i, j in sizes)

    class MockTimer_0(benchmark_utils.Timer):
        _timer_cls = _MockTimer_0

    # compute_optimized_fn()
    class _MockTimer_1(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j)
            for i, j in sizes)

    class MockTimer_1(benchmark_utils.Timer):
        _timer_cls = _MockTimer_1

    # special_case_fn()
    class _MockTimer_2(self._MockTimer):
        _function_costs = tuple(
            (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j)
            for i, j in sizes if i == j)

    class MockTimer_2(benchmark_utils.Timer):
        _timer_cls = _MockTimer_2

    results = []
    for i, j in sizes:
        results.append(
            MockTimer_0(
                f"fn({i}, {j})",
                label="fn",
                description=f"({i}, {j})",
                sub_label="overhead_optimized",
            ).blocked_autorange(min_run_time=10))

        results.append(
            MockTimer_1(
                f"fn({i}, {j})",
                label="fn",
                description=f"({i}, {j})",
                sub_label="compute_optimized",
            ).blocked_autorange(min_run_time=10))

        if i == j:
            results.append(
                MockTimer_2(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="special_case (square)",
                ).blocked_autorange(min_run_time=10))

    def rstrip_lines(s: str) -> str:
        # VSCode will rstrip the `expected` string literal whether you like
        # it or not. So we have to rstrip the compare table as well.
        return "\n".join(
            [i.rstrip() for i in s.splitlines(keepends=False)])

    compare = benchmark_utils.Compare(results)

    self.regularizeAndAssertExpectedInline(
        rstrip_lines(str(compare).strip()),
        """\
[------------------------------------------------- fn ------------------------------------------------]
                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
1 threads: --------------------------------------------------------------------------------------------
      overhead_optimized     |    1.3     |     3.0     |     17.4     |     4174.4     |     4174.4
      compute_optimized      |    3.1     |     4.0     |     11.2     |     2099.3     |     2099.3
      special_case (square)  |    1.1     |             |      7.5     |                |     1674.7

Times are in microseconds (us).""")

    compare.trim_significant_figures()
    self.regularizeAndAssertExpectedInline(
        rstrip_lines(str(compare).strip()),
        """\
[------------------------------------------------- fn ------------------------------------------------]
                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
1 threads: --------------------------------------------------------------------------------------------
      overhead_optimized     |     1      |     3.0     |      17      |      4200      |      4200
      compute_optimized      |     3      |     4.0     |      11      |      2100      |      2100
      special_case (square)  |     1      |             |       8      |                |      1700

Times are in microseconds (us).""")

    compare.colorize()
    columnwise_colored_actual = rstrip_lines(str(compare).strip())
    columnwise_colored_expected = textwrap.dedent("""\
[------------------------------------------------- fn ------------------------------------------------]
                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
1 threads: --------------------------------------------------------------------------------------------
      overhead_optimized     |  1  |  \x1b[92m\x1b[1m 3.0 \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m 17 \x1b[0m\x1b[0m  |  4200  |  \x1b[2m\x1b[91m 4200 \x1b[0m\x1b[0m
      compute_optimized      |  \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m  |  4.0  |  11  |  \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m  |  2100
      special_case (square)  |  \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m  |   |  \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m  |   |  \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m

Times are in microseconds (us).""")  # noqa

    compare.colorize(rowwise=True)
    rowwise_colored_actual = rstrip_lines(str(compare).strip())
    rowwise_colored_expected = textwrap.dedent("""\
[------------------------------------------------- fn ------------------------------------------------]
                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
1 threads: --------------------------------------------------------------------------------------------
      overhead_optimized     |  \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m 3.0 \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m 17 \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m 4200 \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m 4200 \x1b[0m\x1b[0m
      compute_optimized      |  \x1b[92m\x1b[1m 3 \x1b[0m\x1b[0m  |  4.0  |  \x1b[2m\x1b[91m 11 \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m
      special_case (square)  |  \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m  |   |  \x1b[31m\x1b[1m 8 \x1b[0m\x1b[0m  |   |  \x1b[31m\x1b[1m 1700 \x1b[0m\x1b[0m

Times are in microseconds (us).""")  # noqa

    def print_new_expected(s: str) -> None:
        print(f'{"":>12}"""\\', end="")
        for l in s.splitlines(keepends=False):
            print("\n" + textwrap.indent(repr(l)[1:-1], " " * 12), end="")
        print('"""\n')

    if expecttest.ACCEPT:
        # expecttest does not currently support non-printable characters,
        # so these two entries have to be updated manually.
        if columnwise_colored_actual != columnwise_colored_expected:
            print("New columnwise coloring:\n")
            print_new_expected(columnwise_colored_actual)

        if rowwise_colored_actual != rowwise_colored_expected:
            print("New rowwise coloring:\n")
            print_new_expected(rowwise_colored_actual)

    self.assertEqual(columnwise_colored_actual, columnwise_colored_expected)
    self.assertEqual(rowwise_colored_actual, rowwise_colored_expected)
    for x, y in load_dataset(dataset_path, hidden_size, sparsity)
]

measurements = []
for i, timer in enumerate(timers * repeats):
    m = timer.blocked_autorange(min_run_time=0.05)
    serialized_results.append(pickle.dumps(m))
    m.metadata = {
        "device": 'cuda' if m.task_spec.env.find("cuda") >= 0 else 'cpu'
    }
    measurements.append(m)

    print(f"\r{i + 1} / {len(timers) * repeats}", end="")
    sys.stdout.flush()
print()

comparison = benchmark_utils.Compare(
    [pickle.loads(i) for i in serialized_results])

print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
comparison.print()

print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()

table = [
    (m.task_spec.sub_label, m.task_spec.description, m.metadata["device"], m.mean)
    for m in measurements
]

df = pd.DataFrame(table, columns=['method', 'sparsity', 'device', 'time'])
df.to_pickle(df_output_path)
label1 = f"Stem conv. Shape: {inp.shape}" t0 = benchmark.Timer( stmt='conv(inp)', globals={ 'inp': inp, 'conv': conv_pw }, num_threads=num_threads, label="PW Stem convs", # sub_label=f'Reg Conv. Params: {get_params_str(conv)}', description='description', ).blocked_autorange(min_run_time=1) t1 = benchmark.Timer( stmt='conv(inp)', globals={ 'inp': inp2, 'conv': conv_pw2 }, num_threads=num_threads, label="PW deeper convs", # sub_label=f'Conv DW. Params: {get_params_str(conv_dw)}', description='description', ).blocked_autorange(min_run_time=1) ## divide speed by batch size t0 = adjust_for_bs(t0) t1 = adjust_for_bs(t1) compare = benchmark.Compare([t0, t1]) compare.print()