Example #1
def compare_optimize_resnet18_to_torchscript():
    results = []
    for i in range(20):
        test_input = torch.rand(1, 3, 224, 224).half().cuda()
        sub_label = f"[test {i}]"
        results.append(
            benchmark.Timer(
                stmt="meta_module_resnet18(test_input)",
                setup="from __main__ import meta_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by meta",
            ).blocked_autorange()
        )
        results.append(
            benchmark.Timer(
                stmt="jit_module_resnet18(test_input)",
                setup="from __main__ import jit_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by jit",
            ).blocked_autorange()
        )
    compare = benchmark.Compare(results)
    compare.print()
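The helper above assumes that meta_module_resnet18 and jit_module_resnet18 already exist at module scope. A minimal sketch of how the TorchScript side could be built from torchvision's resnet18 (the tracing and optimize_for_inference choices are assumptions, not taken from the original project; meta_module_resnet18 is whatever tuned module that project produces):

# Hypothetical setup for the comparison above: an fp16 resnet18 traced to
# TorchScript on the GPU. How meta_module_resnet18 is produced is
# project-specific and not shown here.
import torch
from torchvision.models import resnet18

example_input = torch.rand(1, 3, 224, 224).half().cuda()
model = resnet18().half().cuda().eval()
with torch.no_grad():
    jit_module_resnet18 = torch.jit.trace(model, example_input)
    jit_module_resnet18 = torch.jit.optimize_for_inference(jit_module_resnet18)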
Example #2
def prof(dtype, op, nl, hidden_size_max):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('s', minval=1000, maxval=6000, distribution='uniform'),    # seq_length
            benchmark.FuzzedParameter('b', minval=1, maxval=64, distribution='uniform'),   # batch_size
            benchmark.FuzzedParameter('i', minval=16, maxval=512, distribution='uniform'),   # input_size
            benchmark.FuzzedParameter('h', minval=16, maxval=hidden_size_max, distribution='uniform'),   # hidden_size
            benchmark.FuzzedParameter('n', minval=1, maxval=4, distribution='uniform'),   # num_layer
        ],
        tensors=[
            benchmark.FuzzedTensor('x',
                                   size='sbi',
                                   min_elements=12,
                                   max_elements=10000000,
                                   cuda=True,
                                   dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42,
        constraints=[
            lambda params: params['i'] % 8 == 0,
            lambda params: params['h'] % 8 == 0
        ])

    res = []

    for tensors, tensor_params, params in fuzzer.take(20):
        s = params['s']
        b = params['b']
        i = params['i']
        h = params['h']
        n = params['n']
        sub_label = f'x=({s}, {b}, {i}),'.ljust(20) + f'op=({i}, {h}, {n})'
        # sub_label = str(tensors['x'].size())

        if nl is None:
            setup=f'rnn=torch.nn.{op}({i}, {h}, {n})'
        else:
            setup=f'rnn=torch.nn.{op}({i}, {h}, {n}, nonlinearity="{nl}")'
        setup += f'.to(device="cuda", dtype={d_dtype[dtype]})'

        res.append(
            benchmark.Timer(stmt='rnn(x)',
                            setup=setup,
                            globals=tensors,
                            label=f"{op=}, nonlinearity='{nl}', {dtype=}",
                            sub_label=sub_label,
                            description=f'{torch.__version__}')
                        .blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

    with open(f'{torch_git_ver}-{op}-{nl}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
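Because every Timer above sets description to torch.__version__ and the Measurement list is pickled per build, results from different PyTorch builds can later be merged into one table. A short sketch, with placeholder file names:

# Merge pickled results from two builds and print a single comparison table.
import pickle
import torch.utils.benchmark as benchmark

merged = []
for path in ['buildA-LSTM-None-half.pkl', 'buildB-LSTM-None-half.pkl']:  # placeholders
    with open(path, 'rb') as f:
        merged.extend(pickle.load(f))

compare = benchmark.Compare(merged)
compare.trim_significant_figures()
compare.print()

Since the description column carries the torch version string, each build shows up as its own column in the combined output.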
Example #3
File: b.py  Project: xwang233/code-snippet
def prof(dtype, op):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('n', minval=4, maxval=16, distribution='uniform'),
            benchmark.FuzzedParameter('c', minval=4, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('h', minval=8, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('w', minval=8, maxval=256, distribution='uniform'),
        ],
        tensors=[
            benchmark.FuzzedTensor('x',
                                   size='nchw',
                                   min_elements=12,
                                   max_elements=10000000,
                                   cuda=True,
                                   dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42)

    res = []

    for kernel_size in [2, 3, 5]:
        for tensors, tensor_params, params in fuzzer.take(20):
            sub_label = str(tensors['x'].size())
            res.append(
                benchmark.Timer(
                    stmt=f'torch.nn.functional.{op}(x, {kernel_size})',
                    setup='',
                    globals=tensors,
                    label=f'{op}, {dtype=}, {kernel_size=}',
                    sub_label=sub_label,
                    description=f'{torch.__version__}').blocked_autorange(
                        min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

    with open(f'{torch_git_ver}-{op}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
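Examples #2 and #3 both index into a d_dtype lookup that is not part of the excerpt. A plausible definition, shown here only as an assumption about the original file:

# Assumed dtype lookup used by prof(); the original mapping may contain more entries.
import torch

d_dtype = {
    'float': torch.float32,
    'half': torch.half,
}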
Example #4
def main():
    tasks = [
        ("add", "add", "torch.add(x, y)"),
        ("add", "add (extra +0)", "torch.add(x, y + zero)"),
    ]

    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
                "x": torch.ones((size, 4)),
                "y": torch.ones((1, 4)),
                "zero": torch.zeros(()),
            },
            label=label,
            sub_label=sub_label,
            description=f"size: {size}",
            env=branch,
            num_threads=num_threads,
        )
        for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
        for label, sub_label, stmt in tasks
        for size in [1, 10, 100, 1000, 10000, 50000]
        for num_threads in [1, 4]
    ]

    for i, timer in enumerate(timers * repeats):
        serialized_results.append(
            pickle.dumps(timer.blocked_autorange(min_run_time=0.05)))
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare(
        [pickle.loads(i) for i in serialized_results])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()
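The timers above wrap torch in a FauxTorch object for the non-master branches so that artificial per-element overhead shows up as a regression. A minimal sketch in the spirit of PyTorch's own benchmark examples; the wrapped op and the sleep-based overhead are assumptions:

import time
import torch

class FauxTorch:
    """Forward to the real torch module, but make `add` artificially slower."""
    def __init__(self, real_torch, extra_ns_per_element):
        self._real_torch = real_torch
        self._extra_ns_per_element = extra_ns_per_element

    def _extra_overhead(self, result):
        # Sleep proportionally to the output size to emulate a slower kernel.
        time.sleep(result.numel() * self._extra_ns_per_element * 1e-9)
        return result

    def add(self, *args, **kwargs):
        return self._extra_overhead(self._real_torch.add(*args, **kwargs))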
Example #5
def main():
    tasks = [
        ("matmul", "x @ y", "torch.sparse.mm(x, y)"),
        ("matmul", "x @ y + 0", "torch.sparse.mm(x, y) + zero"),
    ]

    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
                "x": gen_sparse(size=size, density=density, dtype=torch.float32),
                "y": torch.rand(size, dtype=torch.float32),
                "zero": torch.zeros(()),
            },
            label=label,
            sub_label=sub_label,
            description=f"size: {size}",
            env=branch,
            num_threads=num_threads,
        )
        for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 10)]
        for label, sub_label, stmt in tasks
        for density in [0.05, 0.1]
        for size in [(8, 8), (32, 32), (64, 64), (128, 128)]
        for num_threads in [1, 4]
    ]

    for i, timer in enumerate(timers * repeats):
        serialized_results.append(pickle.dumps(
            timer.blocked_autorange(min_run_time=0.05)
        ))
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare([
        pickle.loads(i) for i in serialized_results
    ])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()
Example #6
def run_bench(model_names, bench_args):
    results = []
    for model_name in model_names:
        model_creator = MODELS[model_name]
        inputs, model = model_creator(bench_args)

        print("Benchmarking RecordFunction overhead for", model_name)
        print("Running warmup...", end=" ")
        sys.stdout.flush()
        for _ in range(bench_args.warmup):
            model(*inputs)
        print("finished")

        for num_threads in NUM_THREADS:
            for with_rec_fn in [True, False]:
                torch.autograd._enable_record_function(with_rec_fn)
                torch.autograd._clear_callbacks()
                if with_rec_fn:
                    torch.autograd._set_empty_test_observer(True, 0.0001)

                print("Running {} RecordFunction, num threads {} ...".format(
                    "with" if with_rec_fn else "without", num_threads),
                      end=" ")
                sys.stdout.flush()
                timer = benchmark_utils.Timer(
                    stmt="model(*inputs)",
                    globals={
                        "model": model,
                        "inputs": inputs
                    },
                    description=model_name,
                    label="Record function overhead",
                    sub_label=f"with{'' if with_rec_fn else 'out'}_rec_fn, num_threads {num_threads}",
                    num_threads=num_threads)
                result = timer.blocked_autorange(
                    min_run_time=bench_args.timer_min_run_time)
                print("finished")
                print(result)
                sys.stdout.flush()
                results.append(result)

    comparison = benchmark_utils.Compare(results)
    comparison.trim_significant_figures()
    comparison.highlight_warnings()
    comparison.print()
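run_bench() relies on a MODELS registry and a NUM_THREADS list defined elsewhere. A hypothetical registry matching the expected interface, where each creator returns (inputs, model) for the given bench_args (names and shapes below are placeholders):

import torch

NUM_THREADS = [1, 2, 4]  # placeholder thread counts

def _tiny_mlp(bench_args):
    # Placeholder model; the real script benchmarks its own model zoo.
    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
    inputs = (torch.randn(8, 64),)
    return inputs, model

MODELS = {"tiny_mlp": _tiny_mlp}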
Example #7
def run_lobpcg_comparison(label,
                          generator,
                          generator_settings,
                          k=5,
                          largest=True,
                          tol=1e-5):
    label = '{} {} (k={}, largest={})'.format(args.format.upper(), label, k,
                                              largest)

    results = []
    for kwargs in generator_settings:
        # generate input matrix
        a_pt, a_sp = generator(**kwargs)

        # use same initial eigenvectors for both scipy and pytorch
        x_pt = torch.randn(a_pt.size(0), k)
        x_sp = x_pt.numpy()

        description = '{:.4e}'.format(a_pt.size(0))

        t1 = benchmark.Timer(
            stmt="torch.lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="import torch",
            globals=dict(a=a_pt, x=x_pt, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='torch_lobpcg',
            description=description,
        )

        t2 = benchmark.Timer(
            stmt="lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="from scipy.sparse.linalg import lobpcg",
            globals=dict(a=a_sp, x=x_sp, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='scipy_lobpcg',
            description=description,
        )

        results.append(t1.blocked_autorange(min_run_time=1.))
        results.append(t2.blocked_autorange(min_run_time=1.))

    compare = benchmark.Compare(results)
    compare.print()
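run_lobpcg_comparison() expects each generator to return the same symmetric matrix twice: once as a torch tensor for torch.lobpcg and once in a form scipy's lobpcg accepts. An illustrative generator (the random construction is an assumption, not the original project's input):

import torch

def random_spd(n=1000):
    # Symmetric positive-definite matrix, returned as (torch tensor, numpy array).
    m = torch.randn(n, n)
    a_pt = m @ m.t() + n * torch.eye(n)
    return a_pt, a_pt.numpy()

It would be passed as generator together with generator_settings such as [dict(n=500), dict(n=2000)].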
Example #8
def benchMark(sizes):
    results = []
    if len(sizes) == 0:
        print("Parameter 'sizes' has to have a minimum of 1 element")
        return

    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={
                    'x': x
                },
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={
                    'x': x
                },
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()
    return compare
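The two statements being timed are imported from __main__; they follow the batched-dot implementations used in the PyTorch benchmark recipe:

import torch

def batched_dot_mul_sum(a, b):
    """Computes batched dot by multiplying and summing."""
    return a.mul(b).sum(-1)

def batched_dot_bmm(a, b):
    """Computes batched dot by reducing to bmm."""
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)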
Example #9
File: b.py  Project: xwang233/code-snippet
                                                     min_elements=12,
                                                     max_elements=10000000,
                                                     cuda=True,
                                                     dtype=torch.half,
                                                     max_allocation_bytes=1_000_000_000)
                          ],
                          seed=42)

res = []

for kernel_size in [2, 3, 5]:
    for tensors, tensor_params, params in fuzzer.take(20):
        sub_label = str(tensors['x'].size())
        res.append(
            benchmark.Timer(stmt=f'torch.nn.functional.max_pool3d(x, {kernel_size})',
                            setup='',
                            globals=tensors,
                            label=f'max_pool3d, {kernel_size=}',
                            sub_label=sub_label,
                            description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

torch_ver = str(torch.__version__)
torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

with open(f'{torch_git_ver}.pkl', 'wb') as f:
    pickle.dump(res, f)

compare = benchmark.Compare(res)
# compare.colorize()
compare.print()
Example #10
def benchmark_multihead_attention(
    label="",
    attn_dtype=torch.uint8,
    key_padding_dtype=torch.uint8,
    add_bias_kv=False,
    add_zero_attn=False,
    static_kv=False,
    batch_size=20,
    embedding=EMB,
    seq_len=SEQ,
    num_heads=HEADS,
):

    results = []
    # device = torch.device("cuda")

    xformers_att_config = '{"name": "scaled_dot_product"}'

    attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len)
    key_padding_mask = _get_mask(to_dtype=key_padding_dtype,
                                 dim0=batch_size,
                                 dim1=seq_len)

    q = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    k = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    v = torch.rand(seq_len, batch_size, embedding, requires_grad=True)

    _reset_seeds()

    original_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=None,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    xformers_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=xformers_att_config,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    fns = [
        original_bench_fw,
        xformers_bench_fw,
        original_bench_fw_bw,
        xformers_bench_fw_bw,
    ]

    for fn in fns:
        results.append(
            benchmark.Timer(
                stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)",
                globals={
                    "q": q,
                    "k": k,
                    "v": v,
                    "key_padding_mask": key_padding_mask,
                    "attn_mask": attn_mask,
                    "static_kv": static_kv,
                    "fn": fn,
                },
                label="multihead fw + bw",
                sub_label=f"{fn.__name__}",
                description=label,
            ).blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()
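The benchmark uses fairseq's MultiheadAttention plus a few helpers (_get_mask, _reset_seeds) and size constants that are not shown. Hypothetical stand-ins, purely to make the snippet's interface concrete; the real script defines its own versions:

import torch

SEQ, EMB, HEADS = 64, 48, 4  # placeholder sizes

def _get_mask(to_dtype, dim0, dim1):
    # Random binary mask cast to the requested dtype.
    return (torch.rand(dim0, dim1) > 0.5).to(to_dtype)

def _reset_seeds():
    torch.manual_seed(0)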
Example #11
                description='mul/sum',
            ).blocked_autorange(min_run_time=1))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={
                    'x': x
                },
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange(min_run_time=1))

compare = benchmark.Compare(results)
compare.print()

######################################################################
# .. code-block:: none
#    :caption: Output
#
#     [--------------- Batched dot ----------------]
#                           |  mul/sum   |    bmm
#     1 threads: -----------------------------------
#           [1, 1]          |       5.9  |      11.2
#           [1, 64]         |       6.4  |      11.4
#           [1, 1024]       |       6.7  |      14.2
#           [1, 10000]      |      10.2  |      23.7
#           [64, 1]         |       6.3  |      11.5
#           [64, 64]        |       8.6  |      15.4
Example #12
if __name__ == '__main__':
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
    parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--samples', type=int, default=10)
    parser.add_argument('--probability_regular', type=float, default=1.0)
    parser.add_argument('-o', '--output', type=str)
    args = parser.parse_args()

    num_benchmarks = len(args.device) * len(args.bench)
    i = 0
    results = []
    for device in args.device:
        for bench in (BENCHMARK_MAP[b] for b in args.bench):
            results += run_benchmark(
                name=bench.name, function=bench.function, dtype=bench.dtype,
                seed=args.seed, device=device, samples=args.samples,
                probability_regular=args.probability_regular)
            i += 1
            print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')

    if args.output is not None:
        with open(args.output, 'w') as f:
            _output_csv(f, results)

    compare = benchmark.Compare(results)
    compare.trim_significant_figures()
    compare.colorize()
    compare.print()
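The driver above iterates over DEVICE_NAMES and a BENCHMARK_MAP whose entries expose name, function, and dtype, then calls a run_benchmark() helper defined elsewhere. A hypothetical shape for those registries, shown only to make the interface explicit:

from collections import namedtuple
import torch

Benchmark = namedtuple('Benchmark', ['name', 'function', 'dtype'])
BENCHMARK_MAP = {'add': Benchmark('add', torch.add, torch.float32)}  # placeholder entry
BENCHMARK_NAMES = list(BENCHMARK_MAP)
DEVICE_NAMES = ['cpu', 'cuda']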
Example #13
    ).blocked_autorange(min_run_time=1))
all_res.append(
    benchmark.Timer(
        stmt='conv(inp)',
        globals={
            'inp': inp2,
            'conv': eca
        },
        num_threads=num_threads,
        label=label2,
        sub_label="ECA",
        description='description',
    ).blocked_autorange(min_run_time=1))
all_res.append(
    benchmark.Timer(
        stmt='conv(inp)',
        globals={
            'inp': inp2,
            'conv': eca9
        },
        num_threads=num_threads,
        label=label2,
        sub_label="ECA9",
        description='description',
    ).blocked_autorange(min_run_time=1))

## divide speed by batch size
all_res = [adjust_for_bs(i) for i in all_res]

compare = benchmark.Compare(all_res)
compare.print()
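Examples #13, #14, and #18 all post-process Measurements with an adjust_for_bs() helper that is not shown. A sketch consistent with the "divide speed by batch size" comment: scaling number_per_run makes Compare report per-sample rather than per-batch time (the default batch size here is a placeholder):

from torch.utils.benchmark import Measurement

def adjust_for_bs(m, bs=64):
    return Measurement(
        number_per_run=m.number_per_run * bs,
        raw_times=m.raw_times,
        task_spec=m.task_spec,
        metadata=m.metadata,
    )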
Example #14
params = get_params_str(conv2_sep)
t22 = benchmark.Timer(
    stmt='conv_sep(inp)',
    globals={
        'inp': inp2,
        'conv_sep': conv2_sep
    },
    num_threads=num_threads,
    label=label2,
    sub_label=f'Conv Sep. Params: {get_params_str(conv2_sep)}',
    description='description',
).blocked_autorange(min_run_time=1)

## divide speed by batch size
t0 = adjust_for_bs(t0)
t1 = adjust_for_bs(t1)
t2 = adjust_for_bs(t2)

t20 = adjust_for_bs(t20)
t21 = adjust_for_bs(t21)
t22 = adjust_for_bs(t22)

compare = benchmark.Compare([t0, t1, t2, t20, t21, t22])
compare.print()

# print(t0)
# print(dir(t0))
# print(t0.median)
# print(t1)
# print(t2)
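The sub_label above embeds a get_params_str() summary that is not included in the excerpt. A hypothetical one-liner with the same role:

def get_params_str(module):
    # Parameter count of the module, formatted with thousands separators.
    return f"{sum(p.numel() for p in module.parameters()):,}"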
Example #15
    def test_compare(self):
        # Simulate several approaches.
        costs = (
            # overhead_optimized_fn()
            (1e-6, 1e-9),

            # compute_optimized_fn()
            (3e-6, 5e-10),

            # special_case_fn()  [square inputs only]
            (1e-6, 4e-10),
        )

        sizes = (
            (16, 16),
            (16, 128),
            (128, 128),
            (4096, 1024),
            (2048, 2048),
        )

        # overhead_optimized_fn()
        class _MockTimer_0(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j)
                for i, j in sizes)

        class MockTimer_0(benchmark_utils.Timer):
            _timer_cls = _MockTimer_0

        # compute_optimized_fn()
        class _MockTimer_1(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j)
                for i, j in sizes)

        class MockTimer_1(benchmark_utils.Timer):
            _timer_cls = _MockTimer_1

        # special_case_fn()
        class _MockTimer_2(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j)
                for i, j in sizes if i == j)

        class MockTimer_2(benchmark_utils.Timer):
            _timer_cls = _MockTimer_2

        results = []
        for i, j in sizes:
            results.append(
                MockTimer_0(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="overhead_optimized",
                ).blocked_autorange(min_run_time=10))

            results.append(
                MockTimer_1(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="compute_optimized",
                ).blocked_autorange(min_run_time=10))

            if i == j:
                results.append(
                    MockTimer_2(
                        f"fn({i}, {j})",
                        label="fn",
                        description=f"({i}, {j})",
                        sub_label="special_case (square)",
                    ).blocked_autorange(min_run_time=10))

        def check_output(output: str, expected: str):
            # VSCode will strip trailing newlines from `expected`, so we have to match
            # this behavior when comparing output.
            output_str = "\n".join(
                i.rstrip() for i in output.strip().splitlines(keepends=False))

            self.assertEqual(output_str, textwrap.dedent(expected).strip())

        compare = benchmark_utils.Compare(results)

        check_output(
            str(compare), """
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |    1.3     |     3.0     |     17.4     |     4174.4     |     4174.4
                  compute_optimized      |    3.1     |     4.0     |     11.2     |     2099.3     |     2099.3
                  special_case (square)  |    1.1     |             |      7.5     |                |     1674.7

            Times are in microseconds (us).""")

        compare.trim_significant_figures()
        check_output(
            str(compare), """
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |     1      |     3.0     |      17      |      4200      |      4200
                  compute_optimized      |     3      |     4.0     |      11      |      2100      |      2100
                  special_case (square)  |     1      |             |       8      |                |      1700

            Times are in microseconds (us).""")

        compare.colorize()
        check_output(str(compare), """
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |     1      |  \x1b[92m\x1b[1m   3.0   \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m    17    \x1b[0m\x1b[0m  |      4200      |  \x1b[2m\x1b[91m    4200    \x1b[0m\x1b[0m
                  compute_optimized      |  \x1b[2m\x1b[91m   3    \x1b[0m\x1b[0m  |     4.0     |      11      |  \x1b[92m\x1b[1m    2100    \x1b[0m\x1b[0m  |      2100
                  special_case (square)  |  \x1b[92m\x1b[1m   1    \x1b[0m\x1b[0m  |             |  \x1b[92m\x1b[1m     8    \x1b[0m\x1b[0m  |                |  \x1b[92m\x1b[1m    1700    \x1b[0m\x1b[0m

            Times are in microseconds (us)."""

                     # noqa
                     )
Example #16
    def test_compare(self):
        # Simulate several approaches.
        costs = (
            # overhead_optimized_fn()
            (1e-6, 1e-9),

            # compute_optimized_fn()
            (3e-6, 5e-10),

            # special_case_fn()  [square inputs only]
            (1e-6, 4e-10),
        )

        sizes = (
            (16, 16),
            (16, 128),
            (128, 128),
            (4096, 1024),
            (2048, 2048),
        )

        # overhead_optimized_fn()
        class _MockTimer_0(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j)
                for i, j in sizes)

        class MockTimer_0(benchmark_utils.Timer):
            _timer_cls = _MockTimer_0

        # compute_optimized_fn()
        class _MockTimer_1(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j)
                for i, j in sizes)

        class MockTimer_1(benchmark_utils.Timer):
            _timer_cls = _MockTimer_1

        # special_case_fn()
        class _MockTimer_2(self._MockTimer):
            _function_costs = tuple(
                (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j)
                for i, j in sizes if i == j)

        class MockTimer_2(benchmark_utils.Timer):
            _timer_cls = _MockTimer_2

        results = []
        for i, j in sizes:
            results.append(
                MockTimer_0(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="overhead_optimized",
                ).blocked_autorange(min_run_time=10))

            results.append(
                MockTimer_1(
                    f"fn({i}, {j})",
                    label="fn",
                    description=f"({i}, {j})",
                    sub_label="compute_optimized",
                ).blocked_autorange(min_run_time=10))

            if i == j:
                results.append(
                    MockTimer_2(
                        f"fn({i}, {j})",
                        label="fn",
                        description=f"({i}, {j})",
                        sub_label="special_case (square)",
                    ).blocked_autorange(min_run_time=10))

        def rstrip_lines(s: str) -> str:
            # VSCode will rstrip the `expected` string literal whether you like
            # it or not. So we have to rstrip the compare table as well.
            return "\n".join(
                [i.rstrip() for i in s.splitlines(keepends=False)])

        compare = benchmark_utils.Compare(results)
        self.regularizeAndAssertExpectedInline(
            rstrip_lines(str(compare).strip()), """\
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |    1.3     |     3.0     |     17.4     |     4174.4     |     4174.4
                  compute_optimized      |    3.1     |     4.0     |     11.2     |     2099.3     |     2099.3
                  special_case (square)  |    1.1     |             |      7.5     |                |     1674.7

            Times are in microseconds (us).""")

        compare.trim_significant_figures()
        self.regularizeAndAssertExpectedInline(
            rstrip_lines(str(compare).strip()), """\
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |     1      |     3.0     |      17      |      4200      |      4200
                  compute_optimized      |     3      |     4.0     |      11      |      2100      |      2100
                  special_case (square)  |     1      |             |       8      |                |      1700

            Times are in microseconds (us).""")

        compare.colorize()
        columnwise_colored_actual = rstrip_lines(str(compare).strip())
        columnwise_colored_expected = textwrap.dedent("""\
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |     1      |  \x1b[92m\x1b[1m   3.0   \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m    17    \x1b[0m\x1b[0m  |      4200      |  \x1b[2m\x1b[91m    4200    \x1b[0m\x1b[0m
                  compute_optimized      |  \x1b[2m\x1b[91m   3    \x1b[0m\x1b[0m  |     4.0     |      11      |  \x1b[92m\x1b[1m    2100    \x1b[0m\x1b[0m  |      2100
                  special_case (square)  |  \x1b[92m\x1b[1m   1    \x1b[0m\x1b[0m  |             |  \x1b[92m\x1b[1m     8    \x1b[0m\x1b[0m  |                |  \x1b[92m\x1b[1m    1700    \x1b[0m\x1b[0m

            Times are in microseconds (us)."""

                                                      # noqa
                                                      )

        compare.colorize(rowwise=True)
        rowwise_colored_actual = rstrip_lines(str(compare).strip())
        rowwise_colored_expected = textwrap.dedent("""\
            [------------------------------------------------- fn ------------------------------------------------]
                                         |  (16, 16)  |  (16, 128)  |  (128, 128)  |  (4096, 1024)  |  (2048, 2048)
            1 threads: --------------------------------------------------------------------------------------------
                  overhead_optimized     |  \x1b[92m\x1b[1m   1    \x1b[0m\x1b[0m  |  \x1b[2m\x1b[91m   3.0   \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m    17    \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m    4200    \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m    4200    \x1b[0m\x1b[0m
                  compute_optimized      |  \x1b[92m\x1b[1m   3    \x1b[0m\x1b[0m  |     4.0     |  \x1b[2m\x1b[91m    11    \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m    2100    \x1b[0m\x1b[0m  |  \x1b[31m\x1b[1m    2100    \x1b[0m\x1b[0m
                  special_case (square)  |  \x1b[92m\x1b[1m   1    \x1b[0m\x1b[0m  |             |  \x1b[31m\x1b[1m     8    \x1b[0m\x1b[0m  |                |  \x1b[31m\x1b[1m    1700    \x1b[0m\x1b[0m

            Times are in microseconds (us)."""

                                                   # noqa
                                                   )

        def print_new_expected(s: str) -> None:
            print(f'{"":>12}"""\\', end="")
            for l in s.splitlines(keepends=False):
                print("\n" + textwrap.indent(repr(l)[1:-1], " " * 12), end="")
            print('"""\n')

        if expecttest.ACCEPT:
            # expecttest does not currently support non-printable characters,
            # so these two entries have to be updated manually.
            if columnwise_colored_actual != columnwise_colored_expected:
                print("New columnwise coloring:\n")
                print_new_expected(columnwise_colored_actual)

            if rowwise_colored_actual != rowwise_colored_expected:
                print("New rowwise coloring:\n")
                print_new_expected(rowwise_colored_actual)

        self.assertEqual(columnwise_colored_actual,
                         columnwise_colored_expected)
        self.assertEqual(rowwise_colored_actual, rowwise_colored_expected)
Example #17
        for x, y in load_dataset(dataset_path, hidden_size, sparsity)
    ]
    measurements = []

    for i, timer in enumerate(timers * repeats):
        m = timer.blocked_autorange(min_run_time=0.05)
        serialized_results.append(pickle.dumps(m))
        m.metadata = {
            "device": 'cuda' if m.task_spec.env.find("cuda") >= 0 else 'cpu'
        }
        measurements.append(m)
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare(
        [pickle.loads(i) for i in serialized_results])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()

    table = [(m.task_spec.sub_label, m.task_spec.description,
              m.metadata["device"], m.mean) for m in measurements]
    df = pd.DataFrame(table, columns=['method', 'sparsity', 'device', 'time'])
    df.to_pickle(df_output_path)
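The pickled DataFrame can later be reloaded and pivoted into a method-by-sparsity table of mean times; a short sketch reusing the df_output_path from above:

import pandas as pd

df = pd.read_pickle(df_output_path)
print(df.pivot_table(index='method', columns='sparsity', values='time', aggfunc='mean'))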
Example #18
label1 = f"Stem conv. Shape: {inp.shape}"
t0 = benchmark.Timer(
    stmt='conv(inp)',
    globals={
        'inp': inp,
        'conv': conv_pw
    },
    num_threads=num_threads,
    label="PW Stem convs",
    #     sub_label=f'Reg Conv. Params: {get_params_str(conv)}',
    description='description',
).blocked_autorange(min_run_time=1)

t1 = benchmark.Timer(
    stmt='conv(inp)',
    globals={
        'inp': inp2,
        'conv': conv_pw2
    },
    num_threads=num_threads,
    label="PW deeper convs",
    #     sub_label=f'Conv DW. Params: {get_params_str(conv_dw)}',
    description='description',
).blocked_autorange(min_run_time=1)

## divide speed by batch size
t0 = adjust_for_bs(t0)
t1 = adjust_for_bs(t1)

compare = benchmark.Compare([t0, t1])
compare.print()