from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for quantized unary operators (point-wise and reduction)."""

# Configs for pointwise and reduction unary ops
qunary_ops_configs_short = op_bench.config_list(
    attr_names=['M', 'N'],
    attrs=[
        [512, 512],
    ],
    cross_product_configs={
        'dtype': [torch.quint8],
    },
    tags=['short']
)

qunary_ops_configs_long = op_bench.cross_product_configs(
    M=[256, 1024],
    N=[256, 1024],
    dtype=[torch.quint8, torch.qint8, torch.qint32],
    tags=['long']
)


class QUnaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, dtype, op_func):
        f_input = torch.rand(M, N)
import operator_benchmark as op_bench
import torch

'''Microbenchmarks for the quantized interpolate op.

Note: We are not benchmarking `upsample` as it is being deprecated, and calls
`interpolate` anyway.
'''

qinterpolate_long_configs = op_bench.config_list(
    attr_names=['M', 'N', 'K'],
    attrs=[
        [512, 512, 512],
    ],
    cross_product_configs={
        'dtype': [torch.quint8, torch.qint8, torch.qint32],
        'mode': ['nearest', 'bilinear'],
        'scale': [0.5, 1.0, 2.0],
        'contig': [True],  # TODO: Add `False` after #29435
    },
    tags=['long']
)

qinterpolate_short_configs = op_bench.config_list(
    attr_names=['M', 'N', 'K', 'dtype', 'mode', 'scale', 'contig'],
    attrs=[
        [32, 32, 32, torch.quint8, 'nearest', 0.5, True],   # Downsample
        [32, 32, 32, torch.quint8, 'bilinear', 0.5, True],  # Downsample
        [32, 32, 32, torch.quint8, 'nearest', 2.0, True],   # Upsample
        [32, 32, 32, torch.quint8, 'bilinear', 2.0, True],  # Upsample
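# Illustrative sketch (an assumption, not part of the benchmark file above) of the
# measured op: `torch.nn.functional.interpolate` applied to a quantized tensor
# dispatches to the quantized kernel. Shapes and quantization parameters below
# are arbitrary example values.
import torch
import torch.nn.functional as F

x = torch.rand(1, 3, 32, 32)
q_x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
q_down = F.interpolate(q_x, scale_factor=0.5, mode='nearest')                       # downsample
q_up = F.interpolate(q_x, scale_factor=2.0, mode='bilinear', align_corners=False)   # upsample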
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for Chunk operator"""

# Configs for PT Chunk operator
chunk_short_configs = op_bench.config_list(
    attr_names=["M", "N", "chunks"],
    attrs=[
        [256, 512, 2],
        [512, 512, 2],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

chunks_long_configs = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    chunks=[2, 4],
    device=['cpu', 'cuda'],
    tags=['long']
)


class ChunkBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, chunks, device):
import operator_benchmark as op_bench
import torch
import random
from typing import List

"""Microbenchmarks for Cat operator"""

# Configs for PT Cat operator
cat_configs_short = op_bench.config_list(
    attr_names=['sizes', 'N', 'dim'],
    attrs=[
        [(1, 1, 1), 2, 0],       # noqa
        [(512, 512, 2), 2, 1],   # noqa
        [(128, 1024, 2), 2, 1],  # noqa
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'],
)

cat_configs_long = op_bench.config_list(
    attr_names=['sizes', 'N', 'dim'],
    attrs=[
        [(2**10, 2**10, 2), 2, 0],      # noqa
        [(2**10+1, 2**10-1, 2), 2, 1],  # noqa
        [(2**10, 2**10, 2), 2, 2],      # noqa
        [[lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1],  # noqa
         5, 0],
import operator_benchmark as op_bench
import torch
import numpy

"""Microbenchmarks for index_select operator."""

# An example input from this configuration is M=8, N=8, K=1, dim=1.
index_select_configs_short = op_bench.config_list(
    attr_names=["M", "N", "K", "dim"],
    attrs=[
        [8, 8, 1, 1],
        [256, 512, 1, 1],
        [512, 512, 1, 1],
        [8, 8, 2, 1],
        [256, 512, 2, 1],
        [512, 512, 2, 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"]
)

index_select_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    K=[1, 2],
    dim=[1],
    device=['cpu', 'cuda'],
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch
import torch.nn as nn

"""Microbenchmarks for Linear operator."""

configs = op_bench.config_list(
    attr_names=["N", "IN", "OUT"],
    attrs=[
        [1, 32, 10],
        [4, 256, 100],
        [16, 1024, 256],
    ],
    tags=["short"]
)


class LinearBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, N, IN, OUT):
        self.input_one = torch.rand(N, IN)
        self.linear = nn.Linear(IN, OUT)
        self.set_module_name("linear")

    def forward(self):
        return self.linear(self.input_one)


op_bench.generate_pt_test(configs, LinearBenchmark)
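# Files in this suite usually end with a standard entry point so they can be run
# directly as scripts (e.g. `python linear_test.py --tag_filter short`); a sketch,
# assuming the stock operator_benchmark runner:
if __name__ == "__main__":
    op_bench.benchmark_runner.main()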
import operator_benchmark as op_bench
import torch
import torch.nn.functional as F

"""Microbenchmarks for batchnorm operator."""

batchnorm_configs_short = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 256, 3136],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"]
)

batchnorm_configs_long = op_bench.cross_product_configs(
    M=[1, 128],
    N=[8192, 2048],
    K=[1],
    device=['cpu', 'cuda'],
    tags=["long"]
)


class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        self.input_one = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for Split operator"""

# Configs for PT Split operator
split_configs_short = op_bench.config_list(
    attr_names=["M", "N", "parts"],
    attrs=[
        [256, 512, 2],
        [512, 512, 2],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=["short"],
)

split_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    parts=[2, 4],
    device=['cpu'],
    tags=['long']
)
import operator_benchmark as op_bench
import torch.nn as nn

"""
Microbenchmarks for the softmax operators.
"""

# Configs for softmax ops
softmax_configs_short = op_bench.config_list(
    attr_names=['N', 'C', 'H', 'W'],
    attrs=[
        [4, 3, 256, 256],
        [8, 3, 512, 512],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short']
)

softmax_configs_long = op_bench.cross_product_configs(
    N=[8, 16],
    C=[3, 64],
    H=[64, 128],
    W=[64, 128],
    device=['cpu', 'cuda'],
    tags=['long']
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""

# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
    M=[8, 128],
    N=[32, 64],
    K=[256, 512],
    device=['cpu', 'cuda'],
    tags=["long"]
)

add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 1, 1],
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        self.input_one = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
        self.input_two = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
        [3, 512, 512, torch.quint8, 'D'],
    ],
    'tags': ['short'],
}

quantize_configs_long_dict = {
    'C': [3, 5, 8],  # this is reused for per-channel: avoid single channel test
    'M': [256, 1024],
    'N': [256, 1024],
    'dtype': [torch.quint8, torch.qint8, torch.qint32],
    'mode': ['D', 'Q'],
    'tags': ['long'],
}

quantize_per_tensor_configs_short = op_bench.config_list(
    **quantize_configs_short_dict
)

quantize_per_tensor_configs_long = op_bench.cross_product_configs(
    **quantize_configs_long_dict
)


class QuantizePerTensorBenchmark(op_bench.TorchBenchmarkBase):
    r"""Benchmarks both quantization and dequantization."""
    def init(self, C, M, N, dtype, mode):
        assert mode in ('Q', 'D')
        self.input = torch.rand(C, M, N)
        self.dtype = dtype
        self.op = nnq.Quantize(scale=1.0, zero_point=0, dtype=dtype)
        self.set_module_name('QuantizePerTensor')

        if mode == 'D':
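# A minimal sketch (illustrative assumption, not part of the benchmark) of the two
# modes measured above: 'Q' quantizes a float tensor per tensor, 'D' dequantizes it
# back to float. Scale/zero_point values are arbitrary examples.
import torch

x = torch.rand(3, 512, 512)
q_x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)  # 'Q' path
x_back = q_x.dequantize()                                                        # 'D' path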
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for quantized batchnorm operator."""

batchnorm_configs_short = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 256, 3136],
    ],
    cross_product_configs={
        'device': ['cpu'],
        'dtype': (torch.qint8,),
    },
    tags=["short"]
)


class QBatchNormBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device, dtype):
        self._init(M, N, K, device)
        x_scale = 0.1
        x_zero_point = 0
        self.q_input_one = torch.quantize_per_tensor(
            self.input_one, scale=x_scale, zero_point=x_zero_point, dtype=dtype)
        self.mean = torch.rand(N)
        self.var = torch.rand(N)
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from caffe2.python import core

"""Microbenchmarks for element-wise ReplaceNaN operator."""

# Configs for C2 ReplaceNaN operator
replace_nan_long_configs = op_bench.cross_product_configs(
    M=[32, 64, 128],
    N=range(32, 128, 32),
    dtype=["float", "double"],
    tags=["long"]
)

replace_nan_short_configs = op_bench.config_list(
    attrs=[
        [16, 16, "float"],
        [16, 16, "double"],
        [64, 64, "float"],
        [64, 64, "double"],
    ],
    attr_names=["M", "N", "dtype"],
    tags=["short"],
)


class ReplaceNaNBenchmark(op_bench_c2.Caffe2BenchmarkBase):
    def init(self, M, N, dtype):
        self.input = self.tensor([M, N], dtype)
        self.set_module_name("replace_nan")

    def forward(self):
        op = core.CreateOperator("ReplaceNaN", self.input, self.input, value=1.0)
        return op
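# A standalone sketch (an assumption, outside the benchmark harness) of what the
# Caffe2 forward above builds: a ReplaceNaN operator that substitutes NaNs with the
# given `value`, run once through the Caffe2 workspace.
import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("X", np.array([[1.0, np.nan], [np.nan, 4.0]], dtype=np.float32))
replace_op = core.CreateOperator("ReplaceNaN", ["X"], ["Y"], value=1.0)
workspace.RunOperatorOnce(replace_op)
print(workspace.FetchBlob("Y"))  # NaNs replaced by 1.0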
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""

# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 10, 3),
    K=[2**x for x in range(0, 3)],
    tags=["long"]
)

add_short_configs = op_bench.config_list(
    attrs=[
        [64, 64, 64],
        [64, 64, 128],
    ],
    attr_names=["M", "N", "K"],
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K):
        self.input_one = torch.rand(M, N, K)
        self.input_two = torch.rand(M, N, K)
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)
import operator_benchmark as op_bench
import torch
from typing import List

"""Microbenchmarks for as_strided operator"""

# Configs for PT as_strided operator
as_strided_configs_short = op_bench.config_list(
    attr_names=["M", "N", "size", "stride", "storage_offset"],
    attrs=[
        [8, 8, (2, 2), (1, 1), 0],
        [256, 256, (32, 32), (1, 1), 0],
        [512, 512, (64, 64), (2, 2), 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

as_strided_configs_long = op_bench.cross_product_configs(
    M=[512],
    N=[1024],
    size=[(16, 16), (128, 128)],
    stride=[(1, 1)],
    storage_offset=[0, 1],
    device=['cpu', 'cuda'],
    tags=['long']
)


class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, size, stride, storage_offset, device):
import operator_benchmark as op_bench
import torch.nn as nn

"""
Microbenchmarks for the hardsigmoid operator.
"""

# Configs for hardsigmoid ops
hardsigmoid_configs_short = op_bench.config_list(
    attr_names=['N', 'C', 'H', 'W'],
    attrs=[
        [1, 3, 256, 256],
        [4, 3, 256, 256],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=['short']
)

hardsigmoid_configs_long = op_bench.cross_product_configs(
    N=[8, 16],
    C=[3],
    H=[256, 512],
    W=[256, 512],
    device=['cpu'],
    tags=['long']
    LENGTH=range(1, 100),
    M=[1],
    N=[2],
    MAX_LENGTH=range(1, 100),
    device=['cpu', 'cuda'],
    dtype=[torch.int32],
    tags=["long"],
)

clip_ranges_short_configs = op_bench.config_list(
    attrs=[
        [6, 1, 2, 1, torch.int32],
        [7, 1, 2, 2, torch.int32],
        [8, 1, 2, 3, torch.int32],
        [9, 1, 2, 4, torch.int32],
        [10, 1, 2, 5, torch.int32],
    ],
    attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)


class ClipRangesBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype):
        self.inputs = {
            "input": torch.rand(LENGTH, M, N, device=device).type(dtype),
            "max_length": MAX_LENGTH,
        }
        self.set_module_name("clip_ranges")
import operator_benchmark as op_bench
import torch

# Configs for pointwise and reduction unary ops
qmethods_configs_short = op_bench.config_list(
    attr_names=['M', 'N'],
    attrs=[
        [32, 32],
    ],
    cross_product_configs={
        'dtype': [torch.quint8],
        'contig': [False, True],
    },
    tags=['short']
)

qmethods_configs_long = op_bench.cross_product_configs(
    M=[256, 1024],
    N=[256, 1024],
    dtype=[torch.qint8, torch.qint32],
    contig=[False, True],
    tags=['long']
)


class _QMethodBenchmarkBase(op_bench.TorchBenchmarkBase):
    def init(self, M, N, dtype, contig):
        f_input = torch.rand(M, N)
        scale = 1.0
        zero_point = 0
        self.q_input = torch.quantize_per_tensor(f_input, scale=scale,
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch
import torch.nn as nn

"""
Microbenchmarks for the softmax operators.
"""

# Configs for softmax ops
softmax_configs_short = op_bench.config_list(
    attr_names=['N', 'C', 'H', 'W'],
    attrs=[
        [1, 3, 32, 32],
        [2, 3, 64, 64],
    ],
    tags=['short']
)

softmax_configs_long = op_bench.config_list(
    attr_names=['N', 'C', 'H', 'W'],
    attrs=[
        [8, 3, 128, 128],
        [16, 512, 14, 14],
        [16, 256, 28, 28],
    ],
    tags=['long']
)

softmax_ops_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['Softmax', nn.Softmax],
import operator_benchmark as op_bench
import torch
import numpy

"""Microbenchmarks for gather operator."""

# An example input from this configuration is M=256, N=512, dim=0.
gather_configs_short = op_bench.config_list(
    attr_names=["M", "N", "dim"],
    attrs=[
        [256, 512, 0],
        [512, 512, 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"]
)

gather_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    dim=[0, 1],
    device=['cpu', 'cuda'],
    tags=["long"]
)


class GatherBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, dim, device):
        self.input_one = torch.rand(M, N, device=device)
        self.dim = dim
        # Index values along `dim` must be smaller than the size of that dimension.
        min_val = M if dim == 0 else N
        numpy.random.seed((1 << 32) - 1)
        self.index = torch.tensor(numpy.random.randint(0, min_val, (M, N)), device=device)
import operator_benchmark as op_bench import torch """ Microbenchmarks for batch matrix mult with einsum and torch.bmm. """ batch_mm_configs_short = op_bench.config_list( attr_names=["B", "M", "N", "K"], attrs=[ [4, 5, 3, 2], [32, 25, 20, 30], [128, 100, 120, 110], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"], ) batch_mm_configs_long = op_bench.config_list( attr_names=["B", "M", "N", "K"], attrs=[ [128, 256, 128, 256], [512, 1024, 1024, 512], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["long"], )
import torch

import operator_benchmark as op_bench

# 2D pooling will have input matrix of rank 3 or 4
qpool2d_long_configs = op_bench.config_list(
    attrs=(
        # C    H    W    k       s       p
        (1,    3,   3,  (3, 3), (1, 1), (0, 0)),  # dummy         # noqa
        (3,   64,  64,  (3, 3), (2, 2), (1, 1)),  # dummy         # noqa
        # VGG16 pools with original input shape: (-1, 3, 224, 224)
        (64,  224, 224, (2, 2), (2, 2), (0, 0)),  # MaxPool2d-4   # noqa
        (256, 56,  56,  (2, 2), (2, 2), (0, 0)),  # MaxPool2d-16  # noqa
    ),
    attr_names=('C', 'H', 'W',   # Input layout
                'k', 's', 'p'),  # Pooling parameters
    cross_product_configs={
        'N': (1, 4),
        'contig': (False, True),
        'dtype': (torch.quint8,),
    },
    tags=('long',)
)

qpool2d_short_configs = op_bench.config_list(
    attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),),  # dummy  # noqa
    attr_names=('C',
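# An illustrative sketch (an assumption, not taken from the benchmark) of a
# quantized 2D max pool with the C/H/W and k/s/p naming used above: quantize an
# NCHW tensor, then pool it; max pooling operates directly on quantized tensors.
import torch
import torch.nn.functional as F

x = torch.rand(1, 3, 64, 64)  # N, C, H, W
q_x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
q_out = F.max_pool2d(q_x, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))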
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from caffe2.python import core

"""Microbenchmarks for MatMul operator"""

# Configs for C2 Matmul operator
mm_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 10, 3),
    K=[2**x for x in range(0, 3)],
    trans_a=[True, False],
    trans_b=[True, False],
    tags=["long"]
)

mm_short_configs = op_bench.config_list(
    attrs=[
        [128, 128, 128, False, True],
        [1024, 1024, 256, True, False],
        [8192, 8192, 1024, True, False],
    ],
    attr_names=["M", "N", "K", "trans_a", "trans_b"],
    tags=["short"],
)


class MatMulBenchmark(op_bench_c2.Caffe2BenchmarkBase):
    def init(self, M, N, K, trans_a, trans_b):
        self.input_one = self.tensor([N, M]) if trans_a else self.tensor([M, N])
        self.input_two = self.tensor([K, N]) if trans_b else self.tensor([N, K])
        self.args = {'trans_a': trans_a, 'trans_b': trans_b}
        self.output = self.tensor([M, K])
        self.set_module_name("matmul")
self.set_module_name("interpolate") def forward(self, input_image, output_size, mode): return torch.nn.functional.interpolate(input_image, size=output_size, mode=mode, align_corners=False) config_short = op_bench.config_list( attr_names=["input_size", "output_size"], attrs=[ [(1, 3, 60, 40), (24, 24)], [(1, 3, 600, 400), (240, 240)], [(1, 3, 320, 320), (256, 256)], ], cross_product_configs={ 'channels_last': [True, False], }, tags=["short"], ) config_long = op_bench.config_list( attr_names=["input_size", "output_size"], attrs=[ [(1, 3, 320, 320), (512, 512)], [(1, 3, 500, 500), (256, 256)], [(1, 3, 500, 500), (800, 800)], [(2, 128, 64, 46), (128, 128)], ], cross_product_configs={
from __future__ import absolute_import, division, print_function, unicode_literals

import operator_benchmark as op_bench
import torch

# Configs for pointwise unary ops
unary_ops_configs = op_bench.config_list(
    attrs=[
        [128, 128],
    ],
    attr_names=["M", "N"],
    tags=["short"]
)

unary_ops_list = op_bench.op_list(
    attr_names=["op_name", "op_func"],
    attrs=[
        ["abs", torch.abs],
        ["acos", torch.acos],
    ],
)


class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, op_func):
        self.input_one = torch.rand(M, N)
        self.op_func = op_func

    def forward(self):
        return self.op_func(self.input_one)
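# A sketch of how an op_list is typically expanded into individual tests
# (assuming the stock operator_benchmark helper); one benchmark is generated
# per (op, config) pair:
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)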
import operator_benchmark as op_bench
import torch
from torch import nn

"""
Microbenchmarks for RNNs.
"""

qrnn_configs = op_bench.config_list(
    attrs=[
        [1, 3, 1],
        [5, 7, 4],
    ],
    # names: input_size, hidden_size, num_layers
    attr_names=["I", "H", "NL"],
    cross_product_configs={
        "B": (True,),         # Bias always True for quantized
        "D": (False, True),   # Bidirectional
        "dtype": (torch.qint8,)  # Only qint8 dtype works for now
    },
    tags=["short"]
)


class LSTMBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, I, H, NL, B, D, dtype):
        sequence_len = 128
        batch_size = 16

        # The quantized.dynamic.LSTM has a bug. That's why we create a regular
        # LSTM, and quantize it later. See issue #31192.
        scale = 1.0 / 256
        zero_point = 0
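# A minimal sketch (an assumption, not the benchmark's exact code) of the
# "create a float LSTM, then quantize it" workaround described in the comment
# above, using dynamic quantization:
import torch
from torch import nn

float_lstm = nn.LSTM(input_size=1, hidden_size=3, num_layers=1)
quantized_lstm = torch.quantization.quantize_dynamic(
    float_lstm, {nn.LSTM}, dtype=torch.qint8)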
import operator_benchmark as op_bench
import torch.nn.quantized as nnq

"""
Microbenchmarks for Quantized Linear operators.
"""

# Configs for qlinear
qlinear_configs = op_bench.config_list(
    attrs=[
        [1024, 1024, 1024],
        [64, 800, 320],
        [64, 768, 512],
        [16, 256, 512],
        [128, 128, 128],
        [256, 512, 256],
        [6400, 15, 141],
        [6400, 8, 141],
        [16, 211, 2504],
        [16, 369, 1434],
        [1, 1024, 3496],
        [16, 256, 512],
        [1, 1600, 3456],
    ],
    attr_names=["N", "OUT", "IN"],  # M, N, K
    tags=["short"],
)


class QLinearBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, N, IN, OUT):
        scale = 1.0 / 255
        zero_point = 0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for MatMul operator"""

# Configs for PT Matmul operator
mm_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K", "trans_a", "trans_b"],
    attrs=[
        [1, 1, 1, True, False],
        [128, 128, 128, True, False],
        [256, 256, 256, False, True],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

mm_long_configs = op_bench.cross_product_configs(
    M=[32],
    N=[512, 128],
    K=[64],
    trans_a=[False, True],
    trans_b=[True, False],
    device=['cpu', 'cuda'],
    tags=["long"]
)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch
import torch.nn as nn

"""Microbenchmarks for Linear operator."""

linear_configs_short = op_bench.config_list(
    attr_names=["N", "IN", "OUT"],
    attrs=[
        [4, 256, 128],
        [16, 512, 256],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"]
)

linear_configs_long = op_bench.cross_product_configs(
    N=[32, 64],
    IN=[128, 512],
    OUT=[64, 128],
    device=['cpu', 'cuda'],
    tags=["long"]
)


class LinearBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, N, IN, OUT, device):
        self.input_one = torch.rand(N, IN, device=device)
        self.linear = nn.Linear(IN, OUT).to(device=device)
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""

add_short_configs = op_bench.config_list(
    attr_names=['M', 'N', 'K'],
    attrs=[
        [8, 16, 32],
        [16, 16, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
        'dtype': [torch.float, torch.float64],
    },
    tags=['short'],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device, dtype):
        self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True)
        self.input_two = torch.rand(M, N, K, device=device, dtype=dtype)
        self.set_module_name('add')

    def forward(self):
        return torch.add(self.input_one, self.input_two)


op_bench.generate_pt_test(add_short_configs, AddBenchmark)
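# Because `input_one` is created with requires_grad=True, a backward-pass
# benchmark can also be generated (a sketch, assuming the stock helper exists
# in this version of the framework):
op_bench.generate_pt_gradient_test(add_short_configs, AddBenchmark)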