# NOTE(review): the next line is the tail of a class body whose header lies
# outside this view; it is kept verbatim rather than re-indented by guesswork.
# Quantize the tensor self.q_input = torch.quantize_per_tensor(f_input, scale=self.scale, zero_point=self.zero_point, dtype=dtype) if not contig: # Make non-contiguous new_shape = list(range(self.q_input.ndim))[::-1] self.q_input = self.q_input.permute(new_shape) def init(self, dims, contig, inplace, dtype, op_func): self._setup(dims, contig, dtype) self.qop = op_func def forward(self): if self.qop in (nnq.functional.hardswish, nnq.functional.elu, nnq.functional.celu): return self.qop(self.q_input, scale=self.scale, zero_point=self.zero_point) return self.qop(self.q_input)


# Register QActivationBenchmarkBase against the activation op list, using the
# short configs followed by the long configs.
op_bench.generate_pt_tests_from_op_list(
    qactivation_ops,
    qactivation_short_configs + qactivation_long_configs,
    QActivationBenchmarkBase,
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
q_input = q_input.permute(new_shape) self.inputs = {"q_input": q_input} def init(self, dims, contig, inplace, dtype, op_func): self._setup(dims, contig, dtype) self.qop = op_func class QActivationBenchmark(QActivationBenchmarkBase): def forward(self, q_input): return self.qop(q_input) op_bench.generate_pt_tests_from_op_list( qactivation_ops, qactivation_short_configs + qactivation_long_configs, QActivationBenchmark) qactivation_scale_zero_point_ops = op_bench.op_list( attrs=( ('functional.hardswish', nnq.functional.hardswish), ('functional.elu', nnq.functional.elu), ('functional.celu', nnq.functional.celu), ), attr_names=('op_name', 'op_func'), ) class QActivationScaleZeroPointBenchmark(QActivationBenchmarkBase): def forward(self, q_input): return self.qop(q_input, scale=self.scale, zero_point=self.zero_point)
# Configs tagged 'long', built by cross_product_configs from the listed
# N / C / H / W values on the 'cpu' device.
hardswish_configs_long = op_bench.cross_product_configs(
    N=[8, 16],
    C=[3],
    H=[256, 512],
    W=[256, 512],
    device=['cpu'],
    tags=['long'],
)

# The single (op_name, op_func) pair exercised by this benchmark.
hardswish_ops_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['Hardswish', nn.Hardswish],
    ],
)


class HardswishBenchmark(op_bench.TorchBenchmarkBase):
    """Apply an instantiated ``op_func`` to a random (N, C, H, W) tensor."""

    def init(self, N, C, H, W, device, op_func):
        # op_func is passed as a callable that is invoked with no arguments
        # here to obtain the object that forward() will call.
        shape = (N, C, H, W)
        self.input_one = torch.rand(shape, device=device)
        self.op_func = op_func()

    def forward(self):
        module = self.op_func
        return module(self.input_one)


# Register the benchmark over the short configs followed by the long ones.
op_bench.generate_pt_tests_from_op_list(
    hardswish_ops_list,
    hardswish_configs_short + hardswish_configs_long,
    HardswishBenchmark,
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
) class Pool1dBenchmark(op_bench.TorchBenchmarkBase): def init(self, kernel, stride, N, C, L, device, op_func): self.input = torch.rand(N, C, L, device=device) self.kernel = kernel self.stride = stride self.op_func = op_func(self.kernel, stride=self.stride) def forward(self): return self.op_func(self.input) op_bench.generate_pt_tests_from_op_list( pool_1d_ops_list, pool_1d_configs_short + pool_1d_configs_long, Pool1dBenchmark) """ Microbenchmarks for MaxPool2d and AvgPool2d operators. """ # Configs for pool-2d ops pool_2d_configs_short = op_bench.config_list( attr_names=['kernel', 'stride', 'N', 'C', 'H', 'W'], attrs=[ [[3, 1], [2, 1], 1, 16, 32, 32], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=['short'])
"q_input_a": self.q_input_a, "q_input_b": self.q_input_a, "scale": self.scale, "zero_point": self.zero_point } self.op_func = op_func def forward(self, q_input_a, q_input_b, scale: float, zero_point: int): return self.op_func(q_input_a, q_input_b, scale=scale, zero_point=zero_point) op_bench.generate_pt_tests_from_op_list(qarithmetic_binary_ops, qarithmetic_binary_configs, QFunctionalBenchmark) class QFunctionalScalarBenchmark(_QFunctionalBinaryArithmeticBenchmarkBase): def init(self, N, dtype, contig, op_func): super(QFunctionalScalarBenchmark, self).setup(N, dtype, contig) self.inputs = {"q_input": self.q_input_a, "scalar_input": 42} self.op_func = op_func def forward(self, q_input, scalar_input: int): return self.op_func(q_input, scalar_input) op_bench.generate_pt_tests_from_op_list(qarithmetic_binary_scalar_ops, qarithmetic_binary_configs,
self.inputs = { "input": self.input, "scale": self.scale, "zero_point": self.zero_point, "quant_min": self.quant_min, "quant_max": self.quant_max, } self.op_func = op_func def forward(self, input, scale, zero_point, quant_min: int, quant_max: int): return self.op_func(input, scale, zero_point, quant_min, quant_max) op_bench.generate_pt_tests_from_op_list( fake_quantize_per_tensor_ops, fake_quantize_operator_configs_short + fake_quantize_operator_configs_long, FakeQuantizePerTensorBaseOpBenchmark) op_bench.generate_pt_gradient_tests_from_op_list( fake_quantize_per_tensor_ops, fake_quantize_operator_configs_short + fake_quantize_operator_configs_long, FakeQuantizePerTensorBaseOpBenchmark) def fakeQuantizePerChannelLearnableKernel(input, scale, zero_point, axis: int, quant_min: int, quant_max: int): return torch._fake_quantize_learnable_per_channel_affine( input, scale, zero_point, axis, quant_min, quant_max) def fakeQuantizePerChannelOriginalKernel(input, scale, zero_point, axis: int, quant_min: int, quant_max: int):
class BatchElementWiseBenchmark(op_bench.TorchBenchmarkBase):
    """Element-wise product of two random (B, M, N) tensors via einsum or mul."""

    def init(self, B, M, N, device, op_func):
        # Created in this order so the random draws match: input_one first.
        first = torch.rand(B, M, N, device=device)
        second = torch.rand(B, M, N, device=device)
        self.inputs = {
            "input_one": first,
            "input_two": second,
        }
        self.op_func = op_func

    def forward(self, input_one, input_two):
        # op_func is only inspected by name; anything not named "einsum"
        # takes the torch.mul path.
        if self.op_func.__name__ != "einsum":
            return torch.mul(input_one, input_two)
        return torch.einsum('bij,bij->bij', input_one, input_two)


op_bench.generate_pt_tests_from_op_list(
    batch_mm_op_list,
    batch_mm_configs_short + batch_mm_configs_long,
    BatchMatrixMultBenchmark,
)

op_bench.generate_pt_tests_from_op_list(
    batch_elementwise_op_list,
    batch_elementwise_configs_short + batch_elementwise_configs_long,
    BatchElementWiseBenchmark,
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
def init(self, num_embeddings, embedding_dim, op_func): self.weight = torch.from_numpy((np.random.random_sample( (num_embeddings, embedding_dim)) + 1).astype(np.float32)) self.op_func = op_func def forward(self): return self.op_func(self.weight) class EmbeddingBagFusedToFloatBase(op_bench.TorchBenchmarkBase): def init(self, num_embeddings, embedding_dim, op_func): weight = torch.randn(num_embeddings, embedding_dim + 8, dtype=torch.float) self.packed_weight = weight.to(torch.uint8) self.op_func = op_func def forward(self): return self.op_func(self.packed_weight) op_bench.generate_pt_tests_from_op_list( conversion_ops, embeddingbag_conversion_short_configs + embeddingbag_conversion_long_configs, EmbeddingBagFloatToFusedBase) op_bench.generate_pt_tests_from_op_list( unpack_ops, embeddingbag_conversion_short_configs + embeddingbag_conversion_long_configs, EmbeddingBagFusedToFloatBase) if __name__ == "__main__": op_bench.benchmark_runner.main()
# NOTE(review): the next line is a method of a class whose header lies outside
# this view; it is kept verbatim. Observe that it accepts `mode`,
# `include_last_offset`, `is_pruned_weights` and `compressed_indices` but the
# call passes `mode=0` and the `self.*` attributes instead of those
# parameters -- confirm this is intentional before changing it.
def forward( self, prepacked_weights, indices, offsets, mode: int, per_sample_weights: Optional[torch.Tensor], include_last_offset: bool, is_pruned_weights: bool, compressed_indices: Optional[torch.Tensor] ): return self.op_func(prepacked_weights, indices, offsets, mode=0, per_sample_weights=per_sample_weights, include_last_offset=self.include_last_offset, pruned_weights=self.is_pruned_weights, compressed_indices_mapping=self.compressed_indices)


# Register each test class with its op list; both use the same config set.
op_bench.generate_pt_tests_from_op_list(
    four_bit_rowwise_ops,
    full_configs,
    EmbedddingBag4BitRowwiseOffsetsTest,
)
op_bench.generate_pt_tests_from_op_list(
    byte_rowwise_ops,
    full_configs,
    EmbedddingBagByteRowwiseOffsetsTest,
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
W=[256, 512], device=['cpu'], tags=['long'] ) hardsigmoid_ops_list = op_bench.op_list( attr_names=['op_name', 'op_func'], attrs=[ ['Hardsigmoid', nn.Hardsigmoid], ], ) class HardsigmoidBenchmark(op_bench.TorchBenchmarkBase): def init(self, N, C, H, W, device, op_func): self.input_one = torch.rand(N, C, H, W, device=device) self.op_func = op_func() def forward(self): return self.op_func(self.input_one) op_bench.generate_pt_tests_from_op_list(hardsigmoid_ops_list, hardsigmoid_configs_short + hardsigmoid_configs_long, HardsigmoidBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
}, tags=["short"]) class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase): def init(self, in_one, in_two, dtype, device, op_func): self.in_one = torch.randn(in_one, device=device).to(dtype=dtype) self.in_two = torch.randn(in_two, device=device).to(dtype=dtype) self.op_func = op_func def forward(self): return self.op_func(self.in_one, self.in_two) op_bench.generate_pt_tests_from_op_list(binary_ops_bcast_list, binary_configs_broadcast, BinaryOpBcastBenchmark) # Benchmark ops performance without broadcast binary_ops_list = op_bench.op_list( attr_names=['op_name', 'op_func'], attrs=[ ['add', torch.add], ['copy_', lambda in1, in2: in1.copy_(in2)], ], ) binary_short_configs = op_bench.config_list( attr_names=['M', 'N', 'K'], attrs=[ [1, 1, 1],
attrs=[ ['PerChannelMinMaxObserver', obs.PerChannelMinMaxObserver], [ 'MovingAveragePerChannelMinMaxObserver', obs.MovingAveragePerChannelMinMaxObserver ], ]) class QObserverBenchmark(op_bench.TorchBenchmarkBase): def init(self, C, M, N, dtype, qscheme, op_func, device): self.f_input = torch.rand(C, M, N, device=device) self.op_func = op_func(dtype=dtype, qscheme=qscheme).to(device) def forward(self): return self.op_func(self.f_input) op_bench.generate_pt_tests_from_op_list( qobserver_per_tensor_list, qobserver_per_tensor_configs_short + qobserver_per_tensor_configs_long, QObserverBenchmark) op_bench.generate_pt_tests_from_op_list( qobserver_per_channel_list, qobserver_per_channel_configs_short + qobserver_per_channel_configs_long, QObserverBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
self.q_input_a = torch.quantize_per_tensor(f_input, scale=scale, zero_point=zero_point, dtype=dtype) if not contig: permute_dims = list(range(f_input.ndim))[::-1] self.q_input_a = self.q_input_a.permute(permute_dims) def forward(self): return getattr(self.qfunctional, self.qop)(self.q_input_a, self.q_input_b) class QFunctionalAddBenchmarkBase(_QFunctionalBinaryArithmeticBenchmarkBase): def init(self, N, dtype, contig, op_func): super(QFunctionalAddBenchmarkBase, self).setup(N, dtype, contig) self.qop = op_func if self.qop.endswith('_scalar'): self.q_input_b = 42 else: self.q_input_b = self.q_input_a op_bench.generate_pt_tests_from_op_list(qarithmetic_binary_ops, qarithmetic_binary_configs, QFunctionalAddBenchmarkBase) if __name__ == '__main__': op_bench.benchmark_runner.main()
q_input_a = torch.quantize_per_tensor(f_input, scale=scale, zero_point=zero_point, dtype=dtype) if other_scalar: q_input_b = 42 else: q_input_b = q_input_a.clone() if not contig: permute_dims = list(range(f_input.ndim))[::-1] q_input_a = q_input_a.permute(permute_dims) self.qop = op_func self.args = (q_input_a, q_input_b) self.kwargs = {} if out_variant: self.kwargs['out'] = torch.tensor([], dtype=torch.bool) def forward(self): return self.qop(*self.args, **self.kwargs) op_bench.generate_pt_tests_from_op_list(qcomparators_ops, qcomparators_configs, QComparatorBenchmark) if __name__ == '__main__': op_bench.benchmark_runner.main()
[16, 256, 28, 28], ], attr_names=['N', 'C', 'H', 'W'], tags=['long']) softmax_ops_list = op_bench.op_list( attr_names=['op_name', 'op_func'], attrs=[ ['Softmax', nn.Softmax], ['Softmax2d', nn.Softmax2d], ['LogSoftmax', nn.LogSoftmax], ], ) class SoftmaxBenchmark(op_bench.TorchBenchmarkBase): def init(self, N, C, H, W, op_func): self.input_one = torch.rand(N, C, H, W) self.op_func = op_func() def forward(self): return self.op_func(self.input_one) op_bench.generate_pt_tests_from_op_list( softmax_ops_list, softmax_configs_short + softmax_configs_long, SoftmaxBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
def init(self, M, N, dtype, contig, op_func): f_input = torch.rand(M, N) scale = 1.0 zero_point = 0 self.q_input = torch.quantize_per_tensor(f_input, scale=scale, zero_point=zero_point, dtype=dtype) if not contig: permute_dims = list(range(self.q_input.ndim))[::-1] self.q_input = self.q_input.permute(permute_dims) self.op_func = op_func class QMethodTensorInputBenchmark(_QMethodBenchmarkBase): def forward(self): getattr(self.q_input, self.op_func)(self.q_input) class QMethodNoInputBenchmark(_QMethodBenchmarkBase): def forward(self): getattr(self.q_input, self.op_func)() op_bench.generate_pt_tests_from_op_list( qmethods_tensor_input_list, qmethods_configs_short + qmethods_configs_long, QMethodTensorInputBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
N=[32, 64], K=[256, 512], device=['cpu', 'cuda'], dtype=[torch.int32, torch.float, torch.double], tags=['long']) class RemainderOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, device, dtype, op_func): self.dividend = torch.rand(M, N, K, device=device) self.dividend = (self.dividend * 1000 - 500).to(dtype=dtype) self.divisor = torch.rand(M, N, K, device=device) # +1 so we don't divide by zero self.divisor = (self.divisor * 40 + 1).to(dtype=dtype) self.inputs = {"dividend": self.dividend, "divisor": self.divisor} self.op_func = op_func def forward(self, dividend, divisor): return self.op_func(dividend, divisor) op_bench.generate_pt_tests_from_op_list( remainder_ops_list, remainder_short_configs + remainder_long_configs, RemainderOpBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
['sinh', torch.sinh], ['sqrt', torch.sqrt], ['sqrt_', torch.sqrt_], ['tan', torch.tan], ['tan_', torch.tan_], ['tanh', torch.tanh], ['tanh_', torch.tanh_], ['trunc', torch.trunc], ['trunc_', torch.trunc_], ['unique', torch.unique], ['zero_', torch.zero_], ['bernoulli_', lambda t: t.bernoulli_()], ['cauchy_', lambda t: t.cauchy_()], ['digamma_', lambda t: t.digamma_()], ['exponential_', lambda t: t.exponential_()], ['normal_', lambda t: t.normal_()], ['random_', lambda t: t.random_()], ['sign_', lambda t: t.sign_()], ['uniform_', lambda t: t.uniform_()], ['half', lambda t: t.half()], ['long', lambda t: t.long()], ], ) op_bench.generate_pt_tests_from_op_list( unary_ops_list, unary_ops_configs_short + unary_ops_configs_long, UnaryOpBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
class ReplaceNaNBenchmark(op_bench.TorchBenchmarkBase):
    """Run ``op_func`` on a random (M, N) tensor whose [0][0] element is NaN."""

    def init(self, M, N, dtype, replace_inf, op_func):
        input = torch.randn(M, N, dtype=dtype)
        # Plant one NaN so the input always contains a NaN.
        input[0][0] = float("nan")

        self.inputs = {
            "input": input,
            "replace_inf": replace_inf,
        }
        self.op_func = op_func
        self.set_module_name("nan_to_num")

    def forward(self, input, replace_inf: bool):
        # With replace_inf unset, posinf/neginf are passed explicitly as
        # +/-math.inf; with it set, only `nan=1.0` is passed.
        if not replace_inf:
            return self.op_func(input, nan=1.0, posinf=math.inf, neginf=-math.inf)
        return self.op_func(input, nan=1.0)


op_bench.generate_pt_tests_from_op_list(
    nan_to_num_ops_list,
    nan_to_num_long_configs + nan_to_num_short_configs,
    ReplaceNaNBenchmark,
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()