def test_constant_pad_nd_memory_format(self, device, dtype):
    # Test memory format is preserved in unambiguous cases
    for mf, ndim in (
            (torch.channels_last, 4),
            (torch.contiguous_format, 4),
            (torch.channels_last_3d, 5),
            (torch.contiguous_format, 5),
    ):
        a = torch.zeros([2] * ndim).to(memory_format=mf)
        res = refs.constant_pad_nd(a, pad=[1] * (2 * ndim))
        self.assertTrue(res.is_contiguous(memory_format=mf))

    # Ambiguous cases

    # is_channels_last_ and is_contiguous_, results in channels_last output
    a = torch.empty_strided((2, 1, 2, 2), stride=(4, 1, 2, 1))
    self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
    self.assertTrue(a.is_contiguous())
    actual = refs.constant_pad_nd(a, pad=[1] * 8)
    expect = torch.constant_pad_nd(a, pad=[1] * 8)
    self.assertEqual(actual.stride(), expect.stride())
    self.assertTrue(actual.is_contiguous(memory_format=torch.channels_last))

    # is_channels_last_contiguous_ but not is_channels_last_, results in
    # contiguous output
    a = torch.empty_strided((2, 1, 2, 2), stride=(4, 4, 2, 1))
    self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
    self.assertTrue(a.is_contiguous())
    actual = refs.constant_pad_nd(a, pad=[1] * 8)
    expect = torch.constant_pad_nd(a, pad=[1] * 8)
    self.assertEqual(actual.stride(), expect.stride())
    self.assertTrue(actual.is_contiguous())
def run_gemm(n, repeat=3, dtype=torch.float32):
    torch.random.manual_seed(10)
    device = torch.device("cuda:0")
    # Generate a random matrix and transpose it so it is in F-order (column-major)
    matrix = torch.randn((n, n), dtype=dtype)
    matrix = matrix.T
    # Allocate GPU buffers that reuse the column-major strides of the input
    gpu_in = torch.empty_strided((n, n), stride=matrix.stride(),
                                 dtype=matrix.dtype, device=device,
                                 requires_grad=False)
    gpu_out = torch.empty_strided((n, n), stride=matrix.stride(),
                                  dtype=matrix.dtype, device=device,
                                  requires_grad=False)
    gpu_in.copy_(matrix)
    torch.cuda.synchronize()

    # Time the GEMM a few times and keep the best run
    gpu_times = []
    for i in range(repeat):
        gpu_out.fill_(0.0)
        start_time = time.time()
        torch.mm(gpu_in, gpu_in, out=gpu_out)
        torch.cuda.synchronize()
        gpu_times.append(time.time() - start_time)
    gpu_time = min(gpu_times)
    flop = n**3 * 2
    flops = flop / gpu_time
    print(
        f"GEMM Exp. of size {n} - GPU time {gpu_time:.2f}s - GFlops {flops / 1e9:.2f}"
    )
def _allocate_torch_weightset(self, weightset="slow"):
    H = self.Hlr if weightset in {"slow"} else self.Hgt
    Hu8 = self._np_Hlru8 if weightset in {"slow"} else self._np_Hgtu8
    M = self.M
    N = self.Nm
    Ns = self.Ns

    W0shape, W0stride = (M, Ns, H), (Ns * Hu8, Hu8, 1)
    B0shape, B0stride = (M, H), (Hu8, 1)
    W1shape, W1stride = (Ns, H), (Hu8, 1)
    B1shape, B1stride = (Ns, ), (1, )

    W0 = torch.empty_strided(W0shape, W0stride, dtype=torch.float32, device="cpu")
    B0 = torch.empty_strided(B0shape, B0stride, dtype=torch.float32, device="cpu")
    W1 = torch.empty_strided(W1shape, W1stride, dtype=torch.float32, device="cpu")
    B1 = torch.empty_strided(B1shape, B1stride, dtype=torch.float32, device="cpu")

    W0s = W0.storage()
    B0s = B0.storage()
    W1s = W1.storage()
    B1s = B1.storage()

    # Round each storage size up to the next multiple of 8 and zero-fill it
    W0s.resize_((W0s.size() + 7) & ~7).fill_(0)
    B0s.resize_((B0s.size() + 7) & ~7).fill_(0)
    W1s.resize_((W1s.size() + 7) & ~7).fill_(0)
    B1s.resize_((B1s.size() + 7) & ~7).fill_(0)

    return W0, B0, W1, B1
def run_test(n, k, upper, unitriangular, transpose):
    triangle_function = torch.triu if upper else torch.tril
    A = make_tensor((n, n), dtype=dtype, device=device)
    A = triangle_function(A)
    A_sparse = A.to_sparse_csr()
    B = make_tensor((n, k), dtype=dtype, device=device)

    expected = torch.triangular_solve(B, A, upper=upper, unitriangular=unitriangular, transpose=transpose)
    expected_X = expected.solution

    actual = torch.triangular_solve(B, A_sparse, upper=upper, unitriangular=unitriangular, transpose=transpose)
    actual_X = actual.solution
    actual_A_clone = actual.cloned_coefficient
    self.assertTrue(actual_A_clone.numel() == 0)
    self.assertEqual(actual_X, expected_X)

    # test out with C contiguous strides
    out = torch.empty_strided((n, k), (k, 1), dtype=dtype, device=device)
    torch.triangular_solve(
        B, A_sparse,
        upper=upper, unitriangular=unitriangular, transpose=transpose,
        out=(out, actual_A_clone)
    )
    self.assertEqual(out, expected_X)

    # test out with F contiguous strides
    # TODO (@ivanyashchuk): mixed memory format doesn't work yet for cuda
    # out is F contiguous but B is C contiguous
    if self.device_type == 'cuda' and (n > 0 and k > 1):
        with self.assertRaisesRegex(RuntimeError, "INTERNAL ASSERT FAILED"):
            out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose,
                out=(out, actual_A_clone)
            )
    else:
        out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
        torch.triangular_solve(
            B, A_sparse,
            upper=upper, unitriangular=unitriangular, transpose=transpose,
            out=(out, actual_A_clone)
        )
        self.assertEqual(out, expected_X)
        self.assertEqual(out.stride(), (1, n))

    # test out with discontiguous strides
    out = torch.empty_strided((2 * n, k), (1, 2 * n), dtype=dtype, device=device)[::2]
    if n > 0 and k > 0:
        self.assertFalse(out.is_contiguous())
        self.assertFalse(out.t().is_contiguous())
    before_stride = out.stride()
    torch.triangular_solve(
        B, A_sparse,
        upper=upper, unitriangular=unitriangular, transpose=transpose,
        out=(out, actual_A_clone)
    )
    self.assertEqual(out, expected_X)
    self.assertEqual(out.stride(), before_stride)
def TensorMeta(
    tensorlike: Optional[Union[NumberType, torch.Tensor]] = None,
    *,
    shape: Optional[ShapeType] = None,
    strides: Optional[StrideType] = None,
    dtype: Optional[torch.dtype] = None,
    device: Optional[Union[torch.device, str]] = None,
):
    if isinstance(tensorlike, Number):
        assert not shape and (shape is None or isinstance(shape, Sequence))
        assert not strides and (strides is None or isinstance(strides, Sequence))
        inferred_shape: Tuple[int, ...] = ()
        inferred_strides: Tuple[int, ...] = ()
        inferred_dtype = type_to_dtype(type(tensorlike))
        inferred_device = torch.device("cpu")
        # TODO: This looks wrong, a number that is wrapped into a tensor
        # needs to behave differently than a scalar tensor for type
        # promotion purposes
    elif tensorlike is not None:
        assert isinstance(tensorlike, torch.Tensor)
        inferred_shape = tuple(tensorlike.shape)
        inferred_strides = tuple(tensorlike.stride())
        inferred_dtype = tensorlike.dtype
        inferred_device = tensorlike.device
    else:
        # If no tensorlike "example" is given then all metadata
        # must be provided explicitly
        assert shape is not None
        assert strides is not None
        assert dtype is not None
        assert device is not None

    shape = inferred_shape if shape is None else tuple(shape)
    strides = inferred_strides if strides is None else tuple(strides)
    dtype = inferred_dtype if dtype is None else dtype
    device = inferred_device if device is None else device

    if isinstance(device, str):
        device = torch.device(device)

    if isinstance(tensorlike, FakeTensor):
        mode = tensorlike.fake_mode
    else:
        mode = get_prim_fake_mode()

    if device.type == "meta":
        return torch.empty_strided(shape, strides, dtype=dtype, device="meta")
    else:
        return FakeTensor(
            mode,
            torch.empty_strided(shape, strides, dtype=dtype, device="meta"),
            device,
        )
def test_zero_stride(self):
    device = self.get_device()
    cpu_tensor = torch.empty_strided(size=(6, 1024, 512), stride=(0, 0, 0))
    assert cpu_tensor.storage().size() == 1
    ort_tensor_copied = cpu_tensor.to(device)
    assert torch.allclose(cpu_tensor, ort_tensor_copied.cpu())
    ort_tensor = torch.empty_strided(size=(6, 1024, 512), stride=(0, 0, 0), device=device)
    assert ort_tensor.is_ort
    assert ort_tensor.stride() == (0, 0, 0)
    cpu_tensor_copied = ort_tensor.cpu()
    assert cpu_tensor_copied.stride() == (0, 0, 0)
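# Not part of the ORT test above -- a minimal CPU-only sketch of the same
# zero-stride behaviour: with stride (0, 0, 0) every index maps to one
# storage element, so the storage holds a single value that all indices alias.
t = torch.empty_strided(size=(6, 1024, 512), stride=(0, 0, 0))
t[0, 0, 0] = 3.0                        # writes the single underlying element
assert t.storage().size() == 1
assert t[5, 1023, 511].item() == 3.0    # every other index aliases it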
def __new__(cls, elem, *args, **kwargs):
    assert type(elem) is not cls, \
        "Wrapping a CompositeCompliantTensor in a CompositeCompliantTensor is not supported"

    # The storage of CompositeCompliantTensor should never be used directly
    # by a Composite operation; if the Composite
    # operator attempts to read from the storage without dispatching then it'll
    # raise a RuntimeError due to it being a meta storage.
    r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
        cls, elem.size(),
        dtype=elem.dtype, layout=elem.layout,
        device=elem.device, requires_grad=elem.requires_grad,
        strides=elem.stride(), storage_offset=elem.storage_offset())

    if elem.requires_grad:
        # CompositeCompliantTensor steals the "requires_grad"-ness.
        # Why a new copy of `elem`? Because sometimes OpInfo shares inputs between tests...
        tmp = torch.empty_strided(elem.shape, elem.stride(), dtype=elem.dtype,
                                  device=elem.device, layout=elem.layout,
                                  requires_grad=False)
        tmp.copy_(elem.detach())
        r.elem = tmp
    else:
        r.elem = elem

    assert r.stride() == r.elem.stride()

    # Propagate conjugate bits to the wrapper tensor
    # Ref: https://github.com/albanD/subclass_zoo/issues/24
    # Ref: https://github.com/albanD/subclass_zoo/issues/21
    torch._C._set_conj(r, r.elem.is_conj())
    torch._C._set_neg(r, r.elem.is_neg())
    return r
def cutlass_matmul(a, b):
    if _cutlass is None:
        raise RuntimeError("Cannot find cutlass library")
    M, N = a.shape[0], b.shape[1]
    # allocate a column-major (Fortran-order) output buffer
    c = torch.empty_strided((M, N), (1, M), dtype=a.dtype, device=a.device)
    _cutlass.matmul(a, b, c)
    return c
def run_test(n, k, upper, unitriangular, transpose):
    triangle_function = torch.triu if upper else torch.tril
    A = make_tensor((n, n), dtype=dtype, device=device)
    A = triangle_function(A)
    A_sparse = A.to_sparse_csr()
    B = make_tensor((n, k), dtype=dtype, device=device)

    expected = torch.triangular_solve(B, A, upper=upper, unitriangular=unitriangular, transpose=transpose)
    expected_X = expected.solution

    actual = torch.triangular_solve(B, A_sparse, upper=upper, unitriangular=unitriangular, transpose=transpose)
    actual_X = actual.solution
    actual_A_clone = actual.cloned_coefficient
    self.assertTrue(actual_A_clone.numel() == 0)
    self.assertEqual(actual_X, expected_X)

    # test out with C contiguous strides
    out = torch.empty_strided((n, k), (k, 1), dtype=dtype, device=device)
    torch.triangular_solve(
        B, A_sparse,
        upper=upper, unitriangular=unitriangular, transpose=transpose,
        out=(out, actual_A_clone)
    )
    self.assertEqual(out, expected_X)

    # test out with F contiguous strides
    out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
    torch.triangular_solve(
        B, A_sparse,
        upper=upper, unitriangular=unitriangular, transpose=transpose,
        out=(out, actual_A_clone)
    )
    self.assertEqual(out, expected_X)
    self.assertEqual(out.stride(), (1, n))

    # test out with discontiguous strides
    out = torch.empty_strided((2 * n, k), (1, 2 * n), dtype=dtype, device=device)[::2]
    if n > 0 and k > 0:
        self.assertFalse(out.is_contiguous())
        self.assertFalse(out.t().is_contiguous())
    before_stride = out.stride()
    torch.triangular_solve(
        B, A_sparse,
        upper=upper, unitriangular=unitriangular, transpose=transpose,
        out=(out, actual_A_clone)
    )
    self.assertEqual(out, expected_X)
    self.assertEqual(out.stride(), before_stride)
def init_inputs(N, C, H, W, in_order, out_order, dtype):
    shape = (N, C, H, W)
    stride_x = torch_blocksparse._permute.strides(N, C, H, W, in_order)
    stride_y = torch_blocksparse._permute.strides(N, C, H, W, out_order)
    x = torch.rand(N * C * H * W, requires_grad=True).as_strided(shape, stride_x).cuda().type(dtype)
    y = torch.empty_strided(shape, stride_y, device=x.device, dtype=dtype)
    return x, y
def backward(ctx, dy):
    # load from context
    x, y = ctx.saved_tensors
    # get kernel
    if x.dtype not in _relu.bwd_kernel:
        defines = {'TYPE': x.dtype, 'TN': [128]}
        _relu.bwd_kernel[x.dtype] = triton.kernel(_relu.bwd_src, defines=defines, num_warps=[4])
    kernel = _relu.bwd_kernel[x.dtype]
    # allocate output
    dx = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
    dres = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
    dscale = torch.zeros((1,), device=dy.device, dtype=torch.float32)
    dbias = torch.zeros_like(dscale)
    # launch kernel
    N = x.numel()
    grid = lambda opt: [triton.cdiv(N, opt.d('TN'))]
    kernel(x, y, ctx.scale.item(), dx, dy, dscale, dbias, dres, N, grid=grid)
    return dx, dscale.type(x.dtype), dbias.type(x.dtype), dres
def unwrap(t):
    # TODO: doesn't setup aliasing relation on views correctly
    if isinstance(t, TensorMeta):
        return torch.empty_strided(t.shape, t.stride(), dtype=t.dtype, device="meta")
    else:
        return t
def make_tensor_from_type(inp_type: torch._C.TensorType):
    size = inp_type.sizes()
    stride = inp_type.strides()
    device = inp_type.device()
    dtype = inp_type.dtype()
    assert size is not None
    assert stride is not None
    assert device is not None
    assert dtype is not None
    return torch.empty_strided(size=size, stride=stride, device=device, dtype=dtype)
def forward(ctx, x, scale, bias, res):
    if x.dtype not in _relu.fwd_kernel:
        defines = {'TYPE': x.dtype, 'TN': [128]}
        _relu.fwd_kernel[x.dtype] = triton.kernel(_relu.fwd_src, defines=defines, num_warps=[4])
    kernel = _relu.fwd_kernel[x.dtype]
    # launch kernel
    y = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
    N = x.numel()
    grid = lambda opt: [triton.cdiv(N, opt.d('TN'))]
    kernel(x, y, scale.item(), bias.item(), res, N, grid=grid)
    # update context
    ctx.save_for_backward(x, y)
    ctx.scale = scale
    return y
def tensor_creation_ops(self):
    i = torch.tensor([[0, 1, 1], [2, 0, 2]])
    v = torch.tensor([3, 4, 5], dtype=torch.float32)
    real = torch.tensor([1, 2], dtype=torch.float32)
    imag = torch.tensor([3, 4], dtype=torch.float32)
    inp = torch.tensor([-1.5, 0.0, 2.0])
    values = torch.tensor([0.5])
    quantized = torch.quantize_per_channel(
        torch.tensor([[-1.0, 0.0], [1.0, 2.0]]),
        torch.tensor([0.1, 0.01]),
        torch.tensor([10, 0]),
        0,
        torch.quint8,
    )
    return (
        torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]),
        # torch.sparse_coo_tensor(i, v, [2, 3]),  # doesn't work for iOS
        torch.as_tensor([1, 2, 3]),
        torch.as_strided(torch.randn(3, 3), (2, 2), (1, 2)),
        torch.zeros(2, 3),
        torch.zeros((2, 3)),
        torch.zeros([2, 3], out=i),
        torch.zeros(5),
        torch.zeros_like(torch.empty(2, 3)),
        torch.ones(2, 3),
        torch.ones((2, 3)),
        torch.ones([2, 3]),
        torch.ones(5),
        torch.ones_like(torch.empty(2, 3)),
        torch.arange(5),
        torch.arange(1, 4),
        torch.arange(1, 2.5, 0.5),
        torch.range(1, 4),
        torch.range(1, 4, 0.5),
        torch.linspace(3.0, 3.0, steps=1),
        torch.logspace(start=2, end=2, steps=1, base=2.0),
        torch.eye(3),
        torch.empty(2, 3),
        torch.empty_like(torch.empty(2, 3), dtype=torch.int64),
        torch.empty_strided((2, 3), (1, 2)),
        torch.full((2, 3), 3.141592),
        torch.full_like(torch.full((2, 3), 3.141592), 2.71828),
        torch.quantize_per_tensor(
            torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8
        ),
        torch.dequantize(quantized),
        torch.complex(real, imag),
        torch.polar(real, imag),
        torch.heaviside(inp, values),
    )
def _new_strided_tensor(size: Tuple[int],
                        stride: Tuple[int],
                        dtype: torch.dtype,
                        device: Union[str, torch.device],
                        pin_memory: bool) -> torch.Tensor:
    # Pinned memory only makes sense for CPU tensors
    if isinstance(device, torch.device):
        pin_memory &= device.type == 'cpu'
    else:
        pin_memory &= device.lower() == 'cpu'

    return torch.empty_strided(size=size, stride=stride,
                               dtype=dtype, device=device,
                               requires_grad=False,
                               pin_memory=pin_memory)
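# Hypothetical usage sketch for the helper above (shapes and strides are
# illustrative, not from the original code): allocate a pinned, column-major
# CPU buffer suitable for asynchronous host-to-device copies. Note that
# allocating pinned memory requires a CUDA-capable build/machine.
buf = _new_strided_tensor(size=(128, 64), stride=(1, 128),
                          dtype=torch.float32, device='cpu', pin_memory=True)
assert buf.stride() == (1, 128)   # Fortran-order layout is preserved
assert buf.is_pinned()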
def cutlass_matmul(a, b):
    if _cutlass is None:
        raise RuntimeError("Cannot find cutlass library")
    M, N = a.shape[0], b.shape[1]
    Ka, Kb = a.shape[1], b.shape[0]
    assert Ka == Kb
    assert a.dtype == b.dtype
    assert a.device == b.device
    # allocate output
    c = torch.empty_strided((M, N), (1, M), dtype=a.dtype, device=a.device)
    # run function
    dtype = str(a.dtype).split('.')[-1]
    _cutlass.matmul(a.data_ptr(), b.data_ptr(), c.data_ptr(),
                    M, N, Ka,
                    a.stride(0), a.stride(1),
                    b.stride(0), b.stride(1),
                    c.stride(0), c.stride(1),
                    dtype, dtype, dtype,
                    a.device.index, torch.cuda.current_stream(a.device).cuda_stream)
    return c
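# Illustrative check, not from the original source: the (1, M) strides make the
# output column-major (Fortran order), which is the layout this GEMM writes
# into. Assumes the `_cutlass` extension is available and a CUDA device exists.
a = torch.randn(64, 32, device='cuda')
b = torch.randn(32, 16, device='cuda')
c = cutlass_matmul(a, b)
assert c.stride() == (1, 64)                 # column-major output buffer
assert torch.allclose(c, a @ b, atol=1e-4)   # matches the reference matmul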
def forward(ctx, x, running_mean, running_var, gamma, beta, training, momentum, eps):
    N, C, H, W = x.shape
    # lazy compilation of kernel
    key = (training, x.dtype)
    if key not in _batchnorm.fwd_kernel:
        defines = {'TM': 256, 'TYPE': x.dtype}
        if training:
            defines['TRAINING'] = True
        _batchnorm.fwd_kernel[key] = triton.kernel(_batchnorm.fwd_src, defines=defines, num_warps=[4])
    kernel = _batchnorm.fwd_kernel[key]
    # allocate outputs
    y = torch.empty_strided(x.shape, x.stride(), layout=x.layout, dtype=x.dtype, device=x.device)
    mean = torch.empty(C, dtype=torch.float32, device=x.device)
    var = torch.empty(C, dtype=torch.float32, device=x.device)
    # execute kernels
    grid = lambda opt: [C]
    kernel(y, mean, var, running_mean, running_var,
           x, gamma, beta, H * W * N, momentum, eps,
           grid=grid)
    # save
    ctx.save_for_backward(x, gamma, beta, mean, var)
    ctx.eps = eps
    return y
def backward(ctx, dy):
    # lazy compilation of kernel
    key = (dy.dtype, )
    if key not in _batchnorm.bwd_kernel:
        _batchnorm.bwd_kernel[key] = triton.kernel(_batchnorm.bwd_src,
                                                   defines={'TM': 256, 'TYPE': dy.dtype},
                                                   num_warps=[4])
    kernel = _batchnorm.bwd_kernel[key]
    # retrieve info
    x, gamma, beta, mean, var = ctx.saved_tensors
    eps = ctx.eps
    # allocate result
    dx = torch.empty_strided(x.shape, x.stride(), layout=x.layout, dtype=x.dtype, device=x.device)
    dgamma = torch.empty_like(gamma)
    dbeta = torch.empty_like(beta)
    # execute
    N, C, H, W = x.shape
    kernel(dx, dgamma, dbeta, dy,
           x, gamma, mean, var,
           H * W * N, eps,
           grid=lambda opt: [C])
    return dx, None, None, dgamma, dbeta, None, None, None
def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad):
    return torch.empty_strided(size, stride, dtype=dtype, device='meta', requires_grad=requires_grad)
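# Hedged sketch (illustrative values, not from the original code) of what this
# rebuild hook produces: a storage-less tensor on the 'meta' device that
# carries only shape, stride, dtype, and requires_grad metadata.
t = _rebuild_meta_tensor_no_storage(torch.float32, (2, 3), (3, 1), False)
assert t.device.type == 'meta'
assert t.shape == (2, 3) and t.stride() == (3, 1)
assert not t.requires_grad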
# torch.logspace
torch.logspace(start=-10, end=10, steps=5)
torch.logspace(start=0.1, end=1.0, steps=5)
torch.logspace(start=0.1, end=1.0, steps=1)
torch.logspace(start=2, end=2, steps=1, base=2)

# torch.eye
torch.eye(3)

# torch.empty/empty_like/empty_strided
torch.empty(2, 3)
torch.empty((2, 3))
torch.empty([2, 3])
torch.empty_like(torch.empty(2, 3), dtype=torch.int64)
torch.empty_strided((2, 3), (1, 2))

# torch.full/full_like
torch.full((2, 3), 3.141592)
torch.full_like(torch.full((2, 3), 3.141592), 2.71828)

# torch.quantize_per_tensor
torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8)

# torch.quantize_per_channel
x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]])
quant = torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8)

# torch.dequantize
def run(n, repeat=3, compare_results=True, dtype=torch.float32, fn=cuda_lauum_lower, lower=True):
    torch.random.manual_seed(10)
    device = torch.device("cuda:0")

    # Generate random matrix
    matrix = torch.randn((n, n), dtype=dtype)
    # Fill 'ignored' side of the matrix with zeros.
    # matrix = torch.tril(matrix)
    # Make it in F-order
    matrix = matrix.T

    # Create GPU buffers for input and output matrices
    gpu_in = torch.empty_strided((n, n), stride=matrix.stride(),
                                 dtype=matrix.dtype, device=device,
                                 requires_grad=False)
    gpu_out = torch.empty_strided((n, n), stride=matrix.stride(),
                                  dtype=matrix.dtype, device=device,
                                  requires_grad=False)
    # Copy matrix to the GPU
    gpu_in.copy_(matrix)
    torch.cuda.synchronize(device)

    # Run on the CPU
    if compare_results:
        print("\tRunning CPU Exp...", flush=True)
        # Generate the expected output using LAPACK
        cpu_times = []
        for i in range(repeat):
            start_time = time.time()
            expected = scll.dlauum(matrix.numpy(), lower=lower, overwrite_c=False)[0]
            cpu_times.append(time.time() - start_time)
        cpu_time = min(cpu_times)
    else:
        cpu_time = 0

    # Run on the GPU
    gpu_times = []
    for i in range(repeat):
        gpu_out.fill_(0.0)
        start_time = time.time()
        fn(gpu_in.shape[0], gpu_in, gpu_in.stride(1), gpu_out, gpu_out.stride(1))
        torch.cuda.synchronize(device)
        gpu_times.append(time.time() - start_time)
    gpu_time = min(gpu_times)
    flop = (2 * n * (n + 1) * (n + 2)) / 6
    flops = flop / gpu_time

    if False:
        with np.printoptions(precision=3, linewidth=160):
            print("INPUT")
            print(matrix)
            print("EXPECTED")
            print(torch.from_numpy(expected))
            print("ACTUAL")
            print(gpu_out)

    # Compare outputs and print timing info
    if compare_results:
        if lower:
            np.testing.assert_allclose(np.tril(expected), gpu_out.cpu().numpy())
        else:
            v_cpu = np.triu(expected)
            v_gpu = np.triu(gpu_out.cpu().numpy())
            diff = np.abs(v_cpu - v_gpu)
            if False:
                with np.printoptions(precision=1, linewidth=160):
                    print(diff)
            np.testing.assert_allclose(v_cpu, v_gpu)
    print(
        f"Exp. of size {n} - CPU time {cpu_time:.2f}s - GPU time {gpu_time:.2f}s ({fn.__name__}) - GFlops {flops/1e9:.2f}"
    )
def do_work(x, in_order, out_order):
    x_inner_mul = _permute.multiple_of(x.shape['NCHW'.index(in_order[-1])])
    y_inner_mul = _permute.multiple_of(x.shape['NCHW'.index(out_order[-1])])
    key = (x.dtype, in_order, out_order, x_inner_mul, y_inner_mul)
    if key not in _permute.kernels:
        TN = [32] if in_order[-1] == 'N' or out_order[-1] == 'N' else 1
        TC = [32] if in_order[-1] == 'C' or out_order[-1] == 'C' else 1
        THW = [32] if in_order[-1] == 'W' or out_order[-1] == 'W' else 1
        defines = {
            'NAME': f'permute_{in_order}_{out_order}_{x_inner_mul}_{y_inner_mul}',
            'TYPE': x.dtype,
            # stride multiple for X
            'M_STRIDE_XN': 1 if in_order[-1] == 'N' else x_inner_mul,
            'M_STRIDE_XC': 1 if in_order[-1] == 'N' else x_inner_mul,
            'M_STRIDE_XHW': 1 if in_order[-1] == 'N' else x_inner_mul,
            # stride multiple for Y
            'M_STRIDE_YN': 1 if out_order[-1] == 'N' else y_inner_mul,
            'M_STRIDE_YC': 1 if out_order[-1] == 'N' else y_inner_mul,
            'M_STRIDE_YHW': 1 if out_order[-1] == 'N' else y_inner_mul,
            # strides for X
            'STRIDE_XN': 1 if in_order[-1] == 'N' else 'stride_xn',
            'STRIDE_XC': 1 if in_order[-1] == 'C' else 'stride_xc',
            'STRIDE_XHW': 1 if in_order[-1] == 'W' else 'stride_xhw',
            # strides for Y
            'STRIDE_YN': 1 if out_order[-1] == 'N' else 'stride_yn',
            'STRIDE_YC': 1 if out_order[-1] == 'C' else 'stride_yc',
            'STRIDE_YHW': 1 if out_order[-1] == 'W' else 'stride_yhw',
            # tile parameters
            'TN': TN,
            'TC': TC,
            'THW': THW
        }
        _permute.kernels[key] = triton.kernel(src, defines=defines, num_warps=[4])
    kernel = _permute.kernels[key]
    N, C, H, W = x.shape
    y = torch.empty_strided(x.shape, _permute.strides(N, C, H, W, out_order),
                            device=x.device, dtype=x.dtype)
    stride_xn, stride_xc, _, stride_xhw = x.stride()
    stride_yn, stride_yc, _, stride_yhw = y.stride()
    grid = lambda opt: (triton.cdiv(N, opt.d('TN')),
                        triton.cdiv(C, opt.d('TC')),
                        triton.cdiv(H * W, opt.d('THW')))
    kernel(x, y, N, C, H * W,
           stride_xn, stride_xc, stride_xhw,
           stride_yn, stride_yc, stride_yhw,
           grid=grid)
    return y
reveal_type(torch.linspace(start=-10, end=10, steps=1))  # E: {Tensor}

# torch.logspace
reveal_type(torch.logspace(start=-10, end=10, steps=5))  # E: {Tensor}
reveal_type(torch.logspace(start=0.1, end=1.0, steps=5))  # E: {Tensor}
reveal_type(torch.logspace(start=0.1, end=1.0, steps=1))  # E: {Tensor}
reveal_type(torch.logspace(start=2, end=2, steps=1, base=2))  # E: {Tensor}

# torch.eye
reveal_type(torch.eye(3))  # E: {Tensor}

# torch.empty/empty_like/empty_strided
reveal_type(torch.empty(2, 3))  # E: {Tensor}
reveal_type(torch.empty_like(torch.empty(2, 3), dtype=torch.int64))  # E: {Tensor}
reveal_type(torch.empty_strided((2, 3), (1, 2)))  # E: {Tensor}

# torch.full/full_like
reveal_type(torch.full((2, 3), 3.141592))  # E: {Tensor}
reveal_type(torch.full_like(torch.full((2, 3), 3.141592), 2.71828))  # E: {Tensor}

# torch.quantize_per_tensor
reveal_type(
    torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8))  # E: {Tensor}

# torch.quantize_per_channel
x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]])
quant = torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8)