def sample_inputs_generator():
    for sample_input in sample_inputs_func(device, dtype):
        mask = sample_input.kwargs.get('mask')
        if mask is None:
            yield sample_input
        else:
            if layout == sample_input.input.layout:
                yield sample_input
            if layout != torch.strided:
                sample_input_kwargs = sample_input.kwargs.copy()
                sample_input_kwargs.update(mask=mask.to_dense())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
            if layout != torch.sparse_coo and op.supports_sparse:
                sample_input_kwargs = sample_input.kwargs.copy()
                sample_input_kwargs.update(mask=mask.to_sparse())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
            if layout != torch.sparse_csr and op.supports_sparse_csr and sample_input.input.ndim == 2:
                sample_input_kwargs = sample_input.kwargs.copy()
                sample_input_kwargs.update(mask=mask.to_sparse_csr())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
def sample_inputs_index_put(op_info, device, dtype, requires_grad, **kwargs):
    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
    make_idx = partial(make_tensor, dtype=torch.long, device=device, requires_grad=False)
    S = 5
    inputs = []
    for accumulate in [False, True]:
        # putting vectors at indexed locations
        inputs.append(SampleInput(
            make_arg((S, S)),
            args=((make_idx((2,), low=0, high=4),), make_arg((2, S))),
            kwargs=dict(accumulate=accumulate)))

        # putting multi-dim tensors at indexed locations
        inputs.append(SampleInput(
            make_arg((S, S, 2)),
            args=((make_idx((3,), low=0, high=4),), make_arg((3, S, 2))),
            kwargs=dict(accumulate=accumulate)))

        # value with size `0` dim
        inputs.append(SampleInput(
            make_arg((S, 0)),
            args=((make_idx((3,), low=0, high=4),), make_arg((3, 0))),
            kwargs=dict(accumulate=accumulate)))

        # scalar value
        inputs.append(SampleInput(
            make_arg((S,)),
            args=((make_idx((), low=0, high=S),), make_arg(())),
            kwargs=dict(accumulate=accumulate)))

        # cuda and accumulate don't work well
        # Reference: https://github.com/pytorch/pytorch/issues/72053
        if not accumulate and device == 'cuda':
            # Broadcast `values`
            inputs.append(SampleInput(
                make_arg((S, S)),
                args=((make_idx((2,), low=0, high=S),), make_arg((S,))),
                kwargs=dict(accumulate=accumulate)))

    return inputs
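# Usage sketch (an illustration, not part of the original suite): every SampleInput
# produced above maps onto torch.Tensor.index_put_ as
# input.index_put_(indices, values, accumulate=...). Assumes torch, partial,
# make_tensor, and SampleInput are in scope as in the generators here.
def _demo_index_put(device='cpu', dtype=torch.float32):
    for sample in sample_inputs_index_put(None, device, dtype, requires_grad=False):
        indices, values = sample.args
        out = sample.input.clone().index_put_(indices, values, **sample.kwargs)
        assert out.shape == sample.input.shape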
def test_unsupported_expand_weights(self, device, dtype, op):
    sample_inputs = op.sample_inputs(device, dtype, requires_grad=True)
    unsupported_inputs = supported_inputs(op, sample_inputs, supported_inputs=False)
    for sample_input in unsupported_inputs:
        with self.assertRaisesRegex(RuntimeError, r"Expanded Weights"):
            if op.name == "nn.functional.embedding":  # embedding flips its argument order for autograd tests
                sample_input = SampleInput(sample_input.args[0],
                                           args=(sample_input.input,),
                                           kwargs=sample_input.kwargs)
            input = sample_input.input
            batch_size = input.shape[0] if len(input.shape) > 1 else 1

            # get per sample grads with ExpandedWeights objects
            (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size)
            result = run_op(op, ew_input, *ew_args, **ew_kwargs)
            diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values())
            diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)]
            diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list]
            result.sum().backward()  # grad doesn't work with ExpandedWeight because it calls __torch_function__
def sample_inputs_aten_index_put(op_info, device, dtype, requires_grad, **kwargs):
    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
    inputs = []
    adv_idx = torch.LongTensor([[0, 1], [2, 3]])
    # self_shape, indices
    additional = [
        ((5, 6, 7, 8), [None, adv_idx, adv_idx, None]),
        ((5, 6, 7, 8), [None, adv_idx, None, adv_idx]),
        ((5, 6, 7, 8), [adv_idx, None, None, adv_idx]),
        ((5, 6, 7, 8), [None, None, adv_idx, adv_idx]),
        ((5, 6, 7, 8, 9), [None, None, adv_idx, None, adv_idx]),
        ((5, 6, 7, 8, 9), [None, None, adv_idx, adv_idx, None]),
        ((5, 6, 7, 8, 9, 10), [None, None, None, adv_idx, adv_idx]),
        ((5, 6, 7, 8, 9, 10), [None, None, adv_idx, adv_idx, adv_idx]),
    ]
    for self_shape, indices in additional:
        for broadcast_value in [False, True]:
            inp = make_arg(self_shape)

            tmp_indices = [slice(None) if idx is None else idx for idx in indices]
            values_shape = inp[tmp_indices].shape
            if broadcast_value:
                values_shape = values_shape[3:]
            values = make_arg(values_shape)
            inputs.append(SampleInput(inp, args=(tuple(indices), values)))
    return inputs
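# Replay sketch (illustrative only): in these samples a None entry in `indices`
# means "this dimension is not advanced-indexed", which is exactly what the
# generator encodes by substituting slice(None) when computing the values shape.
# Under that assumption, a sample can be replayed with plain indexed assignment,
# which also broadcasts `values` for the broadcast_value cases.
def _replay_aten_index_put(sample):
    indices, values = sample.args
    out = sample.input.clone()
    out[tuple(slice(None) if idx is None else idx for idx in indices)] = values
    return out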
def sample_inputs_getitem(op_info, device, dtype, requires_grad, **kwargs):
    S = 5
    test_args = [
        ([1, 2],),
        (slice(0, 3),),
        ([slice(0, 3), 1],),
        ([[0, 2, 3], [1, 3, 3], [0, 0, 2]],),
        ([[0, 0, 3], [1, 1, 3], [0, 0, 2]],),
        ([slice(None), slice(None), [0, 3]],),
        ([slice(None), [0, 3], slice(None)],),
        ([[0, 3], slice(None), slice(None)],),
        ([[0, 3], [1, 2], slice(None)],),
        ([[0, 3]],),
        ([[0, 3], slice(None)],),
        ([[0, 3], Ellipsis],),
        ([[0, 2, 3], [1, 3, 3], torch.LongTensor([0, 0, 2])],),
    ]

    return tuple(
        SampleInput(
            make_tensor((S, S, S), device=device, dtype=dtype,
                        low=None, high=None, requires_grad=requires_grad),
            args=args)
        for args in test_args)
def _generate_sample_data(device="cpu", dtype=torch.float, requires_grad=True, layout=torch.strided):
    assert layout in {
        torch.strided,
        torch.sparse_coo,
        torch.sparse_csr,
    }, "Layout must be strided/sparse_coo/sparse_csr"
    shapes = [
        [],
        [2],
        [3, 5],
        [3, 2, 1, 2],
    ]

    inputs = []
    for s in shapes:
        data = make_tensor(s, device=device, dtype=dtype, requires_grad=requires_grad)  # type: ignore[arg-type]
        mask = _create_random_mask(s, device)
        if layout == torch.sparse_coo:
            mask = mask.to_sparse_coo().coalesce()
            data = data.sparse_mask(mask).requires_grad_(requires_grad)
        elif layout == torch.sparse_csr:
            if data.ndim != 2 and mask.ndim != 2:
                continue
            mask = mask.to_sparse_csr()
            data = data.sparse_mask(mask)
        inputs.append(SampleInput(data, kwargs={"mask": mask}))
    return inputs
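# Consumption sketch (illustrative only): iterate the generated (data, mask) pairs
# and confirm the mask layout and shape track the requested layout. Assumes torch,
# make_tensor, SampleInput, and _create_random_mask are in scope as above.
def _demo_generate_sample_data():
    for sample in _generate_sample_data(layout=torch.sparse_coo):
        mask = sample.kwargs["mask"]
        assert mask.layout == torch.sparse_coo
        assert sample.input.shape == mask.shape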
def test_expanded_weight_per_sample_grad(self, device, dtype, op):
    sample_inputs = op.sample_inputs(device, dtype, requires_grad=True)
    for sample_input in supported_inputs(op, sample_inputs):
        if op.name == "nn.functional.embedding":  # embedding flips its argument order for autograd tests
            sample_input = SampleInput(sample_input.args[0],
                                       args=(sample_input.input,),
                                       kwargs=sample_input.kwargs)
        input = sample_input.input
        args = sample_input.args
        kwargs = sample_input.kwargs
        batch_size = input.shape[0] if len(input.shape) > 1 else 1

        # get per sample grads with ExpandedWeights objects
        (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size)
        diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values())
        diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)]
        diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list]
        if not diff_input_list:
            continue
        result = run_op(op, ew_input, *ew_args, **ew_kwargs)
        result.sum().backward()  # grad doesn't work with ExpandedWeight because it calls __torch_function__
        expanded_weight_grad = tuple(i.grad_sample if hasattr(i, "grad_sample") else i.grad
                                     for i in diff_input_list)

        # get per sample grads with for loop
        func = partial(run_op, op)
        per_sample_grad = for_loop_per_sample_grad(batch_size, input, func, *args, **kwargs)

        # check equality
        self.assertEqual(len(per_sample_grad), len(expanded_weight_grad))
        for (result_grad, expected_grad) in zip(expanded_weight_grad, per_sample_grad):
            if result_grad is None:
                result_grad = torch.zeros_like(expected_grad)
            self.assertEqual(result_grad, expected_grad)
def sample_inputs_new_zeros_with_same_feature_meta(op_info, device, dtype, requires_grad, **kwargs):
    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
    matrix = [
        # tangent, base, num_tangent_bdims
        ([5], [2, 3], 0),
        ([2, 3], [2, 3], 0),
        ([5], [2], 0),
        ([1, 0, 2], [1, 2], 0),
        ([], [1, 2], 0),
        ([8, 7, 5], [2, 3, 11], 1),
        ([6, 7, 5], [2, 3, 4], 2),
        ([6, 4], [3], 2),
    ]
    results = []
    for tangent_shape, base_shape, num_tangent_bdims in matrix:
        tangent = make_arg(tangent_shape)
        base = make_arg(base_shape)
        results.append(SampleInput(
            tangent,
            args=(base,),
            kwargs=dict(self_num_batch_dims=num_tangent_bdims)))
    return results
def sample_inputs_conv2d(has_bias, self, device, dtype, requires_grad, extra_args=(), groups=1):
    in_ch, out_ch = 6, 4
    inp = make_tensor((2, in_ch * groups, 7, 5), device=device, dtype=dtype,
                      requires_grad=requires_grad, low=-1, high=1)
    weight = make_tensor((out_ch * groups, in_ch, 3, 2), device=device, dtype=dtype,
                         requires_grad=requires_grad, low=-1, high=1)
    bias = None
    if has_bias:
        bias = make_tensor((out_ch * groups,), device=device, dtype=dtype,
                           requires_grad=requires_grad, low=-1, high=1)
    return [SampleInput(inp, args=((weight, bias) + extra_args))]
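# Usage sketch (illustrative only): with the default extra_args and groups=1, the
# sample produced above feeds torch.nn.functional.conv2d as conv2d(input, weight, bias);
# the 3x2 kernel over a 7x5 input yields a (2, 4, 5, 4) result.
import torch.nn.functional as F

def _demo_conv2d(device='cpu', dtype=torch.float32):
    (sample,) = sample_inputs_conv2d(True, None, device, dtype, requires_grad=False)
    weight, bias = sample.args
    out = F.conv2d(sample.input, weight, bias)
    assert out.shape == (2, 4, 5, 4)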
def sample_inputs_masked_fill(op_info, device, dtype, requires_grad, **kwargs):
    S = 3
    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)

    yield SampleInput(make_arg((S, S)), args=(torch.randn(S, S, device=device) > 0, 10))
    yield SampleInput(make_arg((S, S)), args=(torch.randn(S, device=device) > 0, 10))
    yield SampleInput(make_arg(()), args=(torch.randn((), device=device) > 0, 10))
    yield SampleInput(make_arg((S, S)), args=(torch.randn((), device=device) > 0, 10))
    yield SampleInput(make_arg((S,)),
                      args=(torch.randn(S, S, device=device) > 0, 10),
                      broadcasts_input=True)
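# Usage sketch (illustrative only): each sample maps onto Tensor.masked_fill as
# input.masked_fill(mask, value); the mask and input broadcast against each other,
# which is why the last sample above is tagged broadcasts_input=True.
def _demo_masked_fill(device='cpu', dtype=torch.float32):
    for sample in sample_inputs_masked_fill(None, device, dtype, requires_grad=False):
        mask, value = sample.args
        out = sample.input.masked_fill(mask, value)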
def sample_inputs_getitem(op_info, device, dtype, requires_grad, **kwargs):
    # Short for "advanced index"
    adv_idx = torch.LongTensor([[0, 1], [2, 3]])
    S = 5
    # self_dim, indices
    test_args = [
        (3, ([1, 2],)),
        (3, (slice(0, 3),)),
        (3, ([slice(0, 3), 1],)),
        (3, ([[0, 2, 3], [1, 3, 3], [0, 0, 2]],)),
        (3, ([[0, 0, 3], [1, 1, 3], [0, 0, 2]],)),
        (3, ([slice(None), slice(None), [0, 3]],)),
        (3, ([slice(None), [0, 3], slice(None)],)),
        (3, ([[0, 3], slice(None), slice(None)],)),
        (3, ([[0, 3], [1, 2], slice(None)],)),
        (3, ([[0, 3]],)),
        (3, ([[0, 3], slice(None)],)),
        (3, ([[0, 3], Ellipsis],)),
        (3, ([[0, 2, 3], [1, 3, 3], torch.LongTensor([0, 0, 2])],)),
        (4, ([slice(None), adv_idx, adv_idx, slice(None)],)),
        (4, ([slice(None), adv_idx, slice(None), adv_idx],)),
        (4, ([adv_idx, slice(None), slice(None), adv_idx],)),
        (4, ([slice(None), slice(None), adv_idx, adv_idx],)),
        (4, ([Ellipsis, adv_idx, adv_idx],)),
        (5, ([slice(None), slice(None), adv_idx, slice(None), adv_idx],)),
        (5, ([slice(None), slice(None), adv_idx, adv_idx, slice(None)],)),
        (5, ([slice(None), slice(None), adv_idx, None, adv_idx, slice(None)],)),
        (6, ([slice(None), slice(None), slice(None), adv_idx, adv_idx],)),
        (6, ([slice(None), slice(None), adv_idx, adv_idx, adv_idx],)),
        (6, ([slice(None), slice(None), None, adv_idx, adv_idx, adv_idx],)),
    ]

    def get_shape(dim):
        return tuple(S + i for i in range(dim))

    return tuple(
        SampleInput(
            make_tensor(get_shape(self_dim), device=device, dtype=dtype,
                        low=None, high=None, requires_grad=requires_grad),
            args=args)
        for self_dim, args in test_args)
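# Usage sketch (illustrative only): each sample exercises Tensor.__getitem__; the
# single positional arg holds the index expression, so replaying a sample is just
# input[index].
def _demo_getitem(device='cpu', dtype=torch.float32):
    for sample in sample_inputs_getitem(None, device, dtype, requires_grad=False):
        index, = sample.args
        out = sample.input[index]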
def test_expanded_weight_per_sample_grad_mean(self, device, dtype, op):
    sample_inputs = op.sample_inputs(device, dtype, requires_grad=True)
    for sample_input in supported_inputs(op, sample_inputs):
        if op.name == "nn.functional.embedding":  # embedding flips its argument order for autograd tests
            sample_input = SampleInput(sample_input.args[0],
                                       args=(sample_input.input,),
                                       kwargs=sample_input.kwargs)
        self._compare_ew_and_for_loop_per_sample_grads(op, sample_input, torch.mean)
def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)

    shapes = ((), (2, 3))
    memory_format_options = [None, torch.contiguous_format]

    for shape, memory_format in itertools.product(shapes, memory_format_options):
        yield SampleInput(make_arg(shape),
                          kwargs={'memory_format': memory_format} if memory_format else {})
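# Usage sketch (illustrative only): these samples target dtype-conversion methods
# such as Tensor.float / Tensor.bool, which accept an optional memory_format
# keyword; the particular op used below is an assumption for illustration.
def _demo_conversion(device='cpu', dtype=torch.float64):
    for sample in sample_inputs_conversion(None, device, dtype, requires_grad=False):
        out = sample.input.float(**sample.kwargs)
        assert out.dtype == torch.float32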
def test_expanded_weight_forward(self, device, dtype, op):
    sample_inputs = op.sample_inputs(device, dtype)
    for sample_input in supported_inputs(op, sample_inputs):
        if op.name == "nn.functional.embedding":  # embedding flips its argument order for autograd tests
            sample_input = SampleInput(sample_input.args[0].clone(),
                                       args=(sample_input.input.clone(),),
                                       kwargs=sample_input.kwargs)
            if "cuda" in device and "max_norm" in sample_input.kwargs and "padding_idx" in sample_input.kwargs:
                self.skipTest("embedding is non-deterministic in this case, see issue #74679")
        batch_size = sample_input.input.shape[0] if len(sample_input.input.shape) > 1 else 1
        (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size)
        expanded_weight_result = run_op(op, ew_input, *ew_args, **ew_kwargs)
        normal_result = run_op(op, sample_input.input, *sample_input.args, **sample_input.kwargs)
        self.assertEqual(expanded_weight_result, normal_result)
def sample_inputs_generator():
    for sample_input in sample_inputs_func(device, dtype):
        mask = sample_input.kwargs.get('mask')
        if mask is None:
            yield sample_input
        else:
            if layout == sample_input.input.layout:
                yield sample_input
            if layout != torch.strided:
                sample_input_kwargs = sample_input.kwargs.copy()
                sample_input_kwargs.update(mask=mask.to_dense())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
            if layout != torch.sparse_coo and op.supports_sparse:
                sample_input_kwargs = sample_input.kwargs.copy()
                if mask.layout == torch.sparse_csr:
                    # TODO: remove this if-block when sparse csr supports to_sparse
                    mask = torch.sparse_coo_tensor(
                        torch._convert_indices_from_csr_to_coo(mask.crow_indices(), mask.col_indices()),
                        mask.values(),
                        mask.shape)._coalesced_(True)
                    sample_input_kwargs.update(mask=mask)
                else:
                    sample_input_kwargs.update(mask=mask.to_sparse())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
            if layout != torch.sparse_csr and op.supports_sparse_csr and sample_input.input.ndim == 2:
                sample_input_kwargs = sample_input.kwargs.copy()
                sample_input_kwargs.update(mask=mask.to_sparse_csr())
                yield SampleInput(sample_input.input.clone(),
                                  args=sample_input.args,
                                  kwargs=sample_input_kwargs)
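# Standalone sketch of the CSR -> COO fallback used above (kept for builds where a
# sparse CSR tensor cannot go through .to_sparse() directly): rebuild a coalesced
# COO tensor from the CSR indices. Assumes `mask` is a 2-D sparse CSR tensor.
def _csr_mask_to_coo(mask):
    coo_indices = torch._convert_indices_from_csr_to_coo(
        mask.crow_indices(), mask.col_indices())
    return torch.sparse_coo_tensor(
        coo_indices, mask.values(), mask.shape)._coalesced_(True)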
def sample_inputs_mse_loss(op_info, device, dtype, requires_grad, **kwargs):
    def make_input(shape, requires_grad=requires_grad):
        return make_tensor(shape, device=device, dtype=dtype, requires_grad=requires_grad)

    rhs_requires_grad = kwargs.get('rhs_requires_grad', requires_grad)
    S = 5

    shapes = ((S, S), (S, S, S), (S, S, S, S))
    reductions = ("none", "mean", "sum")

    for shape, reduction in itertools.product(shapes, reductions):
        yield SampleInput(make_input(shape),
                          args=(make_input(shape, requires_grad=rhs_requires_grad),),
                          kwargs={"reduction": reduction})
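# Usage sketch (illustrative only): each sample pairs an input with a same-shaped
# target plus a reduction mode, matching the
# torch.nn.functional.mse_loss(input, target, reduction=...) signature.
import torch.nn.functional as F

def _demo_mse_loss(device='cpu', dtype=torch.float32):
    for sample in sample_inputs_mse_loss(None, device, dtype, requires_grad=False):
        target, = sample.args
        loss = F.mse_loss(sample.input, target, **sample.kwargs)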
def generator():
    # 0-D index tensor
    idx = make_long_input((), low=0, high=M)
    yield SampleInput(
        make_input((M, S)),
        args=(idx,),
    )

    # 1-D index tensor
    idx = make_long_input((S,), low=0, high=M)
    yield SampleInput(
        make_input((M, S)),
        args=(idx,),
    )

    # 2-D index tensor
    idx = make_long_input((S, S), low=0, high=M)
    yield SampleInput(
        make_input((M, S)),
        args=(idx,),
    )

    idx = make_long_input((2, 2), low=0, high=S)
    idx[0, 0] = 2
    idx[1, 1] = 2
    yield SampleInput(
        make_input((S, S)),
        args=(idx,),
        kwargs={'padding_idx': 2},
    )

    idx = make_long_input((2, 2), low=0, high=S)
    idx[0, 0] = 4
    idx[1, 1] = 4
    yield SampleInput(
        make_input((S, S)),
        args=(idx,),
        kwargs={'padding_idx': -1},
    )

    # Scale the gradient based on the inverse frequency of a particular index.
    idx = make_long_input((2, 2), low=0, high=S)
    idx[0, 0] = 1
    idx[0, 1] = 1
    weights = make_input((S, S))
    yield SampleInput(
        weights,
        args=(idx,),
        kwargs={'scale_grad_by_freq': True},
    )
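# Usage sketch (illustrative only): these samples place the embedding weight in
# `input` and the index tensor in `args` (the ExpandedWeights tests above flip that
# order back), so replaying one through torch.nn.functional.embedding swaps them.
import torch.nn.functional as F

def _replay_embedding_sample(sample):
    weight = sample.input
    idx, = sample.args
    return F.embedding(idx, weight, **sample.kwargs)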