def L_op(self, inputs, outputs, output_grads):
    # Gradients computed by Op
    assert self.compute_grad and len(outputs) == 2
    gradients = outputs[1]
    assert gradients is not None

    # Gradients of original function, to compose chain rule
    grad_op = output_grads[0]
    grad_shuffle = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(gradients)
    grad_bdot = batched_dot(grad_op, grad_shuffle)
    grad_shuffle_reverse = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(grad_bdot)
    return [
        grad_shuffle_reverse,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
    ]
def L_op(self, inputs, outputs, out_grads):
    x, k = inputs
    k_grad = grad_undefined(self, 1, k, "topk: k is not differentiable")

    if not (self.return_indices and self.return_values):
        # The gradient needs both outputs: the values gradient and the
        # indices that say where each value came from.
        x_grad = grad_undefined(
            self,
            0,
            x,
            "topk: cannot get gradient without both indices and values",
        )
    else:
        x_shp = shape(x)
        z_grad = out_grads[0]
        ndim = x.ndim
        axis = self.axis % ndim
        # Build an index grid that addresses the top-k positions along `axis`.
        grad_indices = [
            arange(x_shp[i]).dimshuffle([0] + ["x"] * (ndim - i - 1))
            if i != axis
            else outputs[-1]
            for i in range(ndim)
        ]
        x_grad = x.zeros_like(dtype=z_grad.dtype)
        x_grad = set_subtensor(x_grad[tuple(grad_indices)], z_grad)

    return [x_grad, k_grad]
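# A minimal numpy sketch (hypothetical values, not part of the Op above) of
# the scatter pattern in the topk gradient: the incoming gradient is written
# back only at the top-k positions along the axis; every other entry of the
# input gradient stays zero.
import numpy as np

x = np.array([[3.0, 1.0, 4.0], [1.0, 5.0, 9.0]])
indices = np.argsort(x, axis=1)[:, -2:]  # stand-in for outputs[-1] (top-2)
z_grad = np.ones(indices.shape)          # stand-in for out_grads[0]

rows = np.arange(x.shape[0])[:, None]    # arange(x_shp[0]).dimshuffle([0, "x"])
x_grad = np.zeros_like(x)
x_grad[rows, indices] = z_grad           # set_subtensor(x_grad[grad_indices], z_grad)
# x_grad is now 1.0 at the top-2 positions of each row, 0.0 elsewhere.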
def L_op(self, inputs, outputs, output_grads):
    assert self.compute_grad and len(outputs) == 2
    gradients = outputs[1]
    assert gradients is not None

    # Compose the chain rule: batch-dot the incoming gradient with the
    # gradients already computed by the Op itself.
    grad_op = output_grads[0]
    total_grad = batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(
        1, 0, 2
    )
    return [
        total_grad,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
    ]
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx)
    hgrad = gpu_sparse_block_gemv(
        h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
    )
    return [
        go,
        Wgrad,
        hgrad,
        grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
        grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
    ]
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    outer_fun = SparseBlockOuter(self.inplace)
    gemv_fun = SparseBlockGemv(self.inplace)

    Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
    hgrad = gemv_fun(
        h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
    )
    return [
        go,
        Wgrad,
        hgrad,
        grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
        grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
    ]
def grad(self, inputs, output_grads):
    a, axis = inputs
    # Invert the sort: route each output gradient back to the position its
    # value occupied in the unsorted input.
    indices = self.__get_argsort_indices(a, axis)
    inp_grad = output_grads[0][tuple(indices)]
    axis_grad = grad_undefined(
        self,
        1,
        axis,
        "The gradient of sort is not defined "
        "with respect to the integer axes itself",
    )
    return [inp_grad, axis_grad]
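# A small numpy sketch (values are hypothetical) of the permutation logic in
# the sort gradient above: the gradient of each sorted output element is sent
# back to the slot its value came from in the unsorted input.
import numpy as np

x = np.array([3.0, 1.0, 2.0])
order = np.argsort(x)                    # forward permutation: [1, 2, 0]
g_sorted = np.array([10.0, 20.0, 30.0])  # gradient w.r.t. sort(x)

g_x = np.empty_like(g_sorted)
g_x[order] = g_sorted                    # scatter back -> [30., 10., 20.]
# Equivalently g_x = g_sorted[np.argsort(order)]: the inverse permutation,
# which __get_argsort_indices constructs symbolically.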
def grad(self, inputs, output_grads):
    inp, axis = inputs
    # argsort returns integer indices, so no gradient flows to the input.
    inp_grad = inp.zeros_like()
    axis_grad = grad_undefined(
        self,
        1,
        axis,
        "argsort is not defined for non-integer axes so"
        " argsort(x, axis+eps) is undefined",
    )
    return [inp_grad, axis_grad]
def grad(self, inp, cost_grad):
    """
    Notes
    -----
    The gradient is currently implemented for matrices only.
    """
    a, val, offset = inp
    grad = cost_grad[0]
    height, width = grad.shape

    if a.dtype.startswith("complex"):
        return [None, None, None]

    # only valid for matrices
    wr_a = fill_diagonal_offset(grad, 0, offset)

    offset_abs = abs_(offset)
    pos_offset_flag = ge(offset, 0)
    neg_offset_flag = lt(offset, 0)
    min_wh = minimum(width, height)

    start = offset * pos_offset_flag + offset_abs * width * neg_offset_flag
    num_of_step = minimum(
        min_wh, width * pos_offset_flag + height * neg_offset_flag - offset_abs
    )

    step = a.shape[1] + 1
    end = start + step * num_of_step

    # input of slice should be integer
    start = aet.cast(start, "int32")
    step = aet.cast(step, "int32")
    end = aet.cast(end, "int32")

    wr_val = grad.flatten()[start:end:step].sum()

    wr_offset = grad_undefined(
        self,
        2,
        offset,
        "offset is not defined for non-integer offset so"
        " fill_diagonal_offset(a,val,offset+eps) is undefined",
    )

    return [wr_a, wr_val, wr_offset]
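# A numpy check (with made-up numbers) of the flattened-stride trick above:
# in a flattened height x width matrix, the offset diagonal sits at indices
# start, start + step, ... with step = width + 1.
import numpy as np

grad = np.arange(12.0).reshape(3, 4)  # height=3, width=4
offset = 1
height, width = grad.shape

start = offset if offset >= 0 else -offset * width
step = width + 1
num_of_step = min(min(height, width),
                  width - offset if offset >= 0 else height + offset)
end = start + step * num_of_step

wr_val = grad.flatten()[start:end:step].sum()
assert wr_val == np.trace(grad, offset=offset)  # 1.0 + 6.0 + 11.0 == 18.0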
def grad(self, inputs, outputs_gradients):
    a, *shape = inputs
    (dout,) = outputs_gradients

    # Determine the dimensions that were added by broadcasting
    new_dims = list(range(dout.ndim - a.ndim))

    d_wrt_a = broadcast_to(dout, shape).sum(axis=new_dims)

    # Determine the dimensions that were broadcast
    _, shape_bcast = aet.alloc_validate_shape(shape)
    bcast_sums = [
        i
        for i, (a_b, s_b) in enumerate(zip(a.broadcastable, shape_bcast[-a.ndim :]))
        if a_b and not s_b
    ]

    if bcast_sums:
        d_wrt_a = d_wrt_a.sum(axis=bcast_sums, keepdims=True)

    return [d_wrt_a] + [
        grad_undefined(self, i, shp) for i, shp in enumerate(shape, 1)
    ]
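# A numpy sketch (shapes chosen for illustration) of the two reductions in
# the broadcast_to gradient above: first sum over dimensions the broadcast
# added in front, then a keepdims sum over length-1 dimensions of the input
# that were expanded.
import numpy as np

a = np.ones((1, 3))            # the original input; axis 0 is broadcastable
dout = np.ones((4, 2, 3))      # gradient w.r.t. broadcast_to(a, (4, 2, 3))

# Dimensions added in front by the broadcast:
new_dims = tuple(range(dout.ndim - a.ndim))            # (0,)
d_wrt_a = dout.sum(axis=new_dims)                      # shape (2, 3)

# Length-1 dimensions of `a` that were expanded:
bcast_sums = tuple(
    i for i, (s_a, s_t) in enumerate(zip(a.shape, dout.shape[-a.ndim:]))
    if s_a == 1 and s_t != 1
)
d_wrt_a = d_wrt_a.sum(axis=bcast_sums, keepdims=True)  # shape (1, 3), all 8.0
assert d_wrt_a.shape == a.shape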
def grad(self, inputs, output_grads):
    return [grad_undefined(self, 0, inputs[0])]
def test_undefined_grad_func(self):
    # tests that function compilation catches undefined grads in the graph
    a = vector()
    b = grad_undefined(add, 0, a)
    with pytest.raises(TypeError):
        aesara.function([a], b, on_unused_input="ignore")
def grad(self, inputs, ograd):
    return [
        gradient.grad_undefined(
            self, k, inp, "No gradient defined through random sampling op"
        )
        for k, inp in enumerate(inputs)
    ]
def grad(self, inp, grads):
    return [grad_undefined(self, 0, inp[0])]
def grad(self, inp, grads):
    return [grad_undefined(self, i, inp[i]) for i in range(2)]
def grad(self, inp, grads):
    x, neib_shape, neib_step = inp
    (gz,) = grads

    if self.mode in ("valid", "ignore_borders"):
        if (
            neib_shape is neib_step
            or neib_shape == neib_step
            # `==` on Aesara Constants does not compare the data;
            # the `equals` method does.
            or (hasattr(neib_shape, "equals") and neib_shape.equals(neib_step))
        ):
            return [
                neibs2images(gz, neib_shape, x.shape, mode=self.mode),
                grad_undefined(self, 1, neib_shape),
                grad_undefined(self, 2, neib_step),
            ]

    if self.mode in ["valid"]:
        # Iterate over neighborhood positions, summing contributions.
        def pos2map(pidx, pgz, prior_result, neib_shape, neib_step):
            """
            Helper function that adds gradient contribution from a single
            neighborhood position i,j.

            pidx = Index of position within neighborhood.
            pgz = Gradient of shape (batch_size*num_channels*neibs)
            prior_result = Shape (batch_size, num_channels, rows, cols)
            neib_shape = Number of rows, cols in a neighborhood.
            neib_step = Step sizes from images2neibs.
            """
            nrows, ncols = neib_shape
            rstep, cstep = neib_step
            batch_size, num_channels, rows, cols = prior_result.shape
            i = pidx // ncols
            j = pidx - (i * ncols)
            # This position does not touch some img pixels in valid mode.
            result_indices = prior_result[
                :,
                :,
                i : (rows - nrows + i + 1) : rstep,
                j : (cols - ncols + j + 1) : cstep,
            ]
            newshape = (
                (batch_size, num_channels)
                + ((rows - nrows) // rstep + 1,)
                + ((cols - ncols) // cstep + 1,)
            )
            return inc_subtensor(result_indices, pgz.reshape(newshape))

        indices = arange(neib_shape[0] * neib_shape[1])
        pgzs = gz.dimshuffle((1, 0))
        result, _ = aesara.scan(
            fn=pos2map,
            sequences=[indices, pgzs],
            outputs_info=zeros(x.shape),
            non_sequences=[neib_shape, neib_step],
        )
        grad_input = result[-1]
        return [
            grad_input,
            grad_undefined(self, 1, neib_shape),
            grad_undefined(self, 2, neib_step),
        ]

    return [
        grad_not_implemented(self, 0, x),
        grad_undefined(self, 1, neib_shape),
        grad_undefined(self, 2, neib_step),
    ]
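# A minimal end-to-end sketch tying the snippets together. The Op, its
# perform body, and all names here are hypothetical, not from the code above;
# it only illustrates the common pattern: return a real gradient for the
# differentiable input and grad_undefined for the integer-valued one.
import numpy as np
import aesara.tensor as at
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.graph.op import Op


class ScaleByIndex(Op):
    """Hypothetical Op: out = x * table[idx]; idx is not differentiable."""

    table = np.array([1.0, 2.0, 4.0])

    def make_node(self, x, idx):
        x = at.as_tensor_variable(x)
        idx = at.as_tensor_variable(idx)
        return Apply(self, [x, idx], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, idx = inputs
        output_storage[0][0] = x * self.table[int(idx)]

    def grad(self, inputs, output_grads):
        x, idx = inputs
        (gz,) = output_grads
        # d(out)/dx = table[idx]; reuse the Op itself to express it symbolically.
        x_grad = gz * self(at.ones_like(x), idx)
        return [
            x_grad,
            grad_undefined(self, 1, idx, "idx is integer-valued"),
        ]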