def grad(self, inputs, g_outputs):
    x, i0, i1, amt = inputs
    gy = g_outputs[0]
    return [gy, DisconnectedType()(), DisconnectedType()(),
            diagonal_subtensor(gy, i0, i1)]
def grad(self, inp, grads):
    kerns, top, desc, h, w = inp
    img, = grads
    img = gpu_contiguous(img)
    d_kerns = GpuDnnConvGradW()(img, top, desc,
                                kerns.shape[2], kerns.shape[3])
    d_top = GpuDnnConv()(img, kerns, desc)
    return (d_kerns, d_top, DisconnectedType()(), DisconnectedType()(),
            DisconnectedType()())
def grad(self, inp, grads):
    img, top, desc, h, w = inp
    kerns, = grads
    kerns = gpu_contiguous(kerns)
    d_img = GpuDnnConvGradI()(kerns, top, desc,
                              img.shape[2], img.shape[3])
    d_top = GpuDnnConv()(img, kerns, desc)
    return (d_img, d_top, DisconnectedType()(), DisconnectedType()(),
            DisconnectedType()())
def grad(self, inputs, g_outputs):
    (idx_train, w_train, idx_test, w_test,
     gp_params, indep_noise, ys) = inputs
    gz, = g_outputs
    u = self.symbolic_kernel(self.t_diff, gp_params)
    grad_u = theano.gradient.jacobian(u, gp_params)
    return ([DisconnectedType()(),    # idx_train
             DisconnectedType()(),    # w_train
             DisconnectedType()(),    # idx_test
             DisconnectedType()()] +  # w_test
            self.grad_cov_vec(idx_train, w_train, idx_test, w_test,
                              u, gp_params, indep_noise, ys, gz, grad_u))
def grad(self, inp, grads):
    x, ws, stride, pad = inp
    gz, = grads
    disc = [DisconnectedType()() for i in inp[1:]]
    return [PoolGrad(ignore_border=self.ignore_border,
                     mode=self.mode)(x, gz, ws, stride, pad)] + disc
def grad(self, inputs, ograds):
    ref, values, ref_dim, val_dim = inputs[:4]
    hash_struct = inputs[4:]
    ograd = ograds[0]

    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)

    def _conv(x):
        return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

    # Since the kernels are separable and symmetric, the gradient w.r.t.
    # input is just the same filtering applied to the output grads.
    grad_i = _conv(ograd)

    def _gradr(r_i, vals, og, *args):
        return (og * (_conv(vals * r_i) - r_i * _conv(vals)) +
                vals * (_conv(og * r_i) - r_i * _conv(og)))

    grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
                            non_sequences=[values, ograd] + hash_struct,
                            outputs_info=None)
    grad_r = grad_r.sum(axis=1, acc_dtype="float32")

    grads = [DisconnectedType()() for i in range(len(inputs))]
    grads[0] = grad_r
    grads[1] = grad_i
    return grads
def grad(self, inputs, g_outputs): """ .. todo:: WRITEME """ hid_acts, filters, output_shape = inputs g_images, = g_outputs g_images = as_cuda_ndarray_variable(g_images) assert not isinstance(g_images, list) global FilterActs global WeightActs if FilterActs is None: from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs from pylearn2.sandbox.cuda_convnet.weight_acts import WeightActs g_filters = WeightActs(stride=self.stride, partial_sum=self.partial_sum, pad=self.pad)(g_images, hid_acts, filters.shape[1:3])[0] assert not isinstance(g_filters, list) g_hid_acts = FilterActs(stride=self.stride, pad=self.pad, partial_sum=self.partial_sum)(g_images, filters) return [g_hid_acts, g_filters, DisconnectedType()()]
def grad(self, inputs, gout):
    (x, repeats) = inputs
    (gz,) = gout
    if repeats.ndim == 0:
        if self.axis is None:
            axis = x.ndim
        else:
            if self.axis >= 0:
                axis = self.axis + 1
            else:
                axis = self.axis + x.ndim + 1

        shape = [x.shape[k] for k in range(x.ndim)]
        shape.insert(axis, repeats)

        return [gz.reshape(shape, x.ndim + 1).sum(axis=axis),
                DisconnectedType()()]
    elif repeats.ndim == 1:
        # For this implementation, we would need to specify the length
        # of repeats in order to split gz in the right way to sum
        # the good part.
        raise NotImplementedError()
    else:
        raise ValueError()
def grad(self, inputs, g_outputs):
    (idx_train, w_train, idx_test, w_test,
     gp_params, indep_noise, y) = inputs
    u = self.symbolic_kernel(self.t_diff, gp_params)
    grad_u = theano.gradient.jacobian(u, gp_params)
    gz, = g_outputs
    grad_gp_params, grad_indep_noise = self.grad_posterior_mean(
        idx_train, w_train, idx_test, w_test,
        u, gp_params, indep_noise, y, gz, grad_u)
    return [DisconnectedType()(),  # idx_train
            DisconnectedType()(),  # w_train
            DisconnectedType()(),  # idx_test
            DisconnectedType()(),  # w_test
            grad_gp_params,
            grad_indep_noise,
            DisconnectedType()()]  # y
def grad(self, inputs, output_grads):
    gout, = output_grads
    s = inputs[1]
    # Divide the last dimension of the output gradients by 2, they are
    # double-counted by the real-IFFT due to symmetry, except the first
    # and last elements (for even transforms) which are unique.
    idx = ([slice(None)] * (gout.ndim - 2) +
           [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)])
    gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
    return [cuirfft_op(gout, s), DisconnectedType()()]
def grad(self, inputs, output_grads):
    gout, = output_grads
    s = inputs[1]
    # # Multiply the last dimension of the gradient by 2, they represent
    # # both positive and negative frequencies, except the first
    # # and last elements (for even transforms) which are unique.
    # idx = [slice(None)] * (gf.ndim - 2) \
    #     + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
    # gf = T.set_subtensor(gf[idx], gf[idx] * 2)
    return [cufft_op(gout, s), DisconnectedType()()]
def grad(self, inputs, output_grads):
    # The gradient is just the inverse Fourier transform of the output
    # gradients.
    gout, = output_grads
    s = inputs[1]
    # There is no symmetry to exploit in the complex FFT: the real and
    # imaginary parts interact to produce distinct values everywhere.
    # idx = [slice(None)] * (gout.ndim - 2) \
    #     + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
    # gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
    return [ifft_op(gout, s), DisconnectedType()()]
def grad(self, inp, grads):
    kerns, top, output, desc, alpha, beta = inp
    img, = grads
    img = gpu_contiguous(img)
    d_kerns = GpuDnn3dConvGradW()(img, top, gpu_alloc_empty(*kerns.shape),
                                  desc)
    d_top = GpuDnn3dConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
    d_alpha = grad_not_implemented(self, 4, alpha)
    d_beta = grad_not_implemented(self, 5, beta)
    return (d_kerns * alpha, d_top * alpha, img * beta,
            DisconnectedType()(), d_alpha, d_beta)
def grad(self, inp, grads):
    x, scale, shift, mean, std = inp
    gz, = grads
    disc = [DisconnectedType()() for i in inp[3:]]

    AbstractBN = AbstractBatchNormalizationGrad(
        eps=self.eps, bias=self.bias, term=self.term,
        inplace=self.inplace, train_stage=self.train_stage)
    [gx, g_scale, g_shift] = AbstractBN(x, gz, scale, shift)
    return [gx, g_scale, g_shift] + disc
def grad(self, inp, grads):
    img, kerns, output, desc, alpha, beta = inp
    top, = grads
    top = gpu_contiguous(top)
    d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
    d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
    d_alpha = grad_not_implemented(self, 4, alpha)
    d_beta = grad_not_implemented(self, 5, beta)
    return [d_img * alpha, d_kerns * alpha, top * beta,
            DisconnectedType()(), d_alpha, d_beta]
def grad(self, inputs, output_gradients):
    C, d, WShape, B = inputs
    dLdA, = output_gradients

    z = T.zeros_like(C[0, 0, 0, 0, :])
    dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
    # d actually does affect the outputs, so it's not disconnected
    dLdd = grad_undefined(self, 1, d)
    # The shape of the weights doesn't affect the output elements
    dLdWShape = DisconnectedType()()
    dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)

    return [dLdC, dLdd, dLdWShape, dLdB]
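# A brief sketch of the distinction used above (the op and its inputs below
# are hypothetical, for illustration only; the helpers are the real
# theano.gradient ones): grad_undefined flags an input that does influence
# the output but has no mathematically defined derivative (e.g. integer
# strides), whereas DisconnectedType flags an input the output values do not
# depend on at all (e.g. a pure shape argument).
from theano.gradient import DisconnectedType, grad_undefined


def grad(self, inputs, output_gradients):
    x, stride, out_shape = inputs
    gz, = output_gradients
    return [gz,                               # differentiable input
            grad_undefined(self, 1, stride),  # affects output, no derivative
            DisconnectedType()()]             # shape-only input: disconnected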
def grad(self, inp, grads):
    kerns, top, output, desc, alpha, beta = inp
    img, = grads
    img = gpu_contiguous(img)
    d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
    d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
    d_alpha = grad_not_implemented(self, 4, alpha)
    d_beta = grad_not_implemented(self, 5, beta)
    return (d_kerns * alpha, d_top * alpha, img * beta,
            DisconnectedType()(), d_alpha, d_beta)
def grad(self, inp, grads):
    img, top, output, desc, alpha, beta = inp
    kerns, = grads
    kerns = gpu_contiguous(kerns)
    d_img = GpuDnn3dConvGradI()(kerns, top, gpu_alloc_empty(*img.shape),
                                desc)
    d_top = GpuDnn3dConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
    d_alpha = grad_not_implemented(self, 4, alpha)
    d_beta = grad_not_implemented(self, 5, beta)
    return (d_img * alpha, d_top * alpha, kerns * beta,
            DisconnectedType()(), d_alpha, d_beta)
def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = theano.tensor.nnet.conv3D(dCdR, W,
                                     T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    # not differentiable, since d affects the output elements
    dCdd = grad_undefined(self, 2, d)
    # disconnected, since RShape just determines the output shape
    dCdRShape = DisconnectedType()()

    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon_dCdR'

    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon_H'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'

    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name

    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
def grad(self, inp, grads):
    x, ws, stride, pad = inp
    gz, = grads
    disc = [DisconnectedType()() for i in inp[1:]]
    return [U2IGrad()(x, gz)] + disc
def grad(self, inputs, g_outputs):
    z = tensor.zeros_like(inputs[0])
    gx = inc_diagonal_subtensor(z, inputs[1], inputs[2], g_outputs[0])
    return [gx, DisconnectedType()(), DisconnectedType()()]
class RepeatOp(theano.Op):
    # See the repeat function for docstring

    def __init__(self, axis=None):
        self.axis = axis

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.axis == other.axis)

    def __hash__(self):
        return hash(type(self)) ^ hash(self.axis)

    def make_node(self, x, repeats):
        x = basic.as_tensor_variable(x)
        repeats = basic.as_tensor_variable(repeats)

        if repeats.dtype not in tensor.discrete_dtypes:
            raise TypeError("repeats.dtype must be an integer.")

        # Some dtypes are not supported by numpy's implementation of repeat.
        # Until another one is available, we should fail at graph construction
        # time, not wait for execution.
        int_bitwidth = theano.gof.cmodule.python_int_bitwidth()
        if int_bitwidth == 64:
            numpy_unsupported_dtypes = ('uint64',)
        if int_bitwidth == 32:
            numpy_unsupported_dtypes = ('uint32', 'int64', 'uint64')

        if repeats.dtype in numpy_unsupported_dtypes:
            raise TypeError(
                ("dtypes %s are not supported by numpy.repeat "
                 "for the 'repeats' parameter, " % numpy_unsupported_dtypes),
                repeats.dtype)

        if self.axis is None:
            out_type = theano.tensor.TensorType(dtype=x.dtype,
                                                broadcastable=[False])
        else:
            out_type = x.type
        return theano.Apply(self, [x, repeats], [out_type()])

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        repeats = inputs[1]
        z = output_storage[0]
        z[0] = np.repeat(x, repeats=repeats, axis=self.axis)

    def connection_pattern(self, node):
        return [[True], [False]]

    def grad(self, inputs, gout):
        x, repeats = inputs
        gz, = gout
        if repeats.ndim == 0:
            if self.axis is None:
                axis = x.ndim
            else:
                if self.axis >= 0:
                    axis = self.axis + 1
                else:
                    axis = self.axis + x.ndim + 1

            shape = [x.shape[k] for k in range(x.ndim)]
            shape.insert(axis, repeats)

            return [gz.reshape(shape, x.ndim + 1).sum(axis=axis),
                    DisconnectedType()()]
        elif repeats.ndim == 1:
            # For this implementation, we would need to specify the length
            # of repeats in order to split gz in the right way to sum
            # the good part.
            raise NotImplementedError()
        else:
            raise ValueError()
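# A minimal, self-contained sketch (hypothetical Op, for illustration only)
# of the pattern used by RepeatOp above: an input the output does not depend
# on gets a DisconnectedType()() gradient, and connection_pattern() declares
# which input/output pairs are connected so theano.grad accepts that return.
import theano
import theano.tensor as T
from theano.gradient import DisconnectedType


class DoubleWithTag(theano.Op):
    """Toy Op: returns 2 * x; `tag` is carried along as metadata and never
    influences the output value, so its gradient is disconnected."""

    def make_node(self, x, tag):
        x = T.as_tensor_variable(x)
        tag = T.as_tensor_variable(tag)
        return theano.Apply(self, [x, tag], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, tag = inputs
        output_storage[0][0] = 2 * x

    def connection_pattern(self, node):
        # The single output depends on x (input 0) but not on tag (input 1).
        return [[True], [False]]

    def grad(self, inputs, output_grads):
        gz, = output_grads
        return [2 * gz, DisconnectedType()()]


# Usage: the gradient w.r.t. x is well defined, while the gradient w.r.t. tag
# is only accepted when disconnected inputs are explicitly ignored (it comes
# back as a zero under the default return_disconnected behaviour).
x = T.vector("x")
tag = T.scalar("tag")
cost = DoubleWithTag()(x, tag).sum()
gx = theano.grad(cost, x)
g_tag = theano.grad(cost, tag, disconnected_inputs="ignore")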
def grad(self, inputs, output_grads):
    # The gradient just applies fftshift_op to the output gradients.
    gout, = output_grads
    s = inputs[1]
    return [fftshift_op(gout, s), DisconnectedType()()]
def test_grad_override(self, cls_ofg):
    x, y = tt.vectors("xy")

    def go(inps, gs):
        x, y = inps
        (g,) = gs
        return [g * y * 2, g * x * 1.5]

    dedz = tt.vector("dedz")
    op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

    op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
    op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

    # single override case (function or OfG instance)
    xx, yy = tt.vector("xx"), tt.vector("yy")
    for op in [op_mul, op_mul2]:
        zz = tt.sum(op(xx, yy))
        dx, dy = tt.grad(zz, [xx, yy])
        fn = function([xx, yy], [dx, dy])
        xv = np.random.rand(16).astype(config.floatX)
        yv = np.random.rand(16).astype(config.floatX)
        dxv, dyv = fn(xv, yv)
        assert np.allclose(yv * 2, dxv)
        assert np.allclose(xv * 1.5, dyv)

    # list override case
    def go1(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * w * 2

    def go2(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * x * 1.5

    w, b = tt.vectors("wb")
    # we make the 3rd gradient default (no override)
    op_linear = cls_ofg([x, w, b], [x * w + b],
                        grad_overrides=[go1, go2, "default"])
    xx, ww, bb = tt.vector("xx"), tt.vector("yy"), tt.vector("bb")
    zz = tt.sum(op_linear(xx, ww, bb))
    dx, dw, db = tt.grad(zz, [xx, ww, bb])
    fn = function([xx, ww, bb], [dx, dw, db])
    xv = np.random.rand(16).astype(config.floatX)
    wv = np.random.rand(16).astype(config.floatX)
    bv = np.random.rand(16).astype(config.floatX)
    dxv, dwv, dbv = fn(xv, wv, bv)
    assert np.allclose(wv * 2, dxv)
    assert np.allclose(xv * 1.5, dwv)
    assert np.allclose(np.ones(16, dtype=config.floatX), dbv)

    # NullType and DisconnectedType
    op_linear2 = cls_ofg(
        [x, w, b],
        [x * w + b],
        grad_overrides=[go1, NullType()(), DisconnectedType()()],
    )
    zz2 = tt.sum(op_linear2(xx, ww, bb))
    dx2, dw2, db2 = tt.grad(
        zz2,
        [xx, ww, bb],
        return_disconnected="Disconnected",
        disconnected_inputs="ignore",
        null_gradients="return",
    )
    assert isinstance(dx2.type, tt.TensorType)
    assert dx2.ndim == 1
    assert isinstance(dw2.type, NullType)
    assert isinstance(db2.type, DisconnectedType)
def grad(self, inputs, grads):
    return [DisconnectedType()() for i in inputs]
def grad(self, args, g_outs):
    return [DisconnectedType()() for g_out in g_outs]
class RepeatOp(theano.Op):
    # See the repeat function for docstring

    def __init__(self, axis=None):
        self.axis = axis

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.axis == other.axis)

    def __hash__(self):
        return hash(type(self)) ^ hash(self.axis)

    def make_node(self, x, repeats):
        x = basic.as_tensor_variable(x)
        repeats = basic.as_tensor_variable(repeats)
        # if repeats.dtype not in theano.tensor.discrete_dtypes:
        #     raise TypeError("repeats.dtype must be an integer.")
        #
        # # Some dtypes are not supported by numpy's implementation of repeat.
        # # Until another one is available, we should fail at graph
        # # construction time, not wait for execution.
        # ptr_bitwidth = theano.gof.local_bitwidth()
        # if ptr_bitwidth == 64:
        #     numpy_unsupported_dtypes = ('uint64',)
        # if ptr_bitwidth == 32:
        #     numpy_unsupported_dtypes = ('uint32', 'int64', 'uint64')
        #
        # if repeats.dtype in numpy_unsupported_dtypes:
        #     raise TypeError(
        #         ("dtypes %s are not supported by numpy.repeat "
        #          "for the 'repeats' parameter, "
        #          % str(numpy_unsupported_dtypes)), repeats.dtype)
        #
        # if self.axis is None:
        #     broadcastable = [False]
        # else:
        #     try:
        #         const_reps = basic.get_scalar_constant_value(repeats)
        #     except basic.NotScalarConstantError:
        #         const_reps = None
        #     if const_reps == 1:
        #         broadcastable = x.broadcastable
        #     else:
        #         broadcastable = list(x.broadcastable)
        #         broadcastable[self.axis] = False
        out_type = theano.tensor.ftensor4
        return theano.Apply(self, [x, repeats], [out_type()])

    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        repeats = inputs[1]
        z = output_storage[0]
        if self.axis == 2:
            repeat_style = (1, 1, repeats, 1)
        else:
            repeat_style = (1, 1, 1, repeats)
        z[0] = scipy.ndimage.zoom(input=x, zoom=repeat_style, order=1)

    def connection_pattern(self, node):
        return [[True], [False]]

    def grad(self, inputs, gout):
        x, repeats = inputs
        gz, = gout
        if repeats.ndim == 0:
            if self.axis is None:
                axis = x.ndim
            else:
                if self.axis >= 0:
                    axis = self.axis + 1
                else:
                    axis = self.axis + x.ndim + 1

            shape = [x.shape[k] for k in range(x.ndim)]
            shape.insert(axis, repeats)

            return [gz.reshape(shape, x.ndim + 1).sum(axis=axis),
                    DisconnectedType()()]
        elif repeats.ndim == 1:
            # For this implementation, we would need to specify the length
            # of repeats in order to split gz in the right way to sum
            # the good part.
            raise NotImplementedError()
        else:
            raise ValueError()
def grad(self, inputs, output_grads):
    gout, = output_grads
    s = inputs[1]
    gf = fft_op(gout, s)
    return [gf, DisconnectedType()()]
def grad(self, inputs, output_gradients):
    return [DisconnectedType()()] + output_gradients
def test_grad_override(self, cls_ofg):
    x, y = T.vectors('xy')

    def go(inps, gs):
        x, y = inps
        g, = gs
        return [g * y * 2, g * x * 1.5]

    dedz = T.vector('dedz')
    op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

    op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
    op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

    # single override case (function or OfG instance)
    xx, yy = T.vector('xx'), T.vector('yy')
    for op in [op_mul, op_mul2]:
        zz = T.sum(op(xx, yy))
        dx, dy = T.grad(zz, [xx, yy])
        fn = function([xx, yy], [dx, dy])
        xv = np.random.rand(16).astype(config.floatX)
        yv = np.random.rand(16).astype(config.floatX)
        dxv, dyv = fn(xv, yv)
        assert np.allclose(yv * 2, dxv)
        assert np.allclose(xv * 1.5, dyv)

    # list override case
    def go1(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * w * 2

    def go2(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * x * 1.5

    w, b = T.vectors('wb')
    # we make the 3rd gradient default (no override)
    op_linear = cls_ofg([x, w, b], [x * w + b],
                        grad_overrides=[go1, go2, 'default'])
    xx, ww, bb = T.vector('xx'), T.vector('yy'), T.vector('bb')
    zz = T.sum(op_linear(xx, ww, bb))
    dx, dw, db = T.grad(zz, [xx, ww, bb])
    fn = function([xx, ww, bb], [dx, dw, db])
    xv = np.random.rand(16).astype(config.floatX)
    wv = np.random.rand(16).astype(config.floatX)
    bv = np.random.rand(16).astype(config.floatX)
    dxv, dwv, dbv = fn(xv, wv, bv)
    assert np.allclose(wv * 2, dxv)
    assert np.allclose(xv * 1.5, dwv)
    assert np.allclose(np.ones(16, dtype=config.floatX), dbv)

    # NullType and DisconnectedType
    op_linear2 = cls_ofg(
        [x, w, b], [x * w + b],
        grad_overrides=[go1, NullType()(), DisconnectedType()()])
    zz2 = T.sum(op_linear2(xx, ww, bb))
    dx2, dw2, db2 = T.grad(zz2, [xx, ww, bb],
                           return_disconnected='Disconnected',
                           disconnected_inputs='ignore',
                           null_gradients='return')
    assert isinstance(dx2.type, T.TensorType)
    assert dx2.ndim == 1
    assert isinstance(dw2.type, NullType)
    assert isinstance(db2.type, DisconnectedType)