def test_multiple_outputs(self):
    m = tensor.matrix('m')
    v = tensor.vector('v')
    m_ = tensor.matrix('m_')
    v_ = tensor.vector('v_')

    mval = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX)
    vval = self.rng.uniform(size=(7,)).astype(theano.config.floatX)
    m_val = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX)
    v_val = self.rng.uniform(size=(7,)).astype(theano.config.floatX)

    rop_out1 = tensor.Rop([m, v, m + v], [m, v], [m_, v_])
    assert isinstance(rop_out1, list)
    assert len(rop_out1) == 3
    rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_])
    assert isinstance(rop_out2, tuple)
    assert len(rop_out2) == 3

    lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_])
    assert isinstance(lop_out1, tuple)
    assert len(lop_out1) == 2
    lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_])
    assert isinstance(lop_out2, list)
    assert len(lop_out2) == 2

    all_outs = []
    for o in rop_out1, rop_out2, lop_out1, lop_out2:
        all_outs.extend(o)
    f = theano.function([m, v, m_, v_], all_outs)
    f(mval, vval, m_val, v_val)
def __init__(self, input_dim, n_hidden_units, n_hidden_layers,
             nonlinearity='tanh', bias_sigma=0.0, weight_sigma=1.25,
             input_layer=None, flip=False, output_dim=None):
    # if input_layer is not None:
    #     assert input_layer.output_shape[1] == input_dim
    self.input_dim = input_dim
    self.n_hidden_units = n_hidden_units
    self.n_hidden_layers = n_hidden_layers
    self.nonlinearity = nonlinearity
    self.bias_sigma = bias_sigma
    self.weight_sigma = weight_sigma
    self.input_layer = input_layer
    if output_dim is None:
        output_dim = n_hidden_units
    self.output_dim = output_dim

    model = Sequential()
    if input_layer is not None:
        model.add(input_layer)
    for i in xrange(n_hidden_layers):
        nunits = n_hidden_units if i < n_hidden_layers - 1 else output_dim
        if flip:
            model.add(Activation(nonlinearity, input_shape=(input_dim,),
                                 name='_a%d' % i))
            model.add(Dense(nunits, name='_d%d' % i))
        else:
            model.add(Dense(nunits, input_shape=(input_dim,),
                            name='_d%d' % i))
            if i < n_hidden_layers - 1 or self.output_dim == self.n_hidden_units:
                model.add(Activation(nonlinearity, name='_a%d' % i))
            else:
                # Theano optimizes the nonlinearity away when it can, which
                # breaks the activation bookkeeping below. Give it something
                # it won't optimize out.
                model.add(Activation(lambda x: T.minimum(x, 999999.999),
                                     name='_a%d' % i))
    model.build()
    self.model = model
    self.weights = model.get_weights()
    self.dense_layers = filter(lambda x: x.name.startswith('_d'), model.layers)
    self.hs = [h.output for h in self.dense_layers]
    self.act_layers = filter(lambda x: x.name.startswith('_a'), model.layers)
    self.f_acts = self.f_jac = self.f_jac_hess = self.f_act = None

    vec = K.ones_like(self.model.input)
    self.Js = [T.Rop(h, self.model.input, vec) for h in self.hs]
    self.Hs = [T.Rop(J, self.model.input, vec) for J in self.Js]
def Gv_step(*gv_args):
    idx = TT.cast(gv_args[0], 'int32')
    nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
               for x in loc_inputs]
    replace = dict(zip(model.inputs, nw_inps))
    nw_outs = safe_clone(model.gc_outs, replace)
    final_results = dict(zip(model.params, [None] * len(model.params)))
    for nw_out, out_operator in zip(nw_outs, model.gc_outs_operator):
        loc_params = [x for x in model.params
                      if x in theano.gof.graph.inputs([nw_out])]
        loc_args = [x for x, y in zip(args, model.params)
                    if y in theano.gof.graph.inputs([nw_out])]
        if out_operator == 'softmax':
            factor = const(options['cbs']) * (nw_out + eps)
        elif out_operator == 'sigmoid':
            factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
        else:
            factor = const(options['cbs'])

        if out_operator != 'sigmoid':
            loc_Gvs = TT.Lop(nw_out, loc_params,
                             TT.Rop(nw_out, loc_params, loc_args) / factor)
        else:
            tnwout = TT.nnet.sigmoid(nw_out)
            loc_Gvs = TT.Lop(nw_out, loc_params,
                             TT.Rop(nw_out, loc_params, loc_args) *
                             tnwout * (1 - tnwout) / factor)

        for lp, lgv in zip(loc_params, loc_Gvs):
            if final_results[lp] is None:
                final_results[lp] = lgv
            else:
                final_results[lp] += lgv

    Gvs = [ogv + final_results[param]
           for (ogv, param) in zip(gv_args[1:], model.params)]
    return [gv_args[0] + const(1)] + Gvs

    # Alternative Gauss-Newton formulation via the pre-activation output.
    # Note: this block is unreachable after the return above.
    nw_cost, nw_preactiv_out = safe_clone(
        [model.train_cost, model.preactiv_out], replace)
    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                    TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                           model.params, args))
    Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
    return [gv_args[0] + const(1)] + Gvs
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
    vv = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input="ignore")
    J, _ = theano.scan(
        lambda i, y, x: tensor.grad(y[i], x),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.x],
    )
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input="ignore")

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert np.allclose(v1, v2), "ROP mismatch: %s %s" % (v1, v2)

    known_fail = False
    try:
        tensor.Rop(theano.clone(y, replace={self.x: break_op(self.x)}),
                   self.x, self.v)
    except ValueError:
        known_fail = True

    # TEST LOP
    vx = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
    vv = np.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input="ignore")
    J, _ = theano.scan(
        lambda i, y, x: tensor.grad(y[i], x),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.x],
    )
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert np.allclose(v1, v2), "LOP mismatch: %s %s" % (v1, v2)

    if known_fail:
        pytest.skip("Rop does not handle non-differentiable inputs "
                    "correctly. Bug exposed by fixing Add.grad method.")
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)
    yv = tensor.Rop(y, mx, mv)
    yv2 = tensor.Rop_via_Lop(y, mx, mv)
    rop_f = function([mx, mv], [yv, yv2])

    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = np.random.RandomState(utt.fetch_seed())
    vx = np.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = np.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = scan_f(vx, vv)
    v2, v3 = rop_f(vx, vv)
    assert _allclose(v2, v1), ('Rop mismatch: %s %s' % (v2, v1))
    assert _allclose(v3, v1), ('Rop_via_Lop mismatch: %s %s' % (v3, v1))

    raised = False
    try:
        tensor.Rop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception('Op did not raise an error even though the function'
                        ' is not differentiable')

    raised = False
    try:
        tensor.Rop_via_Lop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except theano.gradient.NullTypeGradError:
        raised = True
    except theano.gradient.DisconnectedInputError:
        raised = True
    if not raised:
        raise Exception('Rop_via_Lop for Op did not raise an error even though'
                        ' the function is not differentiable')

    vv = np.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def Gvs(self, *args):
    # Contribution of hid_sig
    nw_args1 = TT.Lop(self.hid_sig, self.params,
                      TT.Rop(self.hid_sig, self.params, args) /
                      ((1 - self.hid_sig) * self.hid_sig * self.mbs))
    nw_args2 = TT.Lop(self.hid_sftmax, self.params,
                      TT.Rop(self.hid_sftmax, self.params, args) /
                      (self.hid_sftmax * self.mbs))
    return [x + y for x, y in zip(nw_args1, nw_args2)]
def compute_Gv(*args):
    (hid_sig, hid_sftmax) = self.get_hiddens()
    nw_args1 = TT.Lop(hid_sig, self.params,
                      TT.Rop(hid_sig, self.params, args) /
                      ((1 - hid_sig) * hid_sig * self.batchsize))
    nw_args2 = TT.Lop(hid_sftmax, self.params,
                      TT.Rop(hid_sftmax, self.params, args) /
                      (hid_sftmax * self.batchsize))
    fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
    new_vals = safe_clone(fin_vals, [self.X, self.Y], [self.loc_x, self.loc_y])
    return new_vals, {}
def gauss_vect_mult(v):
    """
    Multiply a vector by the Gauss-Newton matrix J' H J, where J is the
    Jacobian between output and params and H is the Hessian between costs
    and output. H should be diagonal and positive. Also add the ridge term.
    """
    Jv = T.Rop(output, params, v)
    HJv = T.Rop(T.grad(opt_cost, output), output, Jv)
    JHJv = T.Lop(output, params, HJv)
    if not isinstance(JHJv, list):
        JHJv = [JHJv]
    JHJv = [a + ridge * b for a, b in zip(JHJv, v)]
    return JHJv
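# A minimal, self-contained sketch of the same pattern on a hypothetical toy
# model (the names x, t, W, out, cost below are illustrative, not taken from
# the snippet above): a Gauss-Newton vector product G v = J' H J v built from
# one forward-mode Rop, one Rop through the output-space gradient, and one Lop.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')                                   # inputs
t = T.matrix('t')                                   # targets
W = theano.shared(np.random.randn(5, 3).astype(theano.config.floatX), name='W')
out = T.nnet.sigmoid(T.dot(x, W))                   # model output
cost = ((out - t) ** 2).mean()                      # scalar cost

v = T.matrix('v')                                   # direction, same shape as W
Jv = T.Rop(out, W, v)                               # J v      (forward mode)
HJv = T.Rop(T.grad(cost, out), out, Jv)             # H J v    (Hessian of cost w.r.t. output)
Gv = T.Lop(out, W, HJv)                             # J' H J v (reverse mode)
gauss_newton_vec = theano.function([x, t, v], Gv)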
def _compute_nary_hessian_vector_product(self, gradients, arguments):
    """Returns a function accepting `2 * len(arguments)` arguments to compute
    a Hessian-vector product of a multivariate function.

    Notes
    -----
    The implementation is based on TensorFlow's '_hessian_vector_product'
    function in 'tensorflow.python.ops.gradients_impl'.
    """
    argument_types = [argument.type() for argument in arguments]
    try:
        Rop = T.Rop(gradients, arguments, argument_types)
    except NotImplementedError:
        proj = [
            T.sum(gradient * disconnected_grad(argument_type))
            for gradient, argument_type in zip(gradients, argument_types)
        ]
        proj_grad = [
            T.grad(proj_elem, arguments,
                   disconnected_inputs="ignore",
                   return_disconnected="None")
            for proj_elem in proj
        ]
        proj_grad_transpose = map(list, zip(*proj_grad))
        proj_grad_stack = [
            T.stacklists([c for c in row if c is not None])
            for row in proj_grad_transpose
        ]
        Rop = [T.sum(stack, axis=0) for stack in proj_grad_stack]
    return self._compile_function_without_warnings(
        list(itertools.chain(arguments, argument_types)), Rop)
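# A minimal sketch of the fallback idea used above when an op in the graph has
# no R_op: the Hessian-vector product H v is computed as the gradient of the
# inner product <grad f, v>, i.e. two reverse-mode passes. Names here are
# illustrative, not part of the library code above.
import numpy as np
import theano
import theano.tensor as T
from theano.gradient import disconnected_grad

x = T.vector('x')
v = T.vector('v')
f = T.sum(T.exp(x) * T.sin(x))                      # any scalar objective
g = T.grad(f, x)                                    # first reverse-mode pass
Hv = T.grad(T.sum(g * disconnected_grad(v)), x)     # second pass: grad of <g, v>
hess_vec = theano.function([x, v], Hv)

x0 = np.zeros(3, dtype=theano.config.floatX)
v0 = np.ones(3, dtype=theano.config.floatX)
print(hess_vec(x0, v0))   # f'' = 2 exp(x) cos(x), so the result at 0 is [2, 2, 2]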
def _get_updates_for(self, param, grad):
    D_tm1 = shared_like(param, 'D_ewma')
    Hv = TT.Rop(grad, param, self.rng.normal(param.shape))
    D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
    den = TT.sqrt(D_t) + self.epsilon
    yield D_tm1, D_t
    yield param, param - grad * self.learning_rate / den
def get_theano_fn(self, args, kwargs):
    self.trace(*args, **kwargs)
    fn_inputs, fn_outputs, graph = self.get_theano_variables(
        self.s_inputs, self.s_outputs)

    if np.any([o.ndim != 0 for o in fn_outputs]):
        raise TypeError('HessianVector requires scalar outputs.')

    # get wrt variables. If none were specified, use inputs.
    wrt = utils.as_seq(self.wrt)
    if len(wrt) == 0:
        wrt = [i for i in fn_inputs]
    else:
        wrt = [graph[self.get_symbolic(w)] for w in wrt]

    grads = utils.flat_from_doc([tt.grad(o, wrt=wrt) for o in fn_outputs])

    sym_vecs = tuple(tt.TensorType(dtype=w.dtype,
                                   broadcastable=[False] * w.ndim)()
                     for w in wrt)
    hess_vec = tt.Rop(grads, wrt, sym_vecs)

    if len(hess_vec) == 1:
        hess_vec = hess_vec[0]

    # compile function
    fn = theano.function(inputs=fn_inputs + sym_vecs,
                         outputs=hess_vec,
                         on_unused_input='ignore')

    return fn
def test_theano_operator():
    """Test the ODL->Theano operator wrapper."""
    # Define ODL operator
    matrix = np.random.rand(3, 2)
    odl_op = odl.MatrixOperator(matrix)

    # Define evaluation points
    x = [1., 2.]
    dy = [1., 2., 3.]

    # Create Theano placeholders
    x_theano = T.dvector()
    dy_theano = T.dvector()

    # Create Theano layer from odl operator
    odl_op_layer = odl.contrib.theano.TheanoOperator(odl_op)

    # Build computation graphs. The second function applies the adjoint of
    # the derivative to dy, which is what Theano's Lop computes.
    y_theano = odl_op_layer(x_theano)
    y_theano_func = theano.function([x_theano], y_theano)
    dy_theano_func = theano.function([x_theano, dy_theano],
                                     T.Lop(y_theano, x_theano, dy_theano))

    # Evaluate using Theano
    result = y_theano_func(x)
    expected = odl_op(x)
    assert all_almost_equal(result, expected)

    # Evaluate the adjoint of the derivative, called gradient in Theano
    result = dy_theano_func(x, dy)
    expected = odl_op.derivative(x).adjoint(dy)
    assert all_almost_equal(result, expected)
def hessian_rop_wrt_list(cost, wrt_list, v, g_vec=None, g_list=None):
    """
    Compute an expression for the Hessian of cost with respect to wrt_list,
    right-multiplied by a column vector v.
    """
    if wrt_list == []:
        raise Exception("wrt_list must not be empty!")

    if g_vec is None:
        if g_list is None:
            g_list = T.grad(cost, wrt_list)
        g_vec = T.concatenate(g_list, axis=0)

    # Compute the Hessian-vector product via Rop.
    # Flatten each wrt variable so they can be concatenated into one vector.
    wrt_flat = []
    for wrt in wrt_list:
        if wrt.ndim < 1:
            wrt = T.shape_padright(wrt, n_ones=1)
        elif wrt.ndim > 1:
            wrt = T.flatten(wrt)
        wrt_flat.append(wrt)

    # Concatenate wrt into a single vector
    wrt = T.concatenate(wrt_flat, axis=0)

    # Compute the Rop
    Hv = T.Rop(g_vec, wrt, v)
    return Hv
def hessian(objective, argument):
    """
    Compute the directional derivative of the gradient (which is equal to
    the Hessian multiplied by the direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same
    # dimensionality) as argument.
    A = argument.type()

    try:
        # First attempt the efficient 'R-op': it directly calculates the
        # directional derivative of the gradient, rather than explicitly
        # calculating the Hessian and then multiplying.
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        shp = T.shape(argument)
        H = T.jacobian(g.flatten(), argument).reshape(
            T.concatenate([shp, shp]), 2 * A.ndim)
        R = T.tensordot(H, A, A.ndim)

    try:
        hess = theano.function([argument, A], R, on_unused_input='raise')
    except theano.compile.UnusedInputError:
        warn('Theano detected unused input - suggests hessian may be zero or '
             'constant.')
        hess = theano.function([argument, A], R, on_unused_input='ignore')

    return hess
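# Hypothetical usage sketch of the `hessian` helper above, assuming it is in
# scope: the returned compiled function maps (point, direction) to the
# Hessian-vector product of the objective. The objective below is illustrative.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
hess_vec = hessian(T.sum(x ** 4), x)       # objective f(x) = sum(x_i ** 4)

point = np.array([1.0, 2.0, 3.0]).astype(theano.config.floatX)
direction = np.array([1.0, 0.0, 0.0]).astype(theano.config.floatX)
# The Hessian is diag(12 * x**2), so the expected result is [12, 0, 0].
print(hess_vec(point, direction))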
def Gv_step(*gv_args):
    idx = TT.cast(gv_args[0], 'int32')
    nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
               for x in loc_inputs]
    replace = dict(zip(model.inputs, nw_inps))
    nw_outs = safe_clone(model.outs, replace)
    final_results = dict(zip(model.params, [None] * len(model.params)))
    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
        loc_params = [x for x in model.params
                      if x in theano.gof.graph.inputs([nw_out])]
        loc_args = [x for x, y in zip(cgv, model.params)
                    if y in theano.gof.graph.inputs([nw_out])]
        if out_operator == 'softmax':
            factor = const(options['cbs']) * nw_out
        elif out_operator == 'sigmoid':
            factor = const(options['cbs']) * nw_out * (1 - nw_out)
        else:
            factor = const(options['cbs'])
        loc_Gvs = TT.Lop(nw_out, loc_params,
                         TT.Rop(nw_out, loc_params, loc_args) / factor)
        for lp, lgv in zip(loc_params, loc_Gvs):
            if final_results[lp] is None:
                final_results[lp] = lgv
            else:
                final_results[lp] += lgv
    Gvs = [ogv + final_results[param]
           for (ogv, param) in zip(gv_args[1:], model.params)]
    return [gv_args[0] + const(1)] + Gvs
def compute_Ax(x):
    # There are three ways to compute the Fisher-vector product:
    # 1. https://github.com/joschu/modular_rl/blob/master/modular_rl/trpo.py#L54
    #    Use theano.gradient.disconnected_grad and call theano.tensor.grad() twice.
    #    WARNING: In our case (with the attention mechanism) it is extremely slow.
    # 2. http://deeplearning.net/software/theano/tutorial/gradients.html#hessian-times-a-vector
    #    Use only theano.tensor.Rop, but you will need to calculate the fixed_output
    #    outside of the compiled function, because disconnected_grad will not work with Rop.
    # 3. https://github.com/pascanur/natgrad/blob/master/model_convMNIST_standard.py
    #    Rop divided by the output, because the metric F is based on the gradient of
    #    log(output). Here we also split the vector of parameters. Not checked, but it
    #    may be faster than supplying several vectors to minresQLP.
    xs = []
    offset = 0
    for p in params:
        shape = p.get_value().shape
        size = np.prod(shape)
        xs.append(x[offset:offset + size].reshape(shape))
        offset += size
    jvp = T.Rop(new_output, params, xs) / (
        new_output * self.batch_size * self.history + TINY)
    fvp = T.Lop(new_output, params, jvp)
    fvp = T.concatenate([g.flatten() for g in fvp])
    return [fvp], {}
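# A minimal sketch of variant 3 above on a hypothetical toy softmax model:
# the Fisher-vector product F v = J' diag(1/p) J v, obtained by dividing the
# forward-mode Rop by the output before the reverse-mode Lop. The softmax is
# written with elementwise ops so every node in the graph has an R_op. All
# names below are illustrative, not from the snippet above.
import numpy as np
import theano
import theano.tensor as T

obs = T.matrix('obs')
W = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX), name='W')
z = T.dot(obs, W)
p = T.exp(z) / T.exp(z).sum(axis=1, keepdims=True)   # softmax, elementwise form

v = T.matrix('v')                                     # direction, same shape as W
batch = T.cast(obs.shape[0], theano.config.floatX)
Jv = T.Rop(p, W, v)                                   # J v
Fv = T.Lop(p, W, Jv / (p * batch))                    # J' diag(1/p) J v, batch-averaged
fisher_vec = theano.function([obs, v], Fv)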
def buildObjective(self):
    """
    Construct Theano expressions for the loss, the gradient and the
    Gauss-Newton matrix-vector multiplication.
    """
    p = T.vector(name='p')
    (bhid, bvis, W) = self.unwrap(p)

    X = T.matrix(name='X')
    y = T.nnet.sigmoid(T.dot(X, W) + bhid)
    zinner = T.dot(y, W.T) + bvis
    z = T.nnet.sigmoid(zinner)

    L = -T.sum(X * T.log(z) + (1 - X) * T.log(1 - z), axis=1)
    loss = T.sum(L) / self.n
    loss += 0.5 * self.reg * T.dot(p, p)

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    g = T.grad(loss, p)
    self.obj = function([X, p], (loss, g))

    v = T.vector(name='v')
    # Essentially the Jacobian right up to before the final sigmoid is used,
    # in the form J.T*H*J, where H is the Hessian of just the last
    # nonlinearity.
    Jv = T.Rop(zinner, p, v)
    HJv = T.grad(T.sum(T.grad(loss, zinner) * Jv), zinner,
                 consider_constant=[Jv])
    Gp = T.grad(T.sum(HJv * zinner), p, consider_constant=[HJv, Jv])
    Gp = Gp + self.reg * v
    self.gnprod = function([X, p, v], Gp)
def build_hess_p(x, mask, ctx, cost):
    p = tensor.matrix(name='p', dtype='float32')
    ctx_grad = tensor.grad(cost, ctx)
    ctx_hess_p = tensor.Rop(ctx_grad, ctx, p)
    f_ctx_hess_p = theano.function([x, mask, ctx, p], ctx_hess_p)
    return f_ctx_hess_p
def _compile_theano_functions(self):
    p = self.number_dense_jacob_columns
    u = tt.vector('u')
    y = self.generator(u, self.constants)
    u_rep = tt.tile(u, (p, 1))
    y_rep = self.generator(u_rep, self.constants)
    diag_jacob = tt.grad(tt.sum(y), u)[p:]
    m = tt.zeros((p, u.shape[0]))
    m = tt.set_subtensor(m[:p, :p], tt.eye(p))
    dense_jacob = tt.Rop(y_rep, u_rep, m).T
    energy = self.base_energy(u) + (
        0.5 * tt.log(nla.det(
            tt.eye(p) + (dense_jacob.T / diag_jacob**2).dot(dense_jacob)
        )) +
        tt.log(diag_jacob).sum()
    )
    energy_grad = tt.grad(energy, u)
    dy_du = tt.join(1, dense_jacob, tt.diag(diag_jacob))
    self.generator_func = _timed_func_compilation(
        [u], y, 'generator function')
    self.generator_jacob = _timed_func_compilation(
        [u], dy_du, 'generator Jacobian')
    self._energy_grad = _timed_func_compilation(
        [u], energy_grad, 'energy gradient')
    self.base_energy_func = _timed_func_compilation(
        [u], self.base_energy(u), 'base energy function')
def check_nondiff_rop(self, y):
    """
    If your op is not differentiable (so you can't define Rop),
    test that an error is raised.
    """
    with pytest.raises(ValueError):
        tensor.Rop(y, self.x, self.v)
def __init__(self, t_cost, t_traj_info, t_inputs, params, reg=1e-5):
    t_new_params = [
        _np2theano(p.name, p.get_value(borrow=True)) for p in params
    ]

    t_mean = t_traj_info['act_mean']
    t_mean = t_mean.reshape((-1, t_mean.shape[-1]))
    t_logstd = t_traj_info['act_logstd']
    t_logstd = t_logstd.reshape((-1, t_logstd.shape[-1]))
    t_new_mean = t_traj_info['new_act_mean']
    t_new_mean = t_new_mean.reshape((-1, t_new_mean.shape[-1]))
    t_new_logstd = t_traj_info['new_act_logstd']
    t_new_logstd = t_new_logstd.reshape((-1, t_new_logstd.shape[-1]))

    print 'Compiling cost function ... ',
    s = time()
    self.cost = theano.function(inputs=t_inputs,
                                outputs=t_cost,
                                on_unused_input='ignore')
    print 'finished in %f seconds' % (time() - s)

    print 'Building cost grad function ... ',
    s = time()
    _t_cost_grad = T.grad(-t_cost, wrt=params)
    print 'finished in %f seconds' % (time() - s)

    print 'Compiling cost grad function ... ',
    s = time()
    self._cost_grad = theano.function(inputs=t_inputs,
                                      outputs=[t_cost] + _t_cost_grad,
                                      on_unused_input='ignore')
    print 'finished in %f seconds' % (time() - s)

    print 'Building Hx function ... ',
    s = time()
    mu = T.concatenate([t_new_mean, t_new_logstd], axis=-1)
    Jx = sum([T.Rop(mu, p, x) for (p, x) in zip(params, t_new_params)])
    M = T.tile(T.eye(2), (mu.shape[0], 1, 1))
    Jx = Jx.reshape((Jx.shape[0], Jx.shape[1], 1))
    Jx = T.tile(Jx, (1, 1, Jx.shape[1]))
    MJx = Jx
    JMJx = [
        T.Lop(MJx, p, x, disconnected_inputs='ignore')
        for (p, x) in zip(params, t_new_params)
    ]
    Hx = [h + reg * p for (h, p) in zip(JMJx, t_new_params)]
    print 'finished in %f seconds' % (time() - s)

    # TODO: Use mask to handle different lengths.
    print 'Compiling Hx function ...',
    s = time()
    self._constraint_Hx = theano.function(inputs=t_inputs + t_new_params,
                                          outputs=Hx,
                                          on_unused_input='ignore')
    self.constraint_Hx = lambda inputs, params: self._constraint_Hx(
        *(inputs + params))
    print 'finished in %f seconds' % (time() - s)
def gauss_newton_product(cost, p, v, s):
    Jv = T.Rop(s, p, v)                       # J v
    HJv = T.grad(T.sum(T.grad(cost, s) * Jv), s,
                 consider_constant=[Jv],
                 disconnected_inputs='ignore')  # H J v
    Gv = T.grad(T.sum(HJv * s), p,
                consider_constant=[HJv, Jv],
                disconnected_inputs='ignore')   # J' H J v
    Gv = map(T.as_tensor_variable, Gv)  # for CudaNdarray
    return Gv
def test_Rop_dot_bug_18Oct2013_Jeremiah(self):
    # This test refers to a bug reported by Jeremiah Lowin on 18th Oct 2013.
    # The bug occurs when, through a dot operation, there is only one
    # differentiable path (i.e. there is no gradient w.r.t. one of the
    # inputs).
    x = tensor.arange(20.0).reshape([1, 20])
    v = theano.shared(np.ones([20]))
    d = tensor.dot(x, v).sum()
    tensor.Rop(tensor.grad(d, v), v, v)
def test_conv(self):
    for conv_op in [conv.conv2d, conv2d]:
        for border_mode in ["valid", "full"]:
            image_shape = (2, 2, 4, 5)
            filter_shape = (2, 2, 2, 3)
            image_dim = len(image_shape)
            filter_dim = len(filter_shape)
            input = tensor.TensorType(theano.config.floatX,
                                      [False] * image_dim)(name="input")
            filters = tensor.TensorType(theano.config.floatX,
                                        [False] * filter_dim)(name="filter")
            ev_input = tensor.TensorType(theano.config.floatX,
                                         [False] * image_dim)(name="ev_input")
            ev_filters = tensor.TensorType(theano.config.floatX,
                                           [False] * filter_dim)(name="ev_filters")

            def sym_conv2d(input, filters):
                return conv_op(input, filters, border_mode=border_mode)

            output = sym_conv2d(input, filters).flatten()
            yv = tensor.Rop(output, [input, filters], [ev_input, ev_filters])
            mode = None
            if theano.config.mode == "FAST_COMPILE":
                mode = "FAST_RUN"
            rop_f = function(
                [input, filters, ev_input, ev_filters],
                yv,
                on_unused_input="ignore",
                mode=mode,
            )
            sy, _ = theano.scan(
                lambda i, y, x1, x2, v1, v2: (tensor.grad(y[i], x1) * v1).sum()
                + (tensor.grad(y[i], x2) * v2).sum(),
                sequences=tensor.arange(output.shape[0]),
                non_sequences=[output, input, filters, ev_input, ev_filters],
                mode=mode,
            )
            scan_f = function(
                [input, filters, ev_input, ev_filters],
                sy,
                on_unused_input="ignore",
                mode=mode,
            )
            dtype = theano.config.floatX
            image_data = np.random.random(image_shape).astype(dtype)
            filter_data = np.random.random(filter_shape).astype(dtype)
            ev_image_data = np.random.random(image_shape).astype(dtype)
            ev_filter_data = np.random.random(filter_shape).astype(dtype)
            v1 = rop_f(image_data, filter_data, ev_image_data, ev_filter_data)
            v2 = scan_f(image_data, filter_data, ev_image_data, ev_filter_data)
            assert np.allclose(v1, v2), "Rop mismatch: %s %s" % (v1, v2)
def test_rop_lop():
    mx = tensor.matrix("mx")
    mv = tensor.matrix("mv")
    v = tensor.vector("v")
    y = matrix_inverse(mx).sum(axis=0)
    yv = tensor.Rop(y, mx, mv)
    rop_f = function([mx, mv], yv)
    sy, _ = theano.scan(
        lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, mx, mv],
    )
    scan_f = function([mx, mv], sy)

    rng = np.random.RandomState(utt.fetch_seed())
    vx = np.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = np.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), "ROP mismatch: %s %s" % (v1, v2)

    raised = False
    try:
        tensor.Rop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception("Op did not raise an error even though the function"
                        " is not differentiable")

    vv = np.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), "LOP mismatch: %s %s" % (v1, v2)
def test_downsample(self):
    rng = np.random.RandomState(utt.fetch_seed())
    # ws, shp
    examples = (
        ((2,), (16,)),
        ((2,), (4, 16)),
        ((2,), (4, 2, 16)),
        ((1, 1), (4, 2, 16, 16)),
        ((2, 2), (4, 2, 16, 16)),
        ((3, 3), (4, 2, 16, 16)),
        ((3, 2), (4, 2, 16, 16)),
        ((3, 2, 2), (3, 2, 16, 16, 16)),
        ((2, 3, 2), (3, 2, 16, 16, 16)),
        ((2, 2, 3), (3, 2, 16, 16, 16)),
        ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
    )

    for example, ignore_border in itertools.product(examples, [True, False]):
        (ws, shp) = example
        vx = rng.rand(*shp)
        vex = rng.rand(*shp)

        x = theano.shared(vx)
        ex = theano.shared(vex)

        maxpool_op = Pool(ignore_border, ndim=len(ws))
        a_pooled = maxpool_op(x, ws).flatten()
        yv = tensor.Rop(a_pooled, x, ex)
        mode = None
        if theano.config.mode == "FAST_COMPILE":
            mode = "FAST_RUN"
        rop_f = function([], yv, on_unused_input="ignore", mode=mode)
        sy, _ = theano.scan(
            lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
            sequences=tensor.arange(a_pooled.shape[0]),
            non_sequences=[a_pooled, x, ex],
            mode=mode,
        )
        scan_f = function([], sy, on_unused_input="ignore", mode=mode)

        v1 = rop_f()
        v2 = scan_f()
        assert np.allclose(v1, v2), f"Rop mismatch: {v1} {v2}"
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(
            theano.clone(y, replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise KnownFailureTest("Rop doesn't handle non-differentiable "
                               "inputs correctly. Bug exposed by fixing "
                               "Add.grad method.")
def test_invalid_input(self):
    success = False
    try:
        tensor.Rop(0., [tensor.matrix()], [tensor.vector()])
        success = True
    except ValueError:
        pass
    assert not success
def check_nondiff_rop(self, y):
    """
    If your op is not differentiable (so you can't define Rop),
    test that an error is raised.
    """
    raised = False
    try:
        tmp = tensor.Rop(y, self.x, self.v)
    except ValueError:
        raised = True
    if not raised:
        self.fail('Op did not raise an error even though the function'
                  ' is not differentiable')
def __call__(self, v, cost, parameters, damp):
    # compute the Gauss-Newton matrix right-multiplied by `v`
    Jv = tt.Rop(self._s, parameters, v)
    HJv = tt.grad(tt.sum(tt.grad(cost, self._s) * Jv), self._s,
                  consider_constant=[Jv])
    JHJv = tt.grad(tt.sum(HJv * self._s), parameters,
                   consider_constant=[HJv, Jv])

    # apply Tikhonov damping
    JHJv = [JHJvi + damp * vi for JHJvi, vi in zip(JHJv, v)]
    return JHJv