def make_w_updates(self, loss, params):
    """Build the update dictionary for one variance-reduced weight step.

    For every parameter a snapshot copy (``param_tilde``) is created, the
    loss is cloned onto those snapshots, and the step
    ``param - (1 / self.L) * (grad - grad_tilde + mu)`` is registered.  The
    snapshot is refreshed with the new parameter value whenever the
    iteration counter is a multiple of ``self.m``.

    :param loss: symbolic scalar loss expressed in terms of ``params``.
    :param params: list of shared variables to update.
    :return: ``OrderedDict`` mapping shared variables to their new values.

    Side effect: stores the iteration counter shared variable on
    ``self.it_num``.
    """
    w_updates = OrderedDict()

    params_tilde = [theano.shared(x.get_value()) for x in params]
    # theano.clone only accepts a dict, list or tuple for ``replace``.  On
    # Python 3 ``zip`` is a lazy iterator, so it has to be materialized.
    loss_tilde = theano.clone(loss, replace=list(zip(params, params_tilde)))

    grads = theano.grad(loss, params)
    grads_tilde = theano.grad(loss_tilde, params_tilde)

    # 0-d int16 array, same value/dtype as the removed ``np.cast['int16'](0)``.
    it_num = theano.shared(np.asarray(0, dtype='int16'))
    it = it_num + 1

    for param, grad, mu, param_tilde, grad_tilde in zip(
            params, grads, self.mu, params_tilde, grads_tilde):
        # Step size is 1 / L rather than a fixed learning rate.
        new_param = param - (1. / self.L) * (grad - grad_tilde + mu)
        w_updates[param] = new_param
        # Refresh the snapshot only every ``self.m`` iterations.
        w_updates[param_tilde] = ifelse(T.eq(it % self.m, 0),
                                        new_param, param_tilde)

    # Two gradient evaluations (current point and snapshot) per step.
    w_updates[self.counted_gradient] = self.counted_gradient + 2

    if self.adaptive:
        w_updates[self.L] = self.L / 2

    self.it_num = it_num
    w_updates[it_num] = it
    return w_updates
def test_prod_no_zeros_in_input(self):
    """Check values and gradients of ``Prod`` built with ``no_zeros_in_input=True``.

    Covers the row-wise product, the first and second derivatives of the
    full product against hard-coded expected values, and numerical gradient
    verification of both op variants and of the first derivative.
    """
    sym = theano.tensor.dmatrix()
    values = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')

    # Row-wise product.
    row_prod = Prod(axis=1, no_zeros_in_input=True)(sym)
    compiled = theano.function([sym], row_prod, mode=self.mode)
    assert numpy.allclose(compiled(values), [6, 120, 504])

    # Product over all elements, plus its first and second derivatives.
    full_prod = Prod(no_zeros_in_input=True)(sym)
    first_deriv = theano.grad(full_prod, sym)
    second = theano.grad(first_deriv.sum(), sym)

    expected_first = [[362880., 181440., 120960.],
                      [90720., 72576., 60480.],
                      [51840., 45360., 40320.]]
    compiled = theano.function([sym], first_deriv, mode=self.mode)
    assert numpy.allclose(compiled(values), expected_first)

    expected_second = [[663696., 422568., 301872.],
                       [233964., 190800., 161016.],
                       [139248., 122652., 109584.]]
    compiled = theano.function([sym], second, mode=self.mode)
    assert numpy.allclose(compiled(values), expected_second)

    # Numerical verification of the gradients of both op variants.
    unittest_tools.verify_grad(Prod(axis=1, no_zeros_in_input=True),
                               [values], mode=self.mode)
    unittest_tools.verify_grad(Prod(no_zeros_in_input=True),
                               [values], mode=self.mode)

    def second_deriv(x):
        # Verifying the gradient of the gradient exercises the second order.
        return theano.grad(Prod(no_zeros_in_input=True)(x), x)

    unittest_tools.verify_grad(second_deriv, [values], mode=self.mode)
def test_grad_types(self):
    """Gradients of the AbstractConv ops must keep the type of their input.

    This only tests the behaviour of the AbstractConv ops themselves, not
    their optimizations.  Every CPU/GPU combination of the two
    differentiated arguments is tried for the forward op, the
    weight-gradient op and the input-gradient op.
    """
    cpu_input = tensor.ftensor4()
    cpu_filters = tensor.ftensor4()
    cpu_topgrad = tensor.ftensor4()
    gpu_input = gpu_ftensor4()
    gpu_filters = gpu_ftensor4()
    gpu_topgrad = gpu_ftensor4()

    out_shape = tensor.lvector()

    def assert_same_type(grad, var):
        assert grad.type == var.type, (grad, grad.type, var, var.type)

    input_choices = (cpu_input, gpu_input)
    filter_choices = (cpu_filters, gpu_filters)
    topgrad_choices = (cpu_topgrad, gpu_topgrad)

    # Gradient of the forward conv2d.
    for inp, flt in itertools.product(input_choices, filter_choices):
        output = conv.conv2d(inp, flt)
        g_inp, g_flt = theano.grad(output.sum(), wrt=(inp, flt))
        assert_same_type(g_inp, inp)
        assert_same_type(g_flt, flt)

    # Gradient of gradweight.
    for inp, top in itertools.product(input_choices, topgrad_choices):
        grad_filters = conv.AbstractConv2d_gradWeights()(inp, top, out_shape)
        g_inp, g_top = theano.grad(grad_filters.sum(), wrt=(inp, top))
        assert_same_type(g_inp, inp)
        assert_same_type(g_top, top)

    # Gradient of gradinputs.
    for flt, top in itertools.product(filter_choices, topgrad_choices):
        grad_input = conv.AbstractConv2d_gradInputs()(flt, top, out_shape)
        g_flt, g_top = theano.grad(grad_input.sum(), wrt=(flt, top))
        assert_same_type(g_flt, flt)
        assert_same_type(g_top, top)
def test_fill_grad(self):
    """Regression test: the gradient of ``second`` must build without error.

    Bug reported at
    https://groups.google.com/d/topic/theano-users/nQshB8gUA6k/discussion
    """
    variables = {}
    for name in ('x', 'y'):
        # Middle dimension is flagged broadcastable.
        variables[name] = TensorType(config.floatX, [0, 1, 0])(name)

    filled = tensor.second(variables['x'], variables['y'])
    # Only graph construction is checked; nothing is compiled or evaluated.
    theano.grad(filled.sum(), variables['y'])
def test_dnn_conv_merge():
    """Check that the alpha/output merge optimizations fuse into the cuDNN ops.

    With the default GPU mode the three outputs must be produced through
    GpuDnnConv / GpuDnnConvGradW / GpuDnnConvGradI.  With the merge
    optimizations excluded they must not be, yet both compiled functions
    must return the same values.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    batch, channels, n_filters = 1, 4, 3
    img_h, img_w = 5, 8
    kern_h, kern_w = 2, 6

    img_val = numpy.random.random(
        (batch, channels, img_h, img_w)).astype("float32")
    kern_val = numpy.random.random(
        (n_filters, channels, kern_h, kern_w)).astype("float32")
    out_val = numpy.random.random(
        (batch, n_filters, img_h - kern_h + 1, img_w - kern_w + 1)
    ).astype("float32")

    conv = dnn.dnn_conv(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)

    lr = numpy.asarray(0.05, dtype="float32")

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr = conv + out
        wr = kern + gw
        ir = img + gi
    else:
        fr = lr * (conv + out)
        wr = kern + lr * gw
        ir = img + lr * gi

    expected_ops = (dnn.GpuDnnConv, dnn.GpuDnnConvGradW, dnn.GpuDnnConvGradI)

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    for position, op_class in enumerate(expected_ops):
        producer = f1.maker.fgraph.outputs[position].owner.inputs[0].owner.op
        assert isinstance(producer, op_class)

    # Same graph, but with every merge optimization switched off.
    mode = mode_with_gpu
    for opt_name in ("local_dnn_conv_alpha_merge",
                     "local_dnn_convw_alpha_merge",
                     "local_dnn_convi_alpha_merge",
                     "local_dnn_conv_output_merge",
                     "local_dnn_convw_output_merge",
                     "local_dnn_convi_output_merge"):
        mode = mode.excluding(opt_name)

    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)
    for position, op_class in enumerate(expected_ops):
        producer = f2.maker.fgraph.outputs[position].owner.inputs[0].owner.op
        assert not isinstance(producer, op_class)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)
    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
def update_opt(
        self, loss, target, leq_constraint, inputs, extra_inputs=None,
        constraint_name="constraint", *args, **kwargs
):
    """
    Set up the lazily compiled functions used by the constrained optimizer.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
    :class:`rllab.core.parameterized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
    that the first dimension of these inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
    :param constraint_name: Name stored on the optimizer for the constraint.
    :return: No return value.

    ``*args`` and ``**kwargs`` are accepted but not used here.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params)
    flat_grad = ext.flatten_tensor_variables(grads)

    constraint_grads = theano.grad(constraint_term, wrt=params)
    # One placeholder per parameter holding the vector to multiply with.
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
    # Differentiating sum_i <grad_i, x_i> w.r.t. the parameters gives the
    # product of the constraint's second derivative with x.  The builtin
    # ``zip`` is used because ``itertools.izip`` is gone on Python 3.
    Hx_plain_splits = TT.grad(
        TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
        wrt=params
    )
    Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    if self._debug_nan:
        # Imported lazily so it is only needed when NaN debugging is on.
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    else:
        mode = None

    # Each entry is a thunk; presumably ext.lazydict only invokes it on
    # first access, so unused functions are never compiled.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
            mode=mode
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
            mode=mode
        ),
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + extra_inputs + xs,
            outputs=Hx_plain,
            log_name="f_Hx_plain",
            mode=mode
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
            mode=mode
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
            mode=mode
        ),
    )
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    """Compare ``op`` against the reference conv2d, including gradients.

    Builds random float32 image and kernel arrays of the requested shapes,
    evaluates ``op(mode, subsample)`` and ``tensor.nnet.conv2d`` (on the
    flipped kernel) together with their first and, where the reference
    supports it, second order gradients, and asserts pairwise closeness.
    """
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[dim == 1 for dim in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[dim == 1 for dim in npy_kern.shape])()

    def wrt_image_and_kernel(expr):
        # Gradients of the summed expression w.r.t. image, then kernel.
        return theano.grad(expr.sum(), i), theano.grad(expr.sum(), k)

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)

    # try to compile reference implementation without shape,
    # so we don't have to compile hundreds of versions
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode, subsample=subsample)
    try:
        conv_op_di, conv_op_dk = wrt_image_and_kernel(conv_op)
    except Exception:
        # compile with shape information only when needed
        conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                     ishape, kshape, mode, subsample)
        conv_op_di, conv_op_dk = wrt_image_and_kernel(conv_op)

    corr_op_di, corr_op_dk = wrt_image_and_kernel(corr_op)

    # Outputs alternate: candidate op, then reference.
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]

    try:
        conv_op_dik = theano.grad(conv_op_di.sum(), k)
        conv_op_dki = theano.grad(conv_op_dk.sum(), i)
        corr_op_dik = theano.grad(corr_op_di.sum(), k)
        corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    except Exception:
        # skip if the reference implementation can't do it
        pass
    else:
        outputs.extend([corr_op_dik, conv_op_dik,
                        corr_op_dki, conv_op_dki])

    f = theano.function([i, k], outputs,
                        mode=theano_mode.excluding('conv_dnn', 'conv_gemm'))
    allvals = f(npy_img, npy_kern)

    labels = ('top', 'dtop/dbottom', 'dtop/dweight',
              'dtop/dbottom/dweight', 'dtop/dweight/dbottom')
    candidate_vals, reference_vals = allvals[::2], allvals[1::2]
    candidate_vars, reference_vars = outputs[::2], outputs[1::2]
    for a, b, oa, ob, _label in zip(candidate_vals, reference_vals,
                                    candidate_vars, reference_vars, labels):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)
def oneStep(w):
    """Return ``w`` after one variance-reduced step on a random sample.

    Relies on names from the enclosing scope: ``rng``, ``n``, ``objective``,
    ``getpred``, ``data``, ``target``, ``param``, ``learning_rate`` and
    ``mu``.
    """
    # Draw a single index among ``n`` choices.
    t = rng.choice(size=(1,), a=n)

    # Gradient of the sampled loss at the reference point ``param``.
    reference_loss = objective(getpred(data[t], param), target[t]).mean()
    g_tilde = theano.grad(reference_loss, param)

    # Gradient of the same sampled loss at the current iterate ``w``.
    current_loss = objective(getpred(data[t], w), target[t]).mean()
    g = theano.grad(current_loss, w)

    return w - learning_rate * (g - g_tilde + mu)
def test_normal_logEI():
    """Compare ``gpr_math.s_normal_logEI`` against hyperopt's ``logEI_gaussian``.

    The symbolic implementation is compiled and evaluated on 2000 thresholds
    between -10 and 50 with zero mean and unit variance, then checked against
    the reference with ``atol=1e-3, rtol=1e-4``.  The ``if 0:`` sections are
    disabled debugging aids (quadratic fit and plots) and never execute.
    """
    #rng = np.random.RandomState(123)

    N = 2000
    thresh = np.linspace(-10, 50, N)
    #N = 100
    #thresh = np.linspace(37, 38, N)
    mean = thresh * 0
    var = thresh * 0 + 1

    s_t, s_m, s_v = theano.tensor.dvectors('tmv')

    fn = theano.function([s_t, s_m, s_v],
                         gpr_math.s_normal_logEI(s_t, s_m, s_v))

    if 0:
        # Disabled: least-squares fit of a quadratic in the threshold to
        # the logEI values.
        #print zip(thresh, fn(thresh, mean, var))
        #print
        a = theano.tensor.dvector()
        y = s_t ** 2 * a[2] + s_t * a[1] + a[0]
        cost = ((y - gpr_math.s_normal_logEI(s_t, s_m, s_v)) ** 2).sum()
        da = theano.grad(cost, a)
        foo = theano.function([a, s_t, s_m, s_v], [cost, da])
        res = scipy.optimize.minimize(foo, [0, -1, -1], jac=True,
                                      args=(thresh, mean, var),
                                      method='L-BFGS-B')
        print res.x

    from hyperopt.criteria import logEI_gaussian
    if 0:
        # Disabled: plot both implementations over the thresholds.
        import matplotlib.pyplot as plt
        y = fn(thresh, mean, var)
        z = logEI_gaussian(mean, var, thresh)
        plt.plot(thresh, y)
        plt.plot(thresh, z)
        plt.show()

    # -- the gpr_math logEI uses a quadratic approximation for very
    #    hopeless points, which gives the right derivative, but the
    #    slightly wrong value
    assert np.allclose(logEI_gaussian(mean, var, thresh),
                       fn(thresh, mean, var),
                       atol=1e-3, rtol=1e-4)

    if 0:
        # Disabled: plot the derivative with respect to the threshold.
        d_t = theano.grad(gpr_math.s_normal_logEI(s_t, s_m, s_v).sum(), s_t)
        d_fn = theano.function([s_t, s_m, s_v], d_t)

        import matplotlib.pyplot as plt
        plt.plot(thresh, d_fn(thresh, mean, var))
        plt.show()
def get_gradients(self):
    """Return the hand-derived gradients ``[C, W1, b1, W2, b2]`` of ``self.netS``.

    Only the gradient w.r.t. the outputs and the final gradient w.r.t.
    ``C`` go through ``theano.grad``; the layers in between are
    back-propagated explicitly.
    """
    seg_params = self.seg.params
    hidden = self.layers[-3]
    net_input = self.layers[0]

    # Output layer.
    d_out = theano.grad(self.netS, self.outputs)
    grad_b2 = T.sum(d_out, axis=0)
    grad_W2 = theano.dot(hidden.T, d_out)
    d_hidden = theano.dot(d_out, seg_params["W2"].T)

    # Hidden layer: h - h*h is the derivative factor applied to the
    # back-propagated signal (presumably a sigmoid activation -- confirm).
    d_act = d_hidden * (hidden - hidden * hidden)
    grad_b1 = T.sum(d_act, axis=0)
    grad_W1 = theano.dot(net_input.T, d_act)
    d_input = theano.dot(d_act, seg_params["W1"].T)

    # Push the input-layer signal back to C through the graph.
    grad_C = theano.grad(T.sum(net_input * d_input), seg_params["C"])

    return [grad_C, grad_W1, grad_b1, grad_W2, grad_b2]
def _collins_grad(scores):
    """Return the update pairs for transition and network parameters.

    ``scores`` is iterated as pairs ``(network_score, transition_score)``.
    For each group the gradients of all scores are summed, divided by
    ``self.batchsize`` and passed to the per-parameter ``self.alfa[p]``
    object; its ``getupdate`` result is added to the parameter.  ``self``
    comes from the enclosing scope.
    """
    trans_p = [self.params["A"]]
    net_p = [p for name, p in self.params.items() if name != "A"]

    net_S = [net_score for net_score, _ in scores]
    trans_S = [trans_score for _, trans_score in scores]

    def averaged_updates(param_list, score_list):
        # One gradient list per score, then a batch average per parameter.
        per_score = [theano.grad(score, param_list) for score in score_list]
        pairs = []
        for j, p in enumerate(param_list):
            mean_grad = sum([g[j] for g in per_score]) / self.batchsize
            pairs.append((p, p + self.alfa[p].getupdate(mean_grad)))
        return pairs

    # transition score updates
    trans_upd = averaged_updates(trans_p, trans_S)
    # network parameters update
    net_upd = averaged_updates(net_p, net_S)

    return trans_upd + net_upd
def get_or_compute_grads(loss_or_grads, params, regularizers=None):
    """Helper function returning a list of gradients.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to return the gradients for
    regularizers : dict, optional
        'c' : positive value enables ``clip_norm(g, c, n)`` on every
        gradient, where ``n`` is the global gradient norm.
        'func' : list with one entry per parameter, each either ``None``
        or a callable ``r(g, p)`` (L1 or L2) returning the adjusted
        gradient.  A list shorter than ``params`` is padded with ``None``.

    Returns
    -------
    list of expressions
        If `loss_or_grads` is a list, it is assumed to be a list of
        gradients and returned as is, unless it does not match the length
        of `params`, in which case a `ValueError` is raised.
        Otherwise, `loss_or_grads` is assumed to be a cost expression and
        the function returns `theano.grad(loss_or_grads, params)`, clipped
        and regularized as requested.
    """
    if isinstance(loss_or_grads, list):
        if len(loss_or_grads) != len(params):
            raise ValueError("Got %d gradient expressions for %d parameters" %
                             (len(loss_or_grads), len(params)))
        return loss_or_grads

    if regularizers is None:
        regularizers = {}
    c = regularizers.get('c', 0.0)
    regularizers_funcs = list(regularizers.get('func', []))

    grads = theano.grad(loss_or_grads, params)

    # Max-Norm
    if c > 0:
        norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
        grads = [clip_norm(g, c, norm) for g in grads]

    if not regularizers_funcs:
        # Zipping against an empty list used to discard every gradient
        # when only clipping was requested.
        return grads

    # Pad so that zip cannot silently drop trailing parameters.
    regularizers_funcs += [None] * (len(grads) - len(regularizers_funcs))
    return [g if r is None else r(g, p)
            for p, g, r in zip(params, grads, regularizers_funcs)]
def adam(loss, all_params, learn_rate=0.001, b1=0.9, b2=0.999, e=1e-8,
         gamma=1-1e-8):
    """ADAM update rules

    Returns a list of ``(shared_variable, new_value)`` pairs: for every
    parameter its two moment accumulators and the parameter itself, plus
    one final pair advancing the shared time step.

    Kingma, Diederik, and Jimmy Ba. "Adam: A Method for Stochastic
    Optimization." arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf
    """
    gradients = theano.grad(loss, all_params)
    alpha = learn_rate
    t = theano.shared(np.float32(1.))
    # First-moment coefficient decays with the time step.
    b1_t = b1 * gamma ** (t - 1.)

    def zeros_like_value(shared_var):
        return theano.shared(np.zeros(shared_var.get_value().shape,
                                      dtype=theano.config.floatX))

    updates = []
    for theta_prev, g in zip(all_params, gradients):
        m_prev = zeros_like_value(theta_prev)
        v_prev = zeros_like_value(theta_prev)

        # Biased first and second raw moment estimates.
        m = b1_t * m_prev + (1. - b1_t) * g
        v = b2 * v_prev + (1. - b2) * g ** 2
        # Bias-corrected estimates.
        m_hat = m / (1. - b1 ** t)
        v_hat = v / (1. - b2 ** t)
        theta = theta_prev - (alpha * m_hat) / (T.sqrt(v_hat) + e)

        updates.extend([(m_prev, m), (v_prev, v), (theta_prev, theta)])

    updates.append((t, t + 1.))
    return updates
def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1):
    """Build the scoring function and the ranking-loss gradient function.

    ``exec_fn(word_onehot, b, W, vocab)`` returns the element-wise score
    ``sigmoid(W * (word_onehot . vocab + b))``.
    ``grad_fn(word_onehot, word_onehot_c, b, W, vocab)`` returns the
    gradients of ``max(0, 1 - sum(score) + sum(score_c))`` with respect to
    ``b``, ``W`` and ``vocab``, where ``score_c`` is the score of
    ``word_onehot_c``.

    The constructor arguments are only stored on the instance here.
    """
    self.num_hidden = num_hidden
    self.learning_rate = learning_rate
    self.word_vec_width = word_vec_width
    self.batch_size = batch_size

    self.vocab_mat = T.fmatrix('vocab')
    self.word_onehot = T.fmatrix('word_onehot')
    b = T.fvector('b')
    W = T.fmatrix('W')

    def score(onehot):
        # One shared definition for both inputs.  Previously the second
        # score used ``W * (x . V) + b`` (bias outside the product), so the
        # loss compared two different functions.
        # NOTE(review): the form exposed through exec_fn is kept as
        # canonical; confirm that ``W * (x . V + b)`` is the intended one.
        return 1 / (1 + T.exp(-(W * (onehot.dot(self.vocab_mat) + b))))

    f = score(self.word_onehot)
    s = T.sum(f)

    self.exec_fn = theano.function(
        [self.word_onehot, b, W, self.vocab_mat],
        f,
        allow_input_downcast=True)

    self.word_onehot_c = T.fmatrix('word_onehot_c')
    f_c = score(self.word_onehot_c)
    s_c = T.sum(f_c)

    # Hinge ranking loss between the two summed scores.
    J = T.largest(0, 1 - s + s_c)
    self.grad = theano.grad(J, [b, W, self.vocab_mat])

    self.grad_fn = theano.function(
        [self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
        self.grad,
        allow_input_downcast=True)
def __init__(self, theano_mat, input_vars, cost, learning_rate=1e-2, delta=1e-2):
    """
    Set up the AdaGrad accumulator and the compiled update function for
    one shared variable.

    theano_mat is the variable to optimize; get_value() is called on it,
    so it should be a theano.shared
    cost should be a theano variable that this var should take gradients wrt
    input_vars should be a list of variables for which you'll provide values
        when you call update()
    learning_rate is wrapped in a theano constant and scales every step
    delta is the initial value of every entry of the squared-gradient
        accumulator

    After construction, self.f_update(*input_values) returns the gradient
    and applies both updates.
    """
    print '[AdaGradParam init]', theano_mat.name, 'has learning rate of', learning_rate
    #print '[AdaGradParam init] tvar.type =', theano_mat.type
    self.tvar = theano_mat  # should be a theano.shared
    # Running sum of squared gradients, initialised to delta everywhere.
    self.gg = theano.shared(np.ones_like(self.tvar.get_value(), dtype=theano_mat.dtype) * delta)

    # TODO upgrade to >=0.6rc5 and switch to float32s
    # there is a bug that has been fixed by 0.6rc5 where when you
    # multiply a variable with dtype='float32' with a theano.tensor.constant,
    # you get something back with dtype='float64'
    # if you get errors in this code, that is almost certainly why (unless you're on >=0.6rc5)
    self.lr = T.constant(learning_rate)

    grad = theano.grad(cost=cost, wrt=self.tvar)
    gg_update = self.gg + (grad ** 2)
    # The step divides by the accumulator as it was BEFORE this gradient
    # was added (self.gg, not gg_update).
    tvar_update = self.tvar - self.lr * grad / (self.gg ** 0.5)
    #print '[AdaGradParam] gg_update.type =', gg_update.type
    #print '[AdaGradParam] tvar_update.type =', tvar_update.type
    self.updates = [(self.gg, gg_update), (self.tvar, tvar_update)]
    self.f_update = theano.function(input_vars, grad, updates=self.updates)
def test_grad(self):
    """Compare back-propagated gradients with central finite differences.

    All outputs except the last two are checked, element by element,
    against every element of every input value.
    """
    eps = 1e-7
    f, args, vals = self.get_args()
    output0 = f(*vals)
    n_checked = len(output0) - 2

    # Analytic gradient of every scalar output element w.r.t. all inputs.
    grad0 = []
    for i in range(n_checked):
        per_element = []
        for j in range(output0[i].size):
            ind = np.unravel_index(j, output0[i].shape)
            g = theano.function(
                args, theano.grad(self.op(*args)[i][ind], args))
            per_element.append(g(*vals))
        grad0.append(per_element)

    # Perturb each input element in turn and compare.
    for k, val in enumerate(vals):
        for l in range(val.size):
            inner = np.unravel_index(l, val.shape)

            val[inner] += eps
            plus = f(*vals)
            val[inner] -= 2*eps
            minus = f(*vals)
            # Restore the original value before the next perturbation.
            val[inner] += eps

            for i in range(n_checked):
                for j in range(output0[i].size):
                    ind = np.unravel_index(j, output0[i].shape)
                    delta = 0.5 * (plus[i][ind] - minus[i][ind]) / eps
                    ref = grad0[i][j][k][inner]
                    assert np.abs(delta - ref) < 2*eps, \
                        "{0}".format((k, l, i, j, delta, ref, delta-ref))
def adam_v2(loss, all_params, learning_rate=0.0002, beta1=0.1, beta2=0.001,
            epsilon=1e-8, l_decay=1 - 1e-8):
    """
    Adam update rule by Kingma and Ba, ICLR 2015, version 2 (with momentum decay).

    learning_rate: alpha in the paper, the step size
    beta1: exponential decay rate of the 1st moment estimate
    beta2: exponential decay rate of the 2nd moment estimate
    l_decay: exponential increase rate of beta1

    Returns a list of (shared_variable, new_value) pairs.  Each parameter
    gets its own time step counter and moment accumulators.
    """
    float_type = theano.config.floatX
    all_grads = theano.grad(loss, all_params)

    updates = []
    for param_i, grad_i in zip(all_params, all_grads):
        t = theano.shared(1)  # timestep, for bias correction
        shape = param_i.get_value().shape
        mparam_i = theano.shared(np.zeros(shape, dtype=float_type))  # 1st moment
        vparam_i = theano.shared(np.zeros(
            param_i.get_value().shape, dtype=float_type))  # 2nd moment

        t_float = t.astype(float_type)
        beta1_current = 1 - (1 - beta1) * l_decay ** (t_float - 1)

        # New moment estimates.
        m = beta1_current * grad_i + (1 - beta1_current) * mparam_i
        v = beta2 * T.sqr(grad_i) + (1 - beta2) * vparam_i

        # Bias correction.
        m_unbiased = m / (1 - (1 - beta1) ** t_float)
        v_unbiased = v / (1 - (1 - beta2) ** t_float)

        # New parameter values.
        w = param_i - learning_rate * m_unbiased / (T.sqrt(v_unbiased) + epsilon)

        updates += [(mparam_i, m), (vparam_i, v), (t, t + 1), (param_i, w)]

    return updates
def build_updates_with_micro(loss, all_params, learning_rate, beta1=0.1,
                             beta2=0.001, epsilon=1e-8):
    """
    Adam update rule by Kingma and Ba, ICLR 2015, with gradient accumulation.

    Returns ``(updates, micro_updates)``.  ``micro_updates`` add the current
    gradient to a per-parameter accumulator.  ``updates`` apply an Adam step
    using the accumulated gradient divided by
    ``mini_batch_size // batch_size``, reset the accumulator, decay
    ``learning_rate`` and advance the time step.

    Relies on the module-level names ``mini_batch_size``, ``batch_size`` and
    ``learning_rate_decay``.  ``learning_rate`` is itself the target of an
    update, so it is presumably a shared variable.
    """
    float_type = theano.config.floatX
    all_grads = theano.grad(loss, all_params)
    updates = []
    micro_updates = []
    # all_grads = nn.updates.total_norm_constraint(all_grads, 1)

    t = theano.shared(1)  # timestep, for bias correction
    for param_i, grad_i in zip(all_params, all_grads):
        zeros = np.zeros(param_i.get_value(borrow=True).shape, dtype=float_type)
        mparam_i = theano.shared(zeros)  # 1st moment
        vparam_i = theano.shared(zeros.copy())  # 2nd moment
        sum_grad_i = theano.shared(zeros.copy())  # accumulated gradient

        micro_updates.append((sum_grad_i, sum_grad_i+grad_i))

        # Average over the number of micro batches per mini batch.
        grad = sum_grad_i / np.float32(mini_batch_size//batch_size)

        # New moment estimates.
        m = beta1 * grad + (1 - beta1) * mparam_i
        v = beta2 * T.sqr(grad) + (1 - beta2) * vparam_i

        # Bias correction.
        m_unbiased = m / (1 - (1 - beta1) ** t.astype(float_type))
        v_unbiased = v / (1 - (1 - beta2) ** t.astype(float_type))

        # New parameter values.
        w = param_i - learning_rate * m_unbiased / (T.sqrt(v_unbiased) + epsilon)

        updates.extend([
            (mparam_i, m),
            (vparam_i, v),
            (param_i, w),
            (sum_grad_i, zeros.copy()),
        ])

    updates.append((learning_rate, learning_rate * (1-learning_rate_decay)))
    updates.append((t, t + 1))

    return updates, micro_updates
def get_or_compute_grads(loss_or_grads, params):
    """Return one gradient expression per parameter.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        Either a scalar loss, or gradients that were already computed.
    params : list of shared variables
        The variables the gradients belong to.

    Returns
    -------
    list of expressions
        A list passed as `loss_or_grads` is returned unchanged after
        checking that it has as many entries as `params`.  Anything else is
        treated as a cost and differentiated with
        `theano.grad(loss_or_grads, params)`.

    Raises
    ------
    ValueError
        If a gradient list does not match the length of `params`.
    """
    if not isinstance(loss_or_grads, list):
        return theano.grad(loss_or_grads, params)

    n_grads, n_params = len(loss_or_grads), len(params)
    if n_grads != n_params:
        raise ValueError("Got %d gradient expressions for %d parameters" %
                         (n_grads, n_params))
    return loss_or_grads
def forward_jacobian_log_det(self, x):
    """Return the sum of the logs of the element-wise derivatives at ``x``.

    The derivative of ``self.forward_func`` is taken separately for every
    element of the flattened input via ``scan``.  A non-zero ``self.fudge``
    is added to the derivatives before the log.
    """
    def elementwise_derivative(x_i):
        return th.grad(self.forward_func(x_i), x_i)

    dy_dx, _ = th.scan(elementwise_derivative, sequences=[x.flatten()])

    log_argument = dy_dx + self.fudge if self.fudge != 0. else dy_dx
    return tt.log(log_argument).sum()
def gen_updates_sgd(loss, all_parameters, learning_rate):
    """Return plain SGD updates for ``all_parameters``.

    :param loss: symbolic scalar cost to minimize.
    :param all_parameters: list of shared variables to update.
    :param learning_rate: step size (number or symbolic scalar).
    :return: list of ``(param, param - learning_rate * grad)`` pairs,
        suitable for the ``updates`` argument of ``theano.function``.
    """
    all_grads = [theano.grad(loss, param) for param in all_parameters]
    # Each entry must be a (shared_variable, new_value) pair.  The step is
    # the gradient scaled by the learning rate only, with no extra factor
    # of the parameter itself.
    return [(param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(all_parameters, all_grads)]
def forward_jacobian_log_det(self, x):
    """Return the sum of the logs of d(sum of forward_map(x)) / dx.

    A single gradient of the summed output is taken, which equals the
    per-element derivative only if the map acts element-wise (presumably
    the case here -- confirm).  A non-zero ``self.fudge`` is added before
    the log.
    """
    dy_dx = th.grad(self.forward_map(x).sum(), x)
    log_argument = dy_dx + self.fudge if self.fudge != 0. else dy_dx
    return tt.log(log_argument).sum()
def custom_svrg2(loss, params, m, learning_rate=0.01, objective=None,
                 data=None, target=None, getpred=None):
    """Build SVRG-style updates: ``m`` inner stochastic steps per parameter.

    For every parameter the full gradient divided by the number of samples
    is used as ``mu``.  A ``theano.scan`` then runs ``m`` steps; each step
    draws one random sample and moves the iterate by
    ``learning_rate * (g - g_tilde + mu)``, where ``g`` and ``g_tilde`` are
    the sample gradients at the iterate and at the original parameter.

    :param loss: symbolic scalar loss over the full data.
    :param params: list of shared variables to update.
    :param m: number of inner steps.
    :param learning_rate: step size of the inner steps.
    :param objective: callable ``objective(prediction, target)``.
    :param data: input data, indexable by a symbolic index, with ``.shape``.
    :param target: targets aligned with ``data``.
    :param getpred: callable ``getpred(data_sample, param)``.
    :return: ``OrderedDict`` of updates (scan updates plus one per param).
    :raises ValueError: if any of ``objective``, ``data``, ``target`` or
        ``getpred`` is missing.
    """
    required = (('objective', objective), ('data', data),
                ('target', target), ('getpred', getpred))
    missing = [name for name, value in required if value is None]
    if missing:
        # These default to None only for signature reasons; all are needed.
        raise ValueError("custom_svrg2 requires: %s" % ", ".join(missing))

    grads = theano.grad(loss, params)
    n = data.shape[0]

    updates = OrderedDict()
    rng = T.shared_randomstreams.RandomStreams(seed=149)

    for param, grad in zip(params, grads):
        mu = grad / n

        # The closure reads ``param`` and ``mu`` of the current iteration;
        # that is safe because scan consumes it before the loop advances.
        def oneStep(w):
            t = rng.choice(size=(1,), a=n)

            loss_part_tilde = objective(getpred(data[t], param), target[t])
            loss_part_tilde = loss_part_tilde.mean()
            g_tilde = theano.grad(loss_part_tilde, param)

            loss_part = objective(getpred(data[t], w), target[t])
            loss_part = loss_part.mean()
            g = theano.grad(loss_part, w)

            w = w - learning_rate * (g - g_tilde + mu)
            return w

        w_tilde, scan_updates = theano.scan(fn=oneStep, outputs_info=param,
                                            n_steps=m)

        # Random-stream state updates must be applied along with the step.
        updates.update(scan_updates)
        updates[param] = w_tilde[-1]

    return updates
def adadelta(loss, all_params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """
    Adadelta update rule.

    in the paper, no learning rate is considered (so learning_rate=1.0).
    Probably best to keep it at this value.
    epsilon is important for the very first update (so the numerator does
    not become 0).

    rho = 0.95 and epsilon=1e-6 are suggested in the paper and reported to
    work for multiple datasets (MNIST, speech).

    see "Adadelta: an adaptive learning rate method" by Matthew Zeiler
    for more info.

    Returns a list of (shared_variable, new_value) pairs.
    """
    def zero_state(param):
        return theano.shared(np.zeros(param.get_value().shape,
                                      dtype=theano.config.floatX))

    all_grads = [theano.grad(loss, param) for param in all_params]
    # accumulate gradient magnitudes
    all_accumulators = [zero_state(param) for param in all_params]
    # accumulate update magnitudes (recursive!)
    all_delta_accumulators = [zero_state(param) for param in all_params]

    updates = []
    state = zip(all_params, all_grads, all_accumulators, all_delta_accumulators)
    for param_i, grad_i, acc_i, acc_delta_i in state:
        acc_i_new = rho * acc_i + (1 - rho) * grad_i**2
        # use the 'old' acc_delta here
        update_i = (grad_i * T.sqrt(acc_delta_i + epsilon) /
                    T.sqrt(acc_i_new + epsilon))
        acc_delta_i_new = rho * acc_delta_i + (1 - rho) * update_i**2

        updates.append((acc_i, acc_i_new))
        updates.append((param_i, param_i - learning_rate * update_i))
        updates.append((acc_delta_i, acc_delta_i_new))

    return updates
def get_partial_diff(self, differentiable_var_name):
    """Return ``(self.f, grad_fn)`` for the variable with the given name.

    ``grad_fn`` is a compiled function of ``self.variables`` that evaluates
    the gradient of ``self.output_expression`` with respect to the variable
    looked up in ``self.var_lookup``.  A ``KeyError`` propagates if the name
    is unknown.
    """
    wrt = self.var_lookup[differentiable_var_name]
    derivative = theano.grad(self.output_expression, wrt)
    grad_fn = theano.function(self.variables, derivative,
                              allow_input_downcast=True)
    return self.f, grad_fn
def __init__(self, config, loss, params):
    """Assemble the Adam update list for ``params`` minimizing ``loss``.

    ``config`` supplies ``learning_rate``, ``lr_decay``, ``lr_decay_freq``,
    ``max_grad_norm``, ``adam_beta1``, ``adam_beta2`` and ``adam_eps``.
    The resulting pairs are stored in ``self._updates``; the moment
    accumulators are kept in ``self._all_m_tm1`` / ``self._all_v_tm1``.
    """
    beta1, beta2 = config.adam_beta1, config.adam_beta2

    self._lr = get_shared_floatX(config.learning_rate, 'lr')
    self._t = get_shared_floatX(1, 't')
    self._all_m_tm1 = []
    self._all_v_tm1 = []
    self._updates = [(self._t, self._t + 1)]

    if config.lr_decay:
        # Step-wise decay: the exponent grows every ``lr_decay_freq`` steps.
        lr_coef = tt.pow(config.lr_decay, (self._t - 1) // config.lr_decay_freq)
        self._updates.append((self._lr, lr_coef * config.learning_rate))

    grads = theano.grad(loss, params)
    squared_norms = [tt.sum(g**2.) for g in grads]
    self._global_grad_norm = tt.sqrt(tt.sum(tt.stack(squared_norms)))

    if config.max_grad_norm:
        # Rescale all gradients together when the global norm is too large.
        global_clip_factor = ifelse(
            tt.lt(self._global_grad_norm, config.max_grad_norm),
            cast_floatX_np(1.),
            cast_floatX(config.max_grad_norm/self._global_grad_norm))
        grads = [global_clip_factor * g for g in grads]

    # Learning rate with both bias corrections folded in.
    lr_t = self._lr * \
        clip_sqrt(1 - tt.pow(beta2, self._t)) / (1 - tt.pow(beta1, self._t))

    for p, g in zip(params, grads):
        m_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name)
        v_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name)
        self._all_m_tm1.append(m_tm1)
        self._all_v_tm1.append(v_tm1)

        m_t = beta1 * m_tm1 + (1-beta1) * g
        v_t = beta2 * v_tm1 + (1-beta2) * tt.sqr(g)
        delta_t = -lr_t * m_t / (clip_sqrt(v_t) + config.adam_eps)
        p_t = p + delta_t

        self._updates += [(m_tm1, m_t), (v_tm1, v_t), (p, p_t)]
def gradients_and_updates(self, grad_normalize):
    """Compute gradients (t_gparams) using cost and trainable weights (t_params).

    Sets on ``self``:

    * ``t_gparams``: ``OrderedDict`` mapping ``'g_' + name`` to the gradient
      of ``t_outputs['T_cost']`` w.r.t. that parameter, in ``t_params``
      iteration order.
    * ``out_gnorm``: stacked L2 norms of the unclipped gradients.
    * ``g_norm``: per-gradient norms, filled only when clipping is on.
    * ``update_params``: ``[param, new_value]`` SGD updates using
      ``t_inputs['T_lr']`` as learning rate.

    :param grad_normalize: dict; if it has a ``'max_norm'`` key, each
        gradient whose norm exceeds that value is rescaled to it.
    """
    cost = self.t_outputs['T_cost']

    # ------ Compute gradient parameters
    # Built from pairs (not from an intermediate dict) so the order is the
    # iteration order of t_params on every interpreter.  ``items()`` and
    # ``in`` replace the Python 2 only ``iteritems()`` / ``has_key()``.
    self.t_gparams = OrderedDict(
        ('g_' + k, theano.grad(cost=cost, wrt=p))
        for k, p in self.t_params.items())

    # ------ Compute norm and stack it like a vector (to analyze outside)
    self.out_gnorm = T.stack([T.sqrt(T.sum(gp ** 2))
                              for gp in self.t_gparams.values()])

    # ------ Normalize gradients
    self.g_norm = {}
    if 'max_norm' in grad_normalize:  # maximum gradient norm limited
        mn = grad_normalize['max_norm']
        # Only values are replaced, so iterating a key snapshot is safe.
        for k in list(self.t_gparams.keys()):
            self.g_norm[k] = T.sqrt(T.sum(self.t_gparams[k] ** 2))
            # 1e-6 guards the division when the norm is zero.
            self.t_gparams[k] = ifel(T.gt(self.g_norm[k], mn),
                                     mn * self.t_gparams[k] / (self.g_norm[k] + 1e-6),
                                     self.t_gparams[k])

    # ------ Update parameters (SGD!)
    self.update_params = []
    for k in self.t_params.keys():
        self.update_params.append(
            [self.t_params[k],
             self.t_params[k] - self.t_inputs['T_lr'] * self.t_gparams['g_' + k]])
def _training_updates(self, **kwargs):
    r"""Returns the update expression for updating the model parameters
    during training. The formula for updating an argument is

    .. math::

       \theta^{(k+1)} = \theta^{(k)} - learning\_rate * \frac{\partial cost}{\partial \theta}

    Expects a 'learning_rate' and 'cost' kwarg.

    :type learning_rate: theano.config.floatX
    :param learning_rate: The learning rate for parameter updates.

    :type cost: theano.tensor.TensorType
    :param cost: The cost function of which we are computing
                 the gradient.

    :returns: A list of pairs (parameter, update_expression), to be
              passed directly to ``theano.function`` as the ``updates``
              parameter.
    """
    # The docstring is a raw string: it contains LaTeX backslashes that
    # would otherwise be read as tab / form-feed / invalid escapes.
    utils.check_kwargs(kwargs, ['learning_rate', 'cost'])

    learning_rate = kwargs['learning_rate']
    bound_cost = kwargs['cost']

    updates = []
    for param in self.params:
        gradient = theano.grad(cost=bound_cost, wrt=param)
        updates.append((param, param - learning_rate * gradient))

    return updates
def _get_rmsprop_updates(self, loss, params, lr, grad_momentum,
                         sqr_momentum, min_grad):
    """Return an ``OrderedDict`` of RMSProp updates with mean subtraction.

    Running averages of the gradient and of the squared gradient are kept
    per parameter.  The step is the gradient divided by
    ``sqrt(avg_sqr - avg ** 2 + min_grad)``, scaled by ``lr`` and, when
    ``self.max_norm > 0``, by the factor from ``self._clip_gradient_norm``.

    Modified from the Lasagne package:
    https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
    """
    grads = theano.grad(loss, params)

    if self.max_norm > 0:
        scale_factor = self._clip_gradient_norm(grads, self.max_norm)
    else:
        scale_factor = 1.0

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        def new_accumulator():
            # Zero state with the parameter's shape, dtype and broadcast
            # pattern.
            return theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)

        accu_sqr = new_accumulator()
        accu_sqr_new = sqr_momentum * accu_sqr + \
            (one - sqr_momentum) * grad ** 2

        accu = new_accumulator()
        accu_new = grad_momentum * accu + (one - grad_momentum) * grad

        denominator = T.sqrt(accu_sqr_new - accu_new ** 2 + min_grad)

        updates[accu] = accu_new
        updates[accu_sqr] = accu_sqr_new
        updates[param] = param - (lr * grad * scale_factor / denominator)

    return updates
def prior_dlogp(vars, model, flat_view):
    """Returns the gradient of the prior on the parameters as a vector of size D x 1

    Each variable's ``logpt`` is differentiated with respect to that
    variable and flattened; the pieces are concatenated and the result is
    cloned with ``flat_view.replacements`` substituted in.  ``model`` is
    accepted but not used.
    """
    pieces = []
    for var in vars:
        pieces.append(theano.grad(var.logpt, var).flatten())
    terms = tt.concatenate(pieces, axis=0)
    return theano.clone(terms, flat_view.replacements, strict=False)
_theano_rng = RandomStreams(config.seed // 2 + 321) # generates random numbers directly on GPU flat_probs, params, rhn_updates, hidden_states = stacked.model( _input_data, _noise_x, _lr, _is_training, config, _theano_rng) # loss _targets = T.imatrix('targets') # (batch_size, num_steps) flat_targets = _targets.T.flatten() xentropies = T.nnet.categorical_crossentropy( flat_probs, flat_targets) # (batch_size * num_steps,) pred_loss = xentropies.sum() / config.batch_size l2_loss = 0.5 * T.sum(T.stack([T.sum(p**2) for p in params])) # regularization loss = pred_loss + config.weight_decay * l2_loss # compute gradients grads = theano.grad(loss, params) global_grad_norm = T.sqrt(T.sum(T.stack([T.sum(g**2) for g in grads ]))) # gradient clipping clip_factor = theano.ifelse.ifelse( global_grad_norm < config.max_grad_norm, cast_floatX(1), T.cast(config.max_grad_norm / global_grad_norm, theano.config.floatX)) param_updates = [(p, p - _lr * clip_factor * g) for p, g in zip(params, grads)] num_params = np.sum([param.get_value().size for param in params]) train = theano.function([_input_data, _targets, _noise_x], loss, givens={_is_training: np.int32(1)}, updates=rhn_updates + param_updates) evaluate = theano.function(
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None,
               constraint_name="constraint", *args, **kwargs):
    """
    Prepare the lazily compiled loss, gradient and constraint functions.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
    :class:`rllab.core.paramerized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
    that the first dimension of these inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
    :return: No return value.
    """
    inputs = tuple(inputs)
    extra_inputs = tuple() if extra_inputs is None else tuple(extra_inputs)
    all_inputs = inputs + extra_inputs

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    # Parameters not reached by the loss only trigger a warning.
    grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(grads)

    # The Hessian-vector product of the constraint is delegated.
    self._hvp_approach.update_opt(f=constraint_term, target=target,
                                  inputs=all_inputs,
                                  reg_coeff=self._reg_coeff)

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    def deferred(outputs, log_name):
        # Thunk that compiles on demand.
        return lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=outputs,
            log_name=log_name,
        )

    self._opt_fun = ext.lazydict(
        f_loss=deferred(loss, "f_loss"),
        f_grad=deferred(flat_grad, "f_grad"),
        f_constraint=deferred(constraint_term, "constraint"),
        f_loss_constraint=deferred([loss, constraint_term], "f_loss_constraint"),
    )
def test_pooling3d():
    """Check ``cuda.dnn.dnn_pool`` on 5d inputs against ``pool3d2d``.

    For every pooling mode / padding / window / stride combination the
    forward output of the function compiled with ``mode_with_gpu`` is
    compared with the ``pool3d2d`` graph compiled without the GPU; the
    gradients are then verified and compared the same way.
    """
    # CuDNN 3d pooling requires CuDNN v3. Don't test if the CuDNN version is
    # too old.
    if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
        raise SkipTest(cuda.dnn.dnn_available.msg)

    x = T.TensorType(broadcastable=(False,) * 5, dtype='float32')()

    pooling_modes = ('max', 'average_inc_pad', 'average_exc_pad')
    paddings = ((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
                (2, 3, 2), (3, 2, 2), (2, 2, 3))
    forward_shapes = [(1, 10, 100, 100, 100),
                      (1, 3, 99, 99, 99),
                      (32, 1, 147, 197, 37)]
    grad_shapes = [(1, 1, 2, 2, 2), (1, 1, 3, 3, 3), (1, 1, 3, 3, 4),
                   (1, 1, 3, 4, 3), (1, 1, 4, 3, 3), (1, 1, 4, 4, 4),
                   (1, 1, 5, 5, 5)]

    for mode, pad in product(pooling_modes, paddings):
        func = T.max if mode == 'max' else T.mean

        # Padding is skipped for version -1 and for the averaging modes.
        if pad != (0, 0, 0) and (cuda.dnn.version() == -1 or func is T.mean):
            continue

        for ws in (4, 2, 5):
            for stride in (2, 3):
                # A pad larger than the stride is not implemented.
                if stride > ws or max(pad) > stride:
                    continue

                out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
                                         stride=(stride, stride, stride),
                                         pad=pad, mode=mode)
                out2 = pool3d2d(x, ds=(ws, ws, ws),
                                strides=(stride, stride, stride),
                                pad=pad, pool_func=func)

                # For max pooling pool3d2d explicitly pads the input with
                # -inf. Because of this, the compilation mode for the
                # function that uses pool3d2d should not check for infinite
                # values or it will falsely believe there is an error in
                # the graph.
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False

                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any(isinstance(node.op, cuda.dnn.GpuDnnPool)
                           for node in f1.maker.fgraph.apply_nodes)

                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any(isinstance(node.op, cuda.dnn.GpuDnnPool)
                               for node in f2.maker.fgraph.apply_nodes)

                for shp in forward_shapes:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    gpu_result = f1(data).__array__()
                    cpu_result = f2(data).__array__()
                    utt.assert_allclose(gpu_result, cpu_result,
                                        atol=numpy.finfo(numpy.float32).eps)

        # Test the grad
        for shp in grad_shapes:
            # The sample is drawn before the skip check so the sequence of
            # random draws is unchanged.
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            ws = 2
            stride = 2
            if max(pad) > stride:
                # Not implemented
                continue

            # Test the GPU grad + GPU implementation
            def fn(x):
                return cuda.dnn.dnn_pool(
                    x, ws=(ws, ws, ws),
                    stride=(stride, stride, stride),
                    pad=pad,
                    mode=mode)

            theano.tests.unittest_tools.verify_grad(
                fn, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)

            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any(isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                       for node in fg.maker.fgraph.toposort())
            g_out = fg(data)

            # Compare against the CPU result
            out = pool3d2d(x, (ws, ws, ws),
                           strides=(stride, stride, stride),
                           pad=pad, pool_func=func)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            c_out = fc(data)
            assert numpy.allclose(c_out, g_out)
def get_opt_output():
    """Return ``[penalized_loss, flat_grad]``, each cast to float64.

    ``flat_grad`` is the flattened gradient of ``penalized_loss`` with
    respect to the trainable parameters of ``target``; parameters that are
    disconnected from the loss are ignored rather than raising.
    """
    trainable = target.get_params(trainable=True)
    grads = theano.grad(penalized_loss, trainable,
                        disconnected_inputs='ignore')
    flat_grad = flatten_tensor_variables(grads)
    return [penalized_loss.astype('float64'), flat_grad.astype('float64')]
from theano import tensor as T
import theano
import theano.printing

# Symbolic scalar input and the expression a**2 to differentiate.
# (Named `squared` rather than `pow` so the builtin is not shadowed.)
a = T.scalar()
squared = a ** 2
g = theano.grad(squared, a)

# debugprint writes the graph to stdout itself and returns None when no
# `file` argument is given, so wrapping it in print() only appended a stray
# "None" line after each graph.
theano.printing.debugprint(g)  # graph as built by theano.grad
theano.printing.debugprint(theano.function([a], g))  # graph of the compiled function
def flatten_hessian(cost, wrt, consider_constant=None,
                    disconnected_inputs='raise', block_diagonal=True):
    """
    Build Hessian expressions of a scalar `cost` from flattened gradients.

    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Variable, or list/tuple of Variables. Gradients are
        flattened, so the variables are not required to be vectors.
    :param consider_constant: a list of expressions not to backpropagate
        through
    :type disconnected_inputs: string
    :param disconnected_inputs: forwarded to the first-order ``grad`` call
        ('ignore', 'warn' or 'raise'). The second-order ``grad`` calls
        always use 'ignore', because an input may be disconnected from the
        gradient expression even when it is connected to `cost`.
    :param block_diagonal: when true, one Hessian block is built per
        element of `wrt` from that element's own gradient. When false, the
        gradients of all elements are concatenated first and each element
        contributes the derivative of that full gradient.

    :return: with ``block_diagonal`` true, the per-variable blocks packed
        by ``theano.gradient.format_as`` according to whether `wrt` was a
        list, a tuple or a single Variable; otherwise the blocks
        concatenated along axis 1.
    """
    import theano
    from theano.tensor import arange
    import theano.tensor as TT
    from theano import Variable
    from theano import grad

    # Check inputs have the right format
    assert isinstance(cost, Variable), \
        "tensor.hessian expects a Variable as `cost`"
    assert cost.ndim == 0, \
        "tensor.hessian expects a 0 dimensional variable as `cost`"

    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)
    wrt_vars = list(wrt) if (using_list or using_tuple) else [wrt]

    def flat_gradient(var):
        # Flattened first-order gradient of the cost w.r.t. one variable.
        return grad(cost, var,
                    consider_constant=consider_constant,
                    disconnected_inputs=disconnected_inputs).flatten()

    def hessian_row(i, y, x):
        # Flattened derivative of the i-th gradient entry w.r.t. x.
        return grad(y[i], x,
                    consider_constant=consider_constant,
                    disconnected_inputs='ignore').flatten()

    if not block_diagonal:
        full_gradient = TT.concatenate(
            [flat_gradient(var) for var in wrt_vars])

    hessians = []
    for var in wrt_vars:
        assert isinstance(var, Variable), \
            "tensor.hessian expects a (list of) Variable as `wrt`"

        expr = flat_gradient(var) if block_diagonal else full_gradient

        # One scan step per entry of the gradient vector.
        hess, updates = theano.scan(
            hessian_row,
            sequences=arange(expr.shape[0]),
            non_sequences=[expr, var])
        assert not updates, \
            ("Scan has returned a list of updates. This should not "
             "happen! Report this to theano-users (also include the "
             "script that generated the error)")
        hessians.append(hess)

    if not block_diagonal:
        return TT.concatenate(hessians, axis=1)

    from theano.gradient import format_as
    return format_as(using_list, using_tuple, hessians)
dtype=theano.config.floatX) sh_s = theano.shared( np.zeros((params['max_sen_length'], params['batch_size']), dtype=np.int32)) sh_mask = theano.shared( np.zeros((params['max_sen_length'], params['batch_size']), dtype=theano.config.floatX)) sh_w = theano.shared(np.float32(0.0)) print "================= Compiling Theano.functions ====================== " givens = [(sym_s, sh_s), (sym_mask, sh_mask), (sym_w, sh_w)] import theano.gradient s_clip = theano.gradient.grad_clip(sym_s, -10.0, 10.0) # see graves generating sequences cost = l_vae.get_cost(params['keep_rate'], params['drop_out'], s_clip, sym_mask, sym_w) all_grads = theano.grad(cost, all_params) all_grads, step_norm, multiplier = step_clipping(all_grads, threshold=10.0, to_zero=False) if params['opt_function'] == 'adam': updates, steps = adam(all_grads, all_params, learning_rate=params['lr'], beta1=params['beta1'],\ beta2=params['beta2'],decay_factor=1.0-params['decay_rate'] ) elif params['opt_function'] == 'adagrad': updates, steps = adagrad(all_grads, all_params, learning_rate=params['lr']) else: updates, steps = adadelta(all_grads, all_params) outputs = [cost, step_norm] + l_vae.get_train_results() train = theano.function([], outputs + [multiplier], givens=givens,
def grad_ii(i):
    """Return entry ``i`` of the gradient of ``f[i]`` with respect to ``x``."""
    gradient_of_fi = theano.grad(f[i], x)
    return gradient_of_fi[i]
def __init__(self, inputs, outputs, cost, scopes, **option):
    """
    Compile the two Theano functions that make up the optimizer.

    ``self.optimize`` evaluates ``outputs`` and stores the (optionally
    clipped / NaN-guarded) gradients of ``cost`` in shared buffers.
    ``self.update`` takes the hyper-parameters of the selected algorithm
    as keyword arguments (missing ones fall back to defaults) and applies
    the parameter updates computed from those buffers.

    :param inputs: inputs of the compiled ``optimize`` function
    :param outputs: list of outputs of ``optimize``; it is copied before
        the gradient norm is appended, so the caller's list is untouched
    :param cost: expression differentiated with respect to the parameters
    :param scopes: scopes whose trainable variables are optimized; unused
        when ``option["variables"]`` is given
    :param option: keyword options, filled in with defaults when absent:

        - ``variables``: explicit parameter list (fine-tuning)
        - ``algorithm``: "sgd" (default), "adagrad", "rmsprop",
          "rmsprop_momentum", "adadelta" or "adam"
        - ``variant``: extra argument passed to the rmsprop rule only
        - ``constraint``: ``("value", (low, high))`` or ``("norm", value)``
        - ``norm``: append the global gradient norm to the outputs
          (default True)
        - ``nanguard``: replace the gradients by ``0.1 * param`` when the
          norm of the unclipped gradients is NaN or infinite
        - ``momentum`` / ``nesterov``: wrap the updates with momentum
        - ``initialize``: values restored into the optimizer state
    :raises ValueError: if ``algorithm`` is not one of the supported names
    """
    if option.get("variables"):
        # Fine-tuning: optimize only the explicitly supplied variables.
        # Previously this branch left `params` unbound and the gradient
        # call below failed with a NameError.
        params = option["variables"]
    else:
        params = [param for scope in scopes
                  for param in ops.trainable_variables(scope)]

    grads = theano.grad(cost, params)
    gradsref = grads  # unclipped gradients, used by the NaN guard

    # One shared buffer per parameter to carry gradients from `optimize`
    # to `update`.
    vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

    option.setdefault("algorithm", "sgd")
    option.setdefault("variant", None)
    option.setdefault("constraint", None)
    option.setdefault("momentum", False)
    option.setdefault("norm", True)
    option.setdefault("nesterov", False)
    option.setdefault("initialize", False)
    option.setdefault("nanguard", False)

    algorithm = option["algorithm"]
    variant = option["variant"]
    variant = [variant] if variant is not None else []

    if option["norm"]:
        normval = constraint.global_norm(grads)
        outputs = outputs[:]
        outputs.append(normval)

    if option["constraint"]:
        method, value = option["constraint"]
        if method == "value":
            grads = constraint.clip_by_value(grads, value[0], value[1])
        if method == "norm":
            grads = constraint.clip_by_global_norm(grads, value)

    if option["nanguard"]:
        gnorm = constraint.global_norm(gradsref)
        isnan = theano.tensor.isnan(gnorm)
        isinf = theano.tensor.isinf(gnorm)
        notfinite = theano.tensor.or_(isnan, isinf)
        grads = [theano.tensor.switch(notfinite, 0.1 * p, g)
                 for p, g in zip(params, grads)]

    if option["nesterov"]:
        option["momentum"] = False

    # append update rules
    gup = []
    scan_updates = ops.get_updates()
    if isinstance(scan_updates, OrderedDict):
        gup.extend(scan_updates.items())
    else:
        gup.extend(scan_updates)
    gup.extend(zip(vec, grads))

    # Hyper-parameter names and default values per algorithm, in the order
    # the corresponding `updates.<algorithm>_updates` rule receives them.
    default_table = {
        "sgd": [("alpha", 1.0)],
        "adagrad": [("alpha", 1.0), ("epsilon", 1e-6)],
        "rmsprop": [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)],
        "rmsprop_momentum": [("alpha", 1e-4), ("rho", 0.95),
                             ("epsilon", 1e-4), ("moment", 0.9)],
        "adadelta": [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)],
        "adam": [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999),
                 ("epsilon", 1e-8)],
    }
    if algorithm not in default_table:
        # Raising a plain string is itself a TypeError; raise a real
        # exception carrying the same message.
        raise ValueError("Error: %s is not supported" % (algorithm,))

    defaults = list(default_table[algorithm])
    hparams = [theano.tensor.scalar() for _ in defaults]
    update_rule = getattr(updates, algorithm + "_updates")
    # Only rmsprop takes the optional `variant` argument.
    rule_args = hparams + variant if algorithm == "rmsprop" else hparams
    svar, pup = update_rule(params, vec, *rule_args)

    # restore variables used by optimizer
    if option["initialize"]:
        for shared_var, saved_value in zip(svar, option["initialize"]):
            shared_var.set_value(saved_value)

    # "nesterov" switches "momentum" off above, so at most one is set.
    # NOTE(review): both flags go through `updates.apply_momentum`, exactly
    # as before; confirm whether a Nesterov-specific helper was intended.
    if option["momentum"] or option["nesterov"]:
        momentum = theano.tensor.scalar()
        hparams.append(momentum)
        defaults.append(("momentum", 0.9))
        pup = updates.apply_momentum(pup, params, momentum)

    optimize = theano.function(inputs, outputs, updates=gup,
                               on_unused_input='warn')
    update = theano.function(hparams, [], updates=pup,
                             on_unused_input='warn')

    def wrapper(**overrides):
        # Fill unspecified hyper-parameters with their defaults, in the
        # positional order expected by the compiled `update` function.
        values = [overrides.get(name, default) for name, default in defaults]
        return update(*values)

    self.optimize = optimize
    self.update = wrapper
    self.option = option
    self.algorithm = algorithm
    self.parameter = svar
def test_dnn_conv_alpha_output_merge():
    """Compare graphs compiled with and without the dnn merge optimizations.

    With ``mode_with_gpu`` the three outputs must come straight out of the
    cuDNN conv / gradW / gradI ops; with the six ``local_dnn_conv*_merge``
    optimizations excluded they must not. Both functions have to produce
    the same values.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    # batch size, channels, filters, image height/width, kernel height/width
    b, c, f = 1, 4, 3
    ih, iw = 5, 8
    kh, kw = 2, 6

    img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
    kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
    out_val = numpy.random.random(
        (b, f, ih - kh + 1, iw - kw + 1)).astype('float32')

    conv = dnn.dnn_conv(img, kern)
    conv_sum = conv.sum
    gw = theano.grad(conv_sum(), kern)
    gi = theano.grad(conv_sum(), img)

    lr = numpy.asarray(0.05, dtype='float32')

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr = conv + out
        wr = kern + gw
        ir = img + gi
    else:
        fr = lr * (conv + out)
        wr = kern + lr * gw
        ir = img + lr * gi

    symbolic_inputs = [img, kern, out]
    symbolic_outputs = [fr, wr, ir]
    dnn_ops = (dnn.GpuDnnConv, dnn.GpuDnnConvGradW, dnn.GpuDnnConvGradI)

    f1 = theano.function(symbolic_inputs, symbolic_outputs,
                         mode=mode_with_gpu)
    for position, op_class in enumerate(dnn_ops):
        producer = f1.maker.fgraph.outputs[position].owner.inputs[0].owner
        assert isinstance(producer.op, op_class)

    mode = mode_with_gpu
    for optimization in ('local_dnn_conv_alpha_merge',
                         'local_dnn_convw_alpha_merge',
                         'local_dnn_convi_alpha_merge',
                         'local_dnn_conv_output_merge',
                         'local_dnn_convw_output_merge',
                         'local_dnn_convi_output_merge'):
        mode = mode.excluding(optimization)

    f2 = theano.function(symbolic_inputs, symbolic_outputs, mode=mode)
    for position, op_class in enumerate(dnn_ops):
        producer = f2.maker.fgraph.outputs[position].owner.inputs[0].owner
        assert not isinstance(producer.op, op_class)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
def setUp(self):
    """Build a small conv / pooling / dense Keras model plus helper functions.

    Sets ``self.keras_version``, ``self.inp`` (random input of shape
    (10, 10, 51, 51)), ``self.keras_model``, ``self.keras_output_fprop_func``
    and ``self.grad_func``. The gradient is taken of the summed first
    column of the second-to-last layer's output with respect to the model
    input. The layer and backend APIs used depend on the Keras version.
    """
    if not hasattr(keras, '__version__'):
        # didn't have the __version__ tag
        self.keras_version = 0.2
    else:
        self.keras_version = float(keras.__version__[0:3])

    self.inp = (np.random.randn(10 * 10 * 51 * 51)
                .reshape(10, 10, 51, 51))

    self.keras_model = keras.models.Sequential()
    conv_layer = keras.layers.convolutional.Convolution2D(
        nb_filter=2, nb_row=4, nb_col=4, subsample=(2, 2),
        activation="relu", input_shape=(10, 51, 51))
    self.keras_model.add(conv_layer)

    if self.keras_version > 0.2:
        self.keras_model.add(
            keras.layers.convolutional.MaxPooling2D(
                pool_size=(4, 4), strides=(2, 2)))
        self.keras_model.add(
            keras.layers.convolutional.AveragePooling2D(
                pool_size=(4, 4), strides=(2, 2)))
    else:
        print(self.keras_version)
        # Version 0.2 spells the keyword `stride`, and there is no
        # average pooling in version 0.2.0.
        self.keras_model.add(
            keras.layers.convolutional.MaxPooling2D(
                pool_size=(4, 4), stride=(2, 2)))

    self.keras_model.add(keras.layers.core.Flatten())
    self.keras_model.add(keras.layers.core.Dense(output_dim=1))
    self.keras_model.add(keras.layers.core.Activation("sigmoid"))
    self.keras_model.compile(loss="mse", optimizer="sgd")

    if self.keras_version <= 0.3:
        # Older API: outputs are read with get_output(train=False).
        self.keras_output_fprop_func = compile_func(
            [self.keras_model.layers[0].input],
            self.keras_model.layers[-1].get_output(False))

        pre_activation = self.keras_model.layers[-2].get_output(False)
        grad = theano.grad(
            theano.tensor.sum(pre_activation[:, 0]),
            self.keras_model.layers[0].input)
        self.grad_func = theano.function(
            [self.keras_model.layers[0].input],
            grad,
            allow_input_downcast=True,
            on_unused_input='ignore')
    else:
        # Newer API: the learning phase is an extra input, fixed to False
        # by the wrapping lambdas.
        keras_output_fprop_func = compile_func(
            [self.keras_model.layers[0].input,
             keras.backend.learning_phase()],
            self.keras_model.layers[-1].output)
        self.keras_output_fprop_func = \
            lambda x: keras_output_fprop_func(x, False)

        pre_activation = self.keras_model.layers[-2].output
        grad = theano.grad(
            theano.tensor.sum(pre_activation[:, 0]),
            self.keras_model.layers[0].input)
        grad_func = theano.function(
            [self.keras_model.layers[0].input,
             keras.backend.learning_phase()],
            grad,
            allow_input_downcast=True,
            on_unused_input='ignore')
        self.grad_func = lambda x: grad_func(x, False)
def get_output_for(self, inputs, **kwargs):
    """Backpropagate a given gradient from ``layer_out`` to ``layer_in``.

    ``inputs`` unpacks into three expressions: the gradient to propagate,
    the output expression it belongs to, and the expression to
    differentiate with respect to. No cost is passed to ``theano.grad``;
    the first expression is supplied as the known gradient of
    ``layer_out``.
    """
    incoming_grad, layer_out, layer_in = inputs
    return theano.grad(None, wrt=layer_in,
                       known_grads={layer_out: incoming_grad})
gen_loss = lasagne.objectives.squared_error(fake_out, c).mean() + loss_gen_fm #igen_loss = lasagne.objectives.binary_crossentropy(fake_out, c).mean() + 5 * loss_gen_fm ''' # adding extra penalty for MSE contour pred_contour = extract_contour_tensor(gen_output) real_contour = extract_contour_tensor(GAN.input_c) loss_gen_contour = lasagne.objectives.squared_error(pred_contour, real_contour).mean() gen_loss += 20 * loss_gen_contour ''' print 'loss and function setup' #% gen_grads = theano.grad(gen_loss, wrt=gen_params) critic_grads = theano.grad(critic_loss, wrt=critic_params) gen_grads_norm = sum(T.sum(T.square(grad)) for grad in gen_grads) / len(gen_grads) critic_grads_norm = sum(T.sum(T.square(grad)) for grad in critic_grads) / len(critic_grads) gen_updates = lasagne.updates.rmsprop(gen_grads, gen_params, learning_rate=initial_eta) #gen_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in gen_params] #gen_avg_updates = [(a,a + 0.0001*(p-a)) for p,a in zip(gen_params,gen_param_avg)] #gen_updates = gen_avg_updates critic_updates = lasagne.updates.rmsprop(critic_grads,
def test_pooling():
    """Check cuDNN 2d pooling (forward and gradient) against the CPU graphs.

    For every pooling mode / padding / window / stride combination the
    output of ``max_pool_2d`` compiled with ``mode_with_gpu`` (which must
    contain a ``GpuDnnPool`` node) is compared with ``pool_2d_i2n``
    compiled without the GPU. The gradients are then verified for both the
    optimized CPU graph and the explicit ``dnn_pool`` op, and compared
    with the CPU gradient.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    x = T.ftensor4()

    pooling_modes = ('max', 'average_inc_pad', 'average_exc_pad')
    # The list used to contain (1, 0) twice, so padding on the second axis
    # alone was never exercised; the duplicate is now (0, 1).
    paddings = ((0, 0), (1, 0), (0, 1), (2, 3), (3, 2))

    for mode, pad in product(pooling_modes, paddings):
        if mode == 'max':
            func = T.max
        else:
            func = T.mean

        # Padding is skipped for version -1 and for the averaging modes.
        if pad != (0, 0) and cuda.dnn.version() == -1:
            continue

        if pad != (0, 0) and func is T.mean:
            continue

        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
                if pad[0] > stride or pad[1] > stride:
                    # Not implemented
                    continue

                # We will check that the opt introduced it.
                out1 = max_pool_2d(x, (ws, ws),
                                   st=(stride, stride),
                                   ignore_border=True,
                                   padding=pad, mode=mode)
                out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
                                   pad=pad,
                                   pool_function=func)

                # The CPU function is compiled without the finiteness check.
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False

                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any(isinstance(node.op, cuda.dnn.GpuDnnPool)
                           for node in f1.maker.fgraph.apply_nodes)

                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any(isinstance(node.op, cuda.dnn.GpuDnnPool)
                               for node in f2.maker.fgraph.apply_nodes)

                for shp in [(1, 10, 100, 100),
                            (1, 3, 99, 99),
                            (32, 1, 147, 197),
                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    a = f1(data).__array__()
                    b = f2(data).__array__()
                    assert numpy.allclose(
                        a, b, atol=numpy.finfo(numpy.float32).eps)

        # Test the grad
        for shp in [(1, 1, 2, 2),
                    (1, 1, 3, 3)]:
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            ws = 2
            stride = 2
            if pad[0] > stride or pad[1] > stride:
                # Not implemented
                continue

            # This tests the CPU grad + opt + GPU implementation
            def fn(x):
                return max_pool_2d(x, (ws, ws), ignore_border=True,
                                   padding=pad, mode=mode)

            theano.tests.unittest_tools.verify_grad(
                fn, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)

            # Confirm that the opt would have inserted it.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any(isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                       for node in fg.maker.fgraph.toposort())

            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = cuda.dnn.dnn_pool(
                    x, ws=(ws, ws),
                    stride=(stride, stride),
                    pad=pad,
                    mode=mode)
                return dnn_op

            theano.tests.unittest_tools.verify_grad(
                fn, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)

            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any(isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                       for node in fg.maker.fgraph.toposort())
            g_out = fg(data)

            # Compare against the CPU result
            out = max_pool_2d(x, (ws, ws),
                              padding=pad,
                              ignore_border=True, mode=mode)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            if mode == 'max':
                assert any(isinstance(node.op, MaxPoolGrad)
                           for node in fc.maker.fgraph.toposort())
            else:
                assert any(isinstance(node.op, AveragePoolGrad)
                           for node in fc.maker.fgraph.toposort())
            c_out = fc(data)
            assert numpy.allclose(c_out, g_out)
def computeLosses(self, y, std, regMultiplier, deterministic):
    """Build the classification loss and all sensitivity / weight penalties.

    Regularization weights are read from ``self.hyperp`` (missing entries
    are created with value 0.). Every penalty expression is built and
    returned regardless of its weight; a penalty only enters ``totalLoss``
    when its weight is truthy, scaled by ``regMultiplier * weight``.

    :param y: integer class targets (one-hot encoded internally)
    :param std: optional factor multiplied onto every input gradient;
        ``None`` disables the scaling
    :param regMultiplier: common factor applied to all penalties
    :param deterministic: forwarded to ``get_output``
    :return: tuple ``(classificationLoss, totalLoss, l1Loss, l2Loss,
        logitSensLoss, logitDiffSensLoss, logitSqSensLoss, probSensLoss,
        lossSensLoss)``
    """
    hyperp = self.hyperp
    logitSens = hyperp.setdefault('logitSens', 0.)
    logitDiffSens = hyperp.setdefault('logitDiffSens', 0.)
    logitSqSens = hyperp.setdefault('logitSqSens', 0.)
    probSens = hyperp.setdefault('probSens', 0.)
    lossSens = hyperp.setdefault('lossSens', 0.)
    l1 = hyperp.setdefault('l1', 0.)
    l2 = hyperp.setdefault('l2', 0.)

    layers = self.classifier.layers_
    lossFunction = lasagne.objectives.categorical_crossentropy
    aggregate = T.mean  # otherwise lasagne.objectives.aggregate

    outputLayer = layers[-1]
    logitLayer = layers[-2]
    inputLayer = layers[0]
    networkInput = inputLayer.input_var
    networkOutput = get_output(outputLayer, deterministic=deterministic)
    logitOutput = get_output(logitLayer, deterministic=deterministic)

    def scaled(gradient):
        # Multiply an input gradient by `std` when one was supplied.
        if std is None:
            return gradient
        return std * gradient

    def penalty(values):
        # Sum over axes 1-3 of the input-shaped tensor, then aggregate.
        return aggregate(T.sum(values, axis=(1, 2, 3)))

    ######################################################################
    # Very weird thing:
    # lossSensitivity gradients can only be computed if the one-hot encoded
    # version of the loss function is used. BUT that version lacks a
    # stability optimization in Theano that leads to NaNs during training.
    # This is why both versions need to be employed here.
    L = lossFunction(networkOutput, y)
    y_oneHot = lasagne.utils.one_hot(y, outputLayer.output_shape[1])
    L_oneHot = lossFunction(networkOutput, y_oneHot)
    ######################################################################

    classificationLoss = aggregate(L)
    l1Loss = regularization.regularize_layer_params(
        layers.values(), regularization.l1)
    l2Loss = regularization.regularize_layer_params(
        layers.values(), regularization.l2)

    # Logit sensitivity: gradient of the target-class logit w.r.t. the
    # input, penalized both sparsely (abs) and squared.
    logit = T.sum(logitOutput * y_oneHot, axis=1)
    G_logit = scaled(T.grad(T.sum(logit), networkInput))
    logitSensLoss = penalty(T.abs_(G_logit))
    logitSqSensLoss = penalty(G_logit ** 2)

    # Probability sensitivity (sparse)
    prob = T.sum(networkOutput * y_oneHot, axis=1)
    G_prob = scaled(T.grad(T.sum(prob), networkInput))
    probSensLoss = penalty(T.abs_(G_prob))

    # Loss sensitivity (sparse), using the one-hot version of the loss
    G_loss = scaled(theano.grad(T.sum(L_oneHot), networkInput))
    lossSensLoss = penalty(T.abs_(G_loss))

    ####### !!!!!!!!!!!!!!!!!!! EXPERIMENTAL !!!!!!!!!!!!!!!!!! ##########
    #### !!!! only makes sense for 2-class problems in this case !!!! ####
    # Clumsy way to regularize logit differences.
    # The one-hot label matrix is replaced by one with +1 in a single
    # column and -1 everywhere else. After summing over each row we are
    # left with the difference between one logit and the (sum of the)
    # other logit(s).
    plusMinusOneMatrix = 2 * lasagne.utils.one_hot(
        1, outputLayer.output_shape[1]) - T.ones_like(y_oneHot)
    logitDiff = T.sum(logitOutput * plusMinusOneMatrix, axis=1)
    G_logitDiff = scaled(T.grad(T.sum(logitDiff), networkInput))
    logitDiffSensLoss = penalty(T.abs_(G_logitDiff))

    # Sum up: only penalties with a truthy weight are added.
    weighted_terms = (
        (l1, l1Loss),
        (l2, l2Loss),
        (logitSens, logitSensLoss),
        (logitDiffSens, logitDiffSensLoss),
        (logitSqSens, logitSqSensLoss),
        (probSens, probSensLoss),
        (lossSens, lossSensLoss),
    )
    totalLoss = classificationLoss
    for weight, term in weighted_terms:
        if weight:
            totalLoss = totalLoss + regMultiplier * weight * term

    return (classificationLoss, totalLoss, l1Loss, l2Loss, logitSensLoss,
            logitDiffSensLoss, logitSqSensLoss, probSensLoss, lossSensLoss)
def modifiedObjective(layers, loss_function, target, aggregate=aggregate,
                      deterministic=False, l1=0, l2=0, logitSens=0,
                      probSens=0, lossSens=0, std=None, get_output_kw=None):
    """
    Modified implementation of the NeuralNet objective.

    :param layers: The underlying layers of the NeuralNetwork
    :param loss_function: The callable loss function to use; it receives
        the network output and the one-hot encoded target
    :param target: the expected output
    :param aggregate: the aggregation function to use
    :param deterministic: Whether or not to get a deterministic output
    :param l1: Optional l1 regularization parameter
    :param l2: Optional l2 regularization parameter
    :param logitSens: Optional weight of the sparse penalty on the input
        gradient of the target-class logit
    :param probSens: Optional weight of the sparse penalty on the input
        gradient of the target-class probability
    :param lossSens: Optional weight of the sparse penalty on the input
        gradient of the loss
    :param std: Optional factor multiplied onto every input gradient
    :param get_output_kw: optional kwargs to pass to
                          :meth:`NeuralNetwork.get_output`
    :return: The total calculated loss
    """
    if get_output_kw is None:
        get_output_kw = {}

    output_layer = layers[-1]
    logit_layer = layers[-2]
    network_input = layers[0].input_var

    network_output = get_output(output_layer, deterministic=deterministic,
                                **get_output_kw)
    logit_output = get_output(logit_layer, deterministic=deterministic,
                              **get_output_kw)

    # The same one-hot encoding feeds the loss and both class-selective
    # penalties.
    one_hot_target = lasagne.utils.one_hot(target,
                                           output_layer.output_shape[1])

    def sparse_penalty(scalar_expr):
        # Aggregated abs input gradient of `scalar_expr`, summed over axes
        # 1-3 and scaled by `std` when one was supplied.
        gradient = T.grad(scalar_expr, network_input)
        if std is not None:
            gradient = std * gradient
        return aggregate(T.sum(T.abs_(gradient), axis=(1, 2, 3)))

    L = loss_function(network_output, one_hot_target)
    loss = aggregate(L)

    if l1:
        loss = loss + regularization.regularize_layer_params(
            layers.values(), regularization.l1) * l1
    if l2:
        loss = loss + regularization.regularize_layer_params(
            layers.values(), regularization.l2) * l2

    # logit sensitivity
    if logitSens:
        logit = T.sum(logit_output * one_hot_target, axis=1)
        loss = loss + sparse_penalty(T.sum(logit)) * logitSens

    # probability sensitivity
    if probSens:
        prob = T.sum(network_output * one_hot_target, axis=1)
        loss = loss + sparse_penalty(T.sum(prob)) * probSens

    # loss sensitivity
    if lossSens:
        loss = loss + sparse_penalty(T.sum(L)) * lossSens

    return loss
def set_network_trainer(input_data, input_mask, target_data, target_mask,
                        num_outputs, network, updater, learning_rate,
                        grad_max_norm=10., l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the training function of ``network``.

    The network output is turned into log-probabilities by a max-shifted
    log-softmax over the last axis. The masked cross-entropy against the
    one-hot targets, divided by the first dimension of the network output,
    plus an L2 penalty weighted by ``l2_lambda``, is the cost that gets
    differentiated.

    :param input_data, input_mask, target_data, target_mask: symbolic
        inputs of the compiled function, in this order
    :param num_outputs: number of classes
    :param network: output layer of the network
    :param updater: callable returning ``(updates, updater_params)``
    :param learning_rate: initial value of the shared learning rate
    :param grad_max_norm: if > 0, gradients go through
        ``total_norm_constraint`` with this maximum norm
    :param l2_lambda: weight of the L2 penalty
    :param load_updater_params: forwarded to ``updater`` as
        ``load_params_dict``
    :return: ``(training_fn, trainer_params)``; ``training_fn`` outputs the
        cost divided by the sum of ``target_mask`` and the gradient norm
    """
    # get one hot target
    one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                                 nb_class=num_outputs,
                                                 dtype=floatX)

    # get network output data
    network_output = get_output(network, deterministic=False)
    num_seqs = network_output.shape[0]

    # log-softmax: subtract the row maximum, then the log of the
    # normalizer
    scores = T.reshape(x=network_output,
                       newshape=(-1, num_outputs),
                       ndim=2)
    scores = scores - T.max(scores, axis=-1, keepdims=True)
    log_probs = scores - T.log(T.sum(T.exp(scores), axis=-1, keepdims=True))

    # masked negative log-likelihood of the target classes
    train_predict_cost = -T.sum(T.mul(one_hot_target_data, log_probs),
                                axis=-1)
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_model_cost = train_predict_cost.sum() / num_seqs
    train_frame_cost = train_predict_cost.sum() / target_mask.sum()

    # get regularizer cost
    train_regularizer_cost = regularize_network_params(
        network, penalty=l2) * l2_lambda

    # get network parameters and gradients
    network_params = get_all_params(network, trainable=True)
    network_grads = theano.grad(
        cost=train_model_cost + train_regularizer_cost,
        wrt=network_params)

    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=grad_max_norm,
            return_norm=True)
    else:
        network_grads_norm = T.sqrt(
            sum(T.sum(grad ** 2) for grad in network_grads))

    # set updater
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, network_grads_norm],
        updates=train_updates)
    return training_fn, trainer_params
(onefeatureplane.dimshuffle('x', 0, 1) * featuremap).sum(2).sum(1), outputs_info=None, sequences=[expression[0, :, :, :]], non_sequences=expression[0, :, :, :]) layer_style_cost.append( ((grammian_testimage - grammian_original)**2).sum() / (2 * (styleimage_layer.shape[2] * styleimage_layer.shape[3])**2 * (styleimage_layer.shape[1])**2)) #layer_style_cost_function.append(theano.function([input_var], layer_style_cost[-1])) #DEFINE TOTAL COST AS WEIGHTED SUM OF CONTENT AND STYLE COST totalcost += contentweights[layerindex] * layer_content_cost[ layerindex] + styleweights[layerindex] * layer_style_cost[layerindex] totalgrad = theano.grad(totalcost, input_var) #COMPILE THEANO FUNCTIONS: cost = theano.function([input_var], totalcost) grad = theano.function([input_var], totalgrad) #CONJGRAD BASED OPTIMIZATION FOR POTENTIALLY FASTER OPTIMIZATION (REQUIRES minimize.py): def conjgrad(im, maxnumlinesearch=10, imshape=styleimage.shape): import minimize im_flat, fs, numlinesearches = minimize.minimize( im.flatten(), lambda x: cost(x.reshape(imshape)), lambda x: grad(x.reshape(imshape)).flatten(), args=[], maxnumlinesearch=maxnumlinesearch,
# NOTE(review): the statements down to the print read `layer`; presumably
# they are the tail of a per-layer loop whose header precedes this chunk --
# confirm before re-indenting.
name = layer.__class__.__name__
num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
num_param = num_param.__str__()
print(' %s %s %s' % (name, num_param, layer.output_shape))

# Targets: every column of x except the first, flattened to class indices.
y = T.cast(T.flatten(x[:, 1:]), 'int32')

# training loss: masked negative log-likelihood, summed over axis 1 and
# averaged over axis 0
p1 = T.reshape(T.log(predictions[T.arange(y.shape[0]), y]), mask.shape)
loss = -1. * T.mean(T.sum(mask * p1, axis=1), axis=0)

# validation loss (with disabled dropout)
p1_det = T.reshape(T.log(predictions_det[T.arange(y.shape[0]), y]),
                   mask.shape)
loss_det = -1. * T.mean(T.sum(mask * p1_det, axis=1), axis=0)

# The optimizer is given the shared variable rather than the Python float
# it was initialised from; with the float, later changes to `learning_rate`
# could never reach the compiled updates.
learning_rate = theano.shared(np.float32(config.learning_rate))
grads = theano.grad(loss, all_params)
updates = nn.updates.rmsprop(grads, all_params, learning_rate)

train = theano.function([x, mask], loss, updates=updates)
validate = theano.function([x, mask], loss_det)


def create_batch(idxs):
    """Pack the tunes selected by `idxs` into zero-padded arrays.

    Returns ``(x, mask)``: ``x`` has shape (config.batch_size, longest
    selected tune) and holds each tune left-aligned; ``mask`` is one
    column narrower and is 1 on the first ``tune_lens[j] - 1`` positions
    of each row.
    """
    max_seq_len = max(len(tunes[i]) for i in idxs)
    x = np.zeros((config.batch_size, max_seq_len), dtype='float32')
    mask = np.zeros((config.batch_size, max_seq_len - 1), dtype='float32')
    for i, j in enumerate(idxs):
        x[i, :tune_lens[j]] = tunes[j]
        mask[i, :tune_lens[j] - 1] = 1
    return x, mask
def train_function(self, semi_supervised=True, unlabel_stable=False):
    """Compile and return the Theano function that performs a training step.

    semi_supervised: when True, the cost from ``self.cost_unlabel`` is added
        to the objective and the compiled function also takes the unlabeled
        batch as input. The flag is stored on ``self.semi_supervised``.
    unlabel_stable: forwarded to ``self.cost_unlabel`` as ``dev_stage``.

    The objective is

        (recons_l * n_l + recons_u * n_u
         + klw * (kl_l * n_l + kl_u * n_u)
         + alpha * classifier_loss * n_l
         - entropy_u * n_u) / (n_l + n_u)

    with alpha = cw * self.cost_beta * (n_l + n_u) / n_l, where n_l and n_u
    are the first-dimension sizes of the labeled and unlabeled inputs.
    Gradients are clipped element-wise, passed through
    ``total_norm_constraint``, then handed to ``adam``.
    """
    self.semi_supervised = semi_supervised
    float_x = theano.config.floatX

    # Warm-up weights supplied at call time (KL term, classifier term).
    sym_klw = T.scalar('sym_klw', dtype=float_x)
    sym_cw = T.scalar('sym_cw', dtype=float_x)

    # Labeled batch inputs, then unlabeled batch inputs.
    sym_s = T.matrix('sym_s', dtype='int64')
    sym_mask = T.matrix('sym_mask', dtype=float_x)
    sym_y = T.matrix('sym_label', dtype=float_x)
    sym_s_u = T.matrix('sym_s_u', dtype='int64')
    sym_mask_u = T.matrix('sym_mask_u', dtype=float_x)

    num_l = sym_s.shape[0].astype(float_x)
    num_u = 0.0
    if self.semi_supervised:
        print('Train with unlabel data.')
        num_u = sym_s_u.shape[0].astype(float_x)

    # Labeled cost terms.
    labeled_outs = self.cost_label([sym_s, sym_mask, sym_y],
                                   dev_stage=False,
                                   return_mode='mean')
    (loss_recons, loss_kl, valid_words, word_drop_num, loss_classifier,
     batch_ppl, acc) = labeled_outs

    # Unlabeled cost terms; plain zeros when training is fully supervised.
    if self.semi_supervised:
        unlabeled_outs = self.cost_unlabel([sym_s_u, sym_mask_u],
                                           dev_stage=unlabel_stable,
                                           sample_by_prob=self.sample_unlabel)
        (loss_recons_u, loss_kl_u, valid_words_u, loss_entropy_u,
         batch_ppl_u) = unlabeled_outs
    else:
        loss_recons_u = loss_kl_u = loss_entropy_u = batch_ppl_u = 0.0
        valid_words_u = 0

    # Total loss = reconstruction + warmed-up KL + weighted classifier loss
    # - unlabeled entropy, normalised by the combined batch size.
    alpha = sym_cw * self.cost_beta * (num_l + num_u) / num_l
    recons_term = loss_recons * num_l + loss_recons_u * num_u
    kl_term = sym_klw * (loss_kl * num_l + loss_kl_u * num_u)
    classifier_term = alpha * loss_classifier * num_l
    entropy_term = loss_entropy_u * num_u
    total_cost = recons_term + kl_term + classifier_term - entropy_term
    total_cost = total_cost / (num_l + num_u)

    # Element-wise clipping first, global norm constraint second.
    train_params = self.get_params(only_trainable=True)
    clipped_grads = []
    for raw_grad in theano.grad(total_cost, train_params):
        clipped_grads.append(
            T.clip(raw_grad, -self.grad_clipping, self.grad_clipping))
    all_grads = total_norm_constraint(clipped_grads, max_norm=self.max_norm)
    updates = adam(all_grads, train_params, self.lr, self.beta1, self.beta2)

    labeled_inputs = [sym_s, sym_mask, sym_y]
    warmup_inputs = [sym_klw, sym_cw]
    if self.semi_supervised:
        train_input = labeled_inputs + [sym_s_u, sym_mask_u] + warmup_inputs
        train_output = [
            total_cost, loss_recons, loss_recons_u, loss_kl, loss_kl_u,
            alpha, loss_classifier, loss_entropy_u, batch_ppl, batch_ppl_u,
            valid_words, valid_words_u, word_drop_num, acc
        ]
    else:
        train_input = labeled_inputs + warmup_inputs
        train_output = [
            total_cost, loss_recons, loss_kl, loss_classifier, batch_ppl,
            valid_words, word_drop_num, acc
        ]

    return theano.function(inputs=train_input,
                           outputs=train_output,
                           updates=updates,
                           name='train_function')
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) snap_dist_info_vars = snap_policy.dist_info_sym(observations_var) surr = TT.sum( -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var) params = policy.get_params(trainable=True) snap_params = snap_policy.get_params(trainable=True) importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP( actions_var, snap_dist_info_vars, dist_info_vars) grad = theano.grad(surr, params) eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype) eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype) eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype) eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype) surr_on1 = TT.sum( -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var * importance_weights_var) surr_on2 = TT.sum( snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var) grad_SVRG = [ sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4], theano.grad(surr_on1, params),
def _setup(self):
    """Build the symbolic model and compile its Theano functions.

    Creates the symbolic inputs, five embedding -> convolution -> sum-pooling
    branches (source document, surface context, surface text, target title,
    target body), a weighted combination of cosine similarities between the
    branches, a linear layer over the denotation indicator features, the
    grouped log-sum-exp normalisation and loss, the adadelta updates, and
    finally compiles ``self.train_func``, ``self.test_func`` and
    ``self.find_conv_active_func``.

    NOTE(review): this method reads the free name ``disable_convs``, which is
    not defined inside the method -- presumably a module-level collection of
    indices; confirm where it comes from.
    """
    self.all_params = []
    self.all_conv_results = []
    self.all_conv_pool_results = []
    self.all_conv_names = []

    # ---- symbolic inputs ----
    # words from the source document
    self.x_document_input = T.imatrix(
        'x_doc')
    # index of which source document this is from
    self.x_document_id = T.ivector(
        'x_doc_id')
    # text of the surface link
    self.x_surface_text_input = T.imatrix(
        'x_surface_link')
    # words surrounding the surface link
    self.x_surface_context_input = T.imatrix(
        'x_surface_cxt')
    # id of the target vector
    self.x_target_input = T.ivector('x_target')
    # words from the target title link
    self.x_target_words = T.imatrix(
        'x_target_words')
    # indicator if the target title matches the surface
    self.x_matches_surface = T.ivector(
        'x_match_surface'
    )
    # info about the link counts
    self.x_matches_counts = T.imatrix(
        'x_matches_counts')
    # words from the body of target document
    self.x_target_document_words = T.imatrix(
        'x_target_document_words'
    )
    # index of what link to compare to in the matrix
    self.x_link_id = T.ivector(
        'x_link_id')
    # the joint denotation query features
    self.x_denotaiton_features = T.matrix(
        'x_denotation_ind_feats',
        dtype='int8')
    # the query features
    self.x_query_featurs = T.matrix('x_query_ind_feats',
                                    dtype='int8')
    # the query that a denotation links to
    self.x_query_link_id = T.ivector(
        'x_match_query')
    # the range of joint denotations to sum over
    self.x_denotation_ranges = T.imatrix(
        'x_denotation_ranges'
    )
    # the target document that matches with a given denotation
    self.x_target_link_id = T.ivector(
        'x_match_target'
    )
    # is 1 if the gold item, 0 otherwise
    self.y_isgold = T.vector(
        'y_gold', dtype='int8')
    # matrix containing [start_idx, end_idx, gold_idx]
    self.y_grouping = T.imatrix(
        'y_grouping')

    # Shared embedding matrices, cast to floatX.
    self.embedding_W = theano.shared(
        self.wordvecs.get_numpy_matrix().astype(theano.config.floatX),
        name='embedding_W')
    self.embedding_W_docs = theano.shared(
        self.documentvecs.get_numpy_matrix().astype(theano.config.floatX),
        name='embedding_W_docs')

    def augRectify(x):
        # Leaky rectifier max(x, -0.01 * x).
        # if x is zero, then the gradient fails due to computation: x / |x|
        return T.maximum(x, -.01 * x)

    simpleConvNonLin = augRectify

    ##########################################
    ## source document
    self.document_l = lasagne.layers.InputLayer(
        (None, self.document_length), input_var=self.x_document_input)
    self.document_embedding_l = EmbeddingLayer(
        self.document_l,
        W=self.embedding_W,
        add_word_params=self.enable_train_wordvecs,
    )
    self.document_simple_conv1_l = lasagne.layers.Conv2DLayer(
        self.document_embedding_l,
        num_filters=self.dim_compared_vec,
        filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        name='document_simple_conv',
        nonlinearity=simpleConvNonLin,
    )
    self.document_simple_sum_l = lasagne.layers.Pool2DLayer(
        self.document_simple_conv1_l,
        name='document_simple_pool',
        pool_size=(self.document_length - self.num_words_to_use_conv, 1),
        mode='sum',
    )
    self.all_conv_pool_results.append(
        lasagne.layers.get_output(self.document_simple_sum_l))
    # Flatten everything after the first axis.
    self.document_output = lasagne.layers.get_output(
        lasagne.layers.reshape(self.document_simple_sum_l, ([0], -1)))
    self.all_params += lasagne.layers.get_all_params(
        self.document_simple_sum_l)

    ##########################################
    ## surface text
    self.surface_context_l = lasagne.layers.InputLayer(
        (None, self.sentence_length),
        input_var=self.x_surface_context_input,
    )
    self.surface_context_embedding_l = EmbeddingLayer(
        self.surface_context_l,
        W=self.embedding_W,
        add_word_params=self.enable_train_wordvecs,
    )
    self.surface_context_conv1_l = lasagne.layers.Conv2DLayer(
        self.surface_context_embedding_l,
        num_filters=self.dim_compared_vec,
        filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        name='surface_cxt_conv1',
        nonlinearity=simpleConvNonLin,
    )
    self.surface_context_pool1_l = lasagne.layers.Pool2DLayer(
        self.surface_context_conv1_l,
        name='surface_cxt_pool1',
        pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
        mode='sum',  # WAS 'MAX' FOR SOME REASON
    )
    self.all_conv_pool_results.append(
        lasagne.layers.get_output(self.surface_context_pool1_l))
    self.surface_output = lasagne.layers.get_output(
        lasagne.layers.reshape(self.surface_context_pool1_l, ([0], -1)))
    self.all_params += lasagne.layers.get_all_params(
        self.surface_context_pool1_l)

    # words of the surface link itself
    self.surface_input_l = lasagne.layers.InputLayer(
        (None, self.sentence_length_short),
        input_var=self.x_surface_text_input)
    self.surface_embedding_l = EmbeddingLayer(
        self.surface_input_l,
        W=self.embedding_W,
        add_word_params=self.enable_train_wordvecs,
    )
    self.surface_conv1_l = lasagne.layers.Conv2DLayer(
        self.surface_embedding_l,
        num_filters=self.dim_compared_vec,
        filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        name='surface_conv1',
        nonlinearity=simpleConvNonLin,
    )
    self.surface_pool1_l = lasagne.layers.Pool2DLayer(
        self.surface_conv1_l,
        name='surface_pool1',
        pool_size=(self.sentence_length_short - self.num_words_to_use_conv,
                   1),
        mode='sum',
    )
    self.all_conv_pool_results.append(
        lasagne.layers.get_output(self.surface_pool1_l))
    self.surface_words_output = lasagne.layers.get_output(
        lasagne.layers.reshape(self.surface_pool1_l, ([0], -1)))
    self.all_params += lasagne.layers.get_all_params(self.surface_pool1_l)

    ###################################################
    ## dealing with the target side

    # matched_surface_reshaped = self.x_matches_surface.reshape(
    #     (self.x_matches_surface.shape[0], 1, 1, 1)).astype(theano.config.floatX)

    self.target_input_l = lasagne.layers.InputLayer(
        (None, ), input_var=self.x_target_input)

    #################################
    ## target indicators features
    ## these have been replaced with the indicators as provided by the scala system

    # self.target_matched_surface_input_l = lasagne.layers.InputLayer(
    #     (None,1,1,1),
    #     input_var=matched_surface_reshaped,
    # )
    # self.target_matched_counts_input_l = lasagne.layers.InputLayer(
    #     (None,5),
    #     input_var=self.x_matches_counts.astype(theano.config.floatX),
    # )

    # words from the title of the target
    self.target_words_input_l = lasagne.layers.InputLayer(
        (None, self.sentence_length_short),
        input_var=self.x_target_words,
    )
    self.target_words_embedding_l = EmbeddingLayer(
        self.target_words_input_l,
        W=self.embedding_W,
        add_word_params=self.enable_train_wordvecs,
    )
    self.target_words_conv1_l = lasagne.layers.Conv2DLayer(
        self.target_words_embedding_l,
        name='target_wrds_conv1',
        filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        num_filters=self.dim_compared_vec,
        nonlinearity=simpleConvNonLin,
    )
    self.target_words_pool1_l = lasagne.layers.Pool2DLayer(
        self.target_words_conv1_l,
        name='target_wrds_pool1',
        pool_size=(self.sentence_length_short - self.num_words_to_use_conv,
                   1),
        mode='sum',
    )
    self.all_conv_pool_results.append(
        lasagne.layers.get_output(self.target_words_pool1_l))
    self.target_title_out = lasagne.layers.get_output(
        lasagne.layers.reshape(self.target_words_pool1_l, ([0], -1)))
    self.all_params += lasagne.layers.get_all_params(
        self.target_words_pool1_l)

    # words from the body of the target
    self.target_body_words_input_l = lasagne.layers.InputLayer(
        (None, self.sentence_length),
        input_var=self.x_target_document_words,
    )
    self.target_body_words_embedding_l = EmbeddingLayer(
        self.target_body_words_input_l,
        W=self.embedding_W,
        add_word_params=self.enable_train_wordvecs,
    )
    self.target_body_simple_conv1_l = lasagne.layers.Conv2DLayer(
        self.target_body_words_embedding_l,
        name='target_body_simple_conv',
        filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        num_filters=self.dim_compared_vec,
        nonlinearity=simpleConvNonLin,
    )
    self.target_body_simple_sum_l = lasagne.layers.Pool2DLayer(
        self.target_body_simple_conv1_l,
        name='target_body_simple_sum',
        pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
        mode='sum',
    )
    self.all_conv_pool_results.append(
        lasagne.layers.get_output(self.target_body_simple_sum_l))
    self.target_out = lasagne.layers.get_output(
        lasagne.layers.reshape(self.target_body_simple_sum_l, ([0], -1)))
    self.all_params += lasagne.layers.get_all_params(
        self.target_body_simple_sum_l)

    #########################################################
    ## compute the cosine distance between the two layers

    # there are going to be multiple entity links per document, so we have
    # the `_id` ivectors that represent how we need to reshuffle the inputs;
    # this saves on computation

    # source body
    self.source_aligned_l = self.document_output[self.x_document_id, :][
        self.x_link_id, :]
    # source context
    self.source_context_aligned_l = self.surface_output[self.x_link_id, :]
    # source surface words
    self.source_surface_words_aligned_l = self.surface_words_output[
        self.x_link_id, :]

    def augNorm(v):
        # Row-wise sqrt(sum(|v|^2) + 0.001); the added constant keeps the
        # result strictly positive.
        return T.basic.pow(
            T.basic.pow(T.basic.abs_(v), 2).sum(axis=1) + .001, .5)

    def cosinsim(a, b):
        # Batched dot product divided by the product of the augmented norms.
        dotted = T.batched_dot(a, b)
        return dotted / (augNorm(a) * augNorm(b))

    def comparedVLayers(a, b):
        # Wrap the similarity, reshaped to one column, as an InputLayer so it
        # can be fed to further lasagne layers.
        dv = cosinsim(a, b)
        return lasagne.layers.InputLayer((None, 1),
                                         input_var=dv.reshape(
                                             (dv.shape[0], 1)))

    # All six target/source pairings; pairing i is dropped when i is in
    # disable_convs.
    self.cosine_conv_layers = []
    for i, l in enumerate([
            comparedVLayers(self.target_out, self.source_aligned_l),
            comparedVLayers(self.target_out, self.source_context_aligned_l),
            comparedVLayers(self.target_out,
                            self.source_surface_words_aligned_l),
            comparedVLayers(self.target_title_out, self.source_aligned_l),
            comparedVLayers(self.target_title_out,
                            self.source_context_aligned_l),
            comparedVLayers(self.target_title_out,
                            self.source_surface_words_aligned_l),
    ]):
        if i not in disable_convs:
            self.cosine_conv_layers.append(l)

    if len(self.cosine_conv_layers) != 0:
        self.cosine_combined = lasagne.layers.concat(
            self.cosine_conv_layers, axis=1)
        # Single linear unit without bias over the concatenated similarities.
        self.cosine_weighted = lasagne.layers.DenseLayer(
            self.cosine_combined,
            name='cosine_dens1',
            num_units=1,
            b=None,
            nonlinearity=lasagne.nonlinearities.linear,
        )
        # encourage these weights to be positive: 1 is added in place to the
        # initial weight values
        self.cosine_weighted.W.get_value(borrow=True)[:] += 1
        self.cosine_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.cosine_weighted, (-1, )))
        self.all_params += lasagne.layers.get_all_params(
            self.cosine_weighted)
        self.aligned_cosine = self.cosine_output[self.x_target_link_id]

    ######################################################
    ## indicator feature input
    self.query_feat_l = lasagne.layers.InputLayer(
        (None, self.num_indicator_features),
        input_var=self.x_query_featurs,
    )

    #rank_feats = [f[0] for f in enumerate(featuresNames) if f[1].startswith('Rank=')]

    self.denotation_join_feat_l = lasagne.layers.InputLayer(
        (None, self.num_indicator_features),
        input_var=self.x_denotaiton_features,  #[:, rank_feats],
    )

    ## the query and denotation features are now combined when input into the same denotation vector
    # self.query_layer_l = lasagne.layers.DenseLayer(
    #     self.query_feat_l,
    #     name='query_lin',
    #     num_units=1,
    #     nonlinearity=lasagne.nonlinearities.linear,
    # )
    # self.query_output = lasagne.layers.get_output(
    #     lasagne.layers.reshape(self.query_layer_l, (-1,))
    # )
    # self.all_params += lasagne.layers.get_all_params(self.query_layer_l)
    # self.aligned_queries = self.query_output[self.x_query_link_id]

    self.denotation_layer_l = lasagne.layers.DenseLayer(
        self.denotation_join_feat_l,
        name='denotation_lin',
        num_units=1,
        nonlinearity=lasagne.nonlinearities.linear,
        #W=self.query_layer_l.W,
    )
    self.denotation_output = lasagne.layers.get_output(
        lasagne.layers.reshape(self.denotation_layer_l, (-1, )))
    self.all_params += lasagne.layers.get_all_params(
        self.denotation_layer_l)

    ###########################
    ## combine the two parts of the join scores (they are added; each part
    ## is replaced by 0 when it is disabled)
    self.unmerged_scores = (
        (  #(self.aligned_queries) +
            (self.denotation_output if 1000 not in disable_convs else 0)) +
        (self.aligned_cosine if len(self.cosine_conv_layers) != 0 else 0))

    #############################################
    ## normalizing the scores and recombining
    ## the output if there were multiple entries
    ## for the same target document
    #############################################

    def sloppyMathLogSum(vals):
        # log(sum(exp(vals))) computed after subtracting the maximum.
        m = vals.max()
        return T.log(T.exp(vals - m).sum()) + m

    def mergingSum(indx, unmerged):
        # Log-sum-exp over unmerged[indx[0]:indx[1]].
        return sloppyMathLogSum(unmerged[T.arange(indx[0], indx[1])])

    self.merged_scores, _ = theano.scan(
        mergingSum,
        sequences=[self.x_denotation_ranges],
        non_sequences=[self.unmerged_scores])

    ########################################
    ## true output values
    ########################################
    self.unscaled_output = self.merged_scores

    def scaleRes(indx, outputs, res):
        # Subtract the group's log-sum-exp from every score in the group and
        # write the result into the running output vector.
        ran = T.arange(indx[0], indx[1])
        s = sloppyMathLogSum(res[ran])
        return T.set_subtensor(outputs[ran], res[ran] - s)

    self.scaled_scores, _ = theano.scan(
        scaleRes,
        sequences=[self.y_grouping],
        non_sequences=[self.unscaled_output],
        outputs_info=T.zeros((self.unscaled_output.shape[0], )))

    # The last scan step holds the vector with every group filled in.
    self.true_output = self.scaled_scores[-1]

    ############################
    ## compute the loss
    ############################
    def lossSum(indx, res):
        # Log-sum-exp over one group's scores.
        return sloppyMathLogSum(res[T.arange(indx[0], indx[1])])

    self.groupped_res, _ = theano.scan(
        lossSum,
        sequences=[self.y_grouping],
        non_sequences=[self.true_output],
    )

    def selectGolds(indx, res, golds):
        r = T.arange(indx[0], indx[1])
        # fix some issue with theano?
        # the gold value should simply come from the input
        # so there is no good reason to have to disconnect the gradient here
        gs = theano.gradient.disconnected_grad(golds[r])
        # Non-gold entries get -1000000, i.e. approx 0 after exp().
        vals = gs * res[r] + (1 - gs) * -1000000  # approx 0
        return sloppyMathLogSum(vals)

    self.gold_res, _ = theano.scan(
        selectGolds,
        sequences=[self.y_grouping],
        non_sequences=[self.true_output, self.y_isgold],
    )

    # Per-group loss: log-sum-exp of all scores minus that of the gold ones.
    self.loss_vec = self.groupped_res - self.gold_res
    self.loss_scalar = self.loss_vec.sum()

    # adadelta on the mean per-group loss; parameters that are not connected
    # to the loss only produce a warning.
    self.updates = lasagne.updates.adadelta(
        theano.grad(self.loss_scalar / self.loss_vec.shape[0],
                    self.all_params,
                    disconnected_inputs='warn'), self.all_params)

    # Positional input order shared by all compiled functions below.
    self.func_inputs = [
        self.x_document_input,
        self.x_surface_text_input,
        self.x_surface_context_input,
        self.x_document_id,
        self.x_target_input,
        self.x_matches_surface,
        self.x_matches_counts,
        self.x_link_id,
        self.x_target_words,
        self.x_target_document_words,
        self.x_denotaiton_features,
        self.x_query_featurs,
        self.x_query_link_id,
        self.x_denotation_ranges,
        self.x_target_link_id,
        self.y_grouping,
        self.y_isgold,
    ]

    # Note: the second and third outputs are the same quantity
    # (loss_scalar is defined above as loss_vec.sum()).
    self.func_outputs = [
        self.true_output,
        self.loss_vec.sum(),
        self.loss_scalar,
        self.loss_vec,
        #self.res_l,
    ]

    # Raw convolution outputs, exposed through find_conv_active_func.
    dsc_out = lasagne.layers.get_output(self.document_simple_conv1_l)
    scc_out = lasagne.layers.get_output(self.surface_context_conv1_l)
    sc_out = lasagne.layers.get_output(self.surface_conv1_l)
    ttc_out = lasagne.layers.get_output(self.target_words_conv1_l)
    tbc_out = lasagne.layers.get_output(self.target_body_simple_conv1_l)

    # def cmp_convs(input, against):
    #     #T.dot(

    self.all_conv_names.append('document_conv')
    self.all_conv_results.append(
        dsc_out
    )  #cmp_convs(dsc_out[self.x_document_id][self.x_link_id], [ttc_out, tbc_out]))

    self.all_conv_names.append('surface_context_conv')
    self.all_conv_results.append(
        scc_out)  #cmp_convs(scc_out[self.x_link_id], [ttc_outp, tbc_out]))

    self.all_conv_names.append('surface_conv')
    self.all_conv_results.append(sc_out)

    self.all_conv_names.append('target_title_conv')
    self.all_conv_results.append(ttc_out)

    self.all_conv_names.append('target_body_conv')
    self.all_conv_results.append(tbc_out)

    # Training step: same outputs as test_func, plus the parameter updates.
    # Inputs that the graph does not use are ignored rather than rejected.
    self.train_func = theano.function(
        self.func_inputs,
        self.func_outputs,
        updates=self.updates,
        on_unused_input='ignore',
    )

    self.test_func = theano.function(
        self.func_inputs,
        self.func_outputs,
        on_unused_input='ignore',
    )

    self.find_conv_active_func = theano.function(
        self.func_inputs,
        self.all_conv_results,
        on_unused_input='ignore',
    )
def build_model(self, Dir_features, args):
    """Build the training and evaluation Theano functions for the network.

    Args:
        Dir_features: forwarded unchanged to ``self._set_model_param``.
        args: object providing ``lr`` (initial learning rate) and
            ``optimizer`` (one of 'rmsprop', 'adam', 'sgd', 'adadelta',
            'adagrad').

    Returns:
        (f_train, f_eval): compiled functions taking the network input, the
        label matrix and the mask input. ``f_train`` returns
        ``[cost, output_train]`` and applies the parameter updates;
        ``f_eval`` returns ``[cost_eval, output_eval]``.

    Raises:
        ValueError: if ``args.optimizer`` is not one of the supported names.
    """
    self._set_model_param(Dir_features)

    # try to scale the gradients on the level of parameters like caffe
    # by now only change the code with sgd
    scale_grad = True
    TOL = 1e-5  # added to the network output before the CTC cost

    sym_y = T.imatrix()

    # W is regularizable, b is not regularizable (correspondence with caffe)
    if scale_grad:
        # Per-parameter gradient multipliers, stored on the shared
        # variables' ``tag`` and read back after theano.grad below.
        for layer_name in ('conv1a', 'conv2a', 'conv3a', 'conv3b', 'conv4a',
                           'conv4b', 'conv5a', 'conv5b', 'fc6-1'):
            self.net[layer_name].b.tag.grad_scale = 2
        self.net['fc8-1'].W.tag.grad_scale = 10
        self.net['fc8-1'].b.tag.grad_scale = 20

    output_train = lasagne.layers.get_output(self.net['prob'],
                                             deterministic=False)
    output_eval = lasagne.layers.get_output(self.net['prob'],
                                            deterministic=True)

    ##############
    # compute cost
    ##############
    # compute the cost for training
    output_flat = T.reshape(
        output_train, (self.batch_size, self.clip_length, self.num_classes))
    cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

    # L2 penalty over all layers of the network, weighted by l2_w
    l2_w = 0.0005
    all_layers = lasagne.layers.get_all_layers(self.net['prob'])
    l2_penalty = lasagne.regularization.regularize_layer_params(
        all_layers, lasagne.regularization.l2) * l2_w
    cost += l2_penalty

    # compute the cost for evaluation (no L2 penalty)
    output_eval_flat = T.reshape(
        output_eval,
        (self.num_batch_eval, self.clip_length, self.num_classes))
    cost_eval = T.mean(ctc_cost.cost(output_eval_flat + TOL, sym_y))

    trainable_params = lasagne.layers.get_all_params(self.net['prob'],
                                                     trainable=True)
    sh_lr = theano.shared(lasagne.utils.floatX(args.lr))

    ##################################################################
    # try to scale the gradients on the level of parameters like caffe
    # by now only change the code with sgd
    ##################################################################
    if scale_grad:
        grads = theano.grad(cost, trainable_params)
        for idx, param in enumerate(trainable_params):
            # Fix: the multiplier is stored on each parameter's tag; the
            # previous code looked it up on the parameter *list*, which
            # always yielded the default 1, so no gradient was ever scaled.
            grad_scale = getattr(param.tag, 'grad_scale', 1)
            if grad_scale != 1:
                grads[idx] *= grad_scale

    #################
    # compute updates
    #################
    # adam works with lr 0.001
    # These optimizers are built from the cost and wrapped with momentum;
    # the scaled gradients are deliberately used for sgd only.
    wrapped_optimizers = {
        'rmsprop': lasagne.updates.rmsprop,
        'adam': lasagne.updates.adam,
        'adadelta': lasagne.updates.adadelta,
        'adagrad': lasagne.updates.adagrad,
    }
    if args.optimizer == 'sgd':
        # Stochastic Gradient Descent (SGD) with momentum
        loss_or_grads = grads if scale_grad else cost
        updates = lasagne.updates.momentum(loss_or_grads,
                                           trainable_params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)
    elif args.optimizer in wrapped_optimizers:
        updates_opt = wrapped_optimizers[args.optimizer](
            cost, trainable_params, learning_rate=sh_lr)
        updates = lasagne.updates.apply_momentum(updates_opt,
                                                 trainable_params,
                                                 momentum=0.9)
    else:
        # Previously an unknown name left `updates` unbound and failed later
        # with an unrelated-looking error.
        raise ValueError('unsupported optimizer: %r' % (args.optimizer, ))

    #############################
    # set train and eval function
    #############################
    f_train = theano.function(
        [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
        [cost, output_train],
        updates=updates)

    f_eval = theano.function(
        [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
        [cost_eval, output_eval])

    return f_train, f_eval
def jax_model_and_grad(x): return jax_model(x), jax.grad(jax_model)(x) def jax_logp_dlogp_func(x): v, g = jax_model_and_grad(x) return np.asarray(v), np.asarray(g) with pm.Model() as pm_model: pm_params = pm.Flat("pm_params", shape=3) mean = pm_params[0] * x + pm_params[1] pm.Normal("obs", mu=mean, sigma=pm.math.exp(pm_params[2]), observed=y_obs) pm_model_and_grad = pm_model.fastfn([pm_model.logpt] + theano.grad(pm_model.logpt, pm_model.vars)) def pm_logp_dlogp_func(x): return pm_model_and_grad(pm_model.bijection.rmap(x)) @pytest.mark.parametrize( "framework", ["pytorch", "jax", "pymc3"], ) def test_multiprocessing_with_various_frameworks(framework): logp_dlogp_funcs = { "pytorch": torch_logp_dlogp_func, "jax": jax_logp_dlogp_func, "pymc3": pm_logp_dlogp_func,
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) snap_dist_info_vars = snap_policy.dist_info_sym(observations_var) surr = TT.sum( -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var) params = policy.get_params(trainable=True) snap_params = snap_policy.get_params(trainable=True) importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP( actions_var, dist_info_vars, snap_dist_info_vars) grad = theano.grad(surr, params) eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype) eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype) eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype) eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype) eval_grad5 = TT.vector('eval_grad5', dtype=grad[3].dtype) surr_on1 = TT.sum( dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var * importance_weights_var) surr_on2 = TT.sum( -snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var) grad_imp = theano.grad(surr_on1, snap_params)
sharedY = theano.shared(np.random.randn( bs, no, (ih - kh) / dh + 1, (iw - kw) / dw + 1).astype('float32'), name='sharedY') sharedW = theano.shared( np.random.randn(*filter_shape).astype('float32'), name='sharedW') except MemoryError, e: print "SKIPPING config due to the memory error below" print e continue X = theano.tensor.tensor4('X') Y = theano.tensor.nnet.conv.conv2d(X, sharedW, input_shape, filter_shape, subsample=(dh, dw)) gW = theano.grad(None, wrt=sharedW, known_grads={Y: sharedY}) gX = theano.grad(None, wrt=X, known_grads={Y: sharedY}) # if 'legacy' not in skip_tests: # benchmark_three_ways('theano.tensor.nnet.conv.conv2d', # sharedX, sharedY, sharedW, X, Y, gW, gX, # mode.excluding('conv_gemm', 'conv_dnn')) # benchmark Theano meta-optimizer # Mimic THEANO_FLAGS=optimizer_including=conv_meta # if 'meta' not in skip_tests: # benchmark_three_ways('(experimental) meta-optimizer', # sharedX, sharedY, sharedW, X, Y, gW, gX, # mode.including('conv_meta')) # benchmark Theano FFT convolution # Mimic THEANO_FLAGS=optimizer_including=conv_fft
controller = build_controller() controller_parameters = lasagne.layers.helper.get_all_params( controller["output"]) states, all_parameters, updates = build_model() fitness = build_objectives(states) fitness = T.switch(T.isnan(fitness) + T.isinf(fitness), np.float32(0), fitness) #import theano.printing #theano.printing.debugprint(T.mean(fitness), print_type=True) print "Finding gradient since %s..." % strftime("%H:%M:%S", localtime()) loss = -T.mean(fitness) grads = theano.grad(loss, all_parameters) grads = lasagne.updates.total_norm_constraint(grads, 1.0) grads = [T.switch(T.isnan(g) + T.isinf(g), np.float32(0), g) for g in grads] #grad_norm = T.sqrt(T.sum([(g**2).sum() for g in theano.grad(loss, all_parameters)])+1e-9) #theano_to_print.append(grad_norm) updates.update(lasagne.updates.adam(grads, all_parameters, 0.0001)) # we maximize fitness print "Compiling since %s..." % strftime("%H:%M:%S", localtime()) iter_test = theano.function([], [states[1], states[2], states[3]]) st = iter_test() with open("state-dump-%s.pkl" % EXP_NAME, 'wb') as f: pickle.dump({ "states": st, "json": open(jsonfile, "rb").read()
def main():
    """Draw GP samples on a grid and time two compiled gradient functions.

    Builds a small synthetic regression problem (10 points), compiles a
    Theano function that produces posterior-mean-plus-Lanczos samples on a
    1025-point grid, prints the sample array shape, then times ten calls each
    of two gradient functions and returns.

    NOTE: everything after the first bare ``return`` (the gradient check and
    the plotting code) is currently unreachable.
    """
    from fast_gp import sparse_w

    # Fixed seed so the synthetic data and random projections are repeatable.
    np.random.seed(0)
    n_data = 10
    x = np.random.uniform(size=n_data)
    #x = np.float32(x)
    x = np.sort(x)

    # Covariance a * exp(-b * (x_i - x_j)^2) plus c on the diagonal.
    a = .1
    b = 10
    c = .001
    mu = np.zeros(n_data)
    cov = a * np.exp(-b * (x[:, np.newaxis] - x)**2) + c * np.eye(n_data)
    y = np.random.multivariate_normal(mu, cov)
    #print x
    #print y

    # Evenly spaced grid `u` extending `margin` beyond the data on each side.
    x_min, x_max = x.min(), x.max()
    #len_u = 2048 + 1
    len_u = 1024 + 1
    #len_u = 128 + 1
    #len_u = 64
    extra_u = 2
    margin = (x_max - x_min) / (len_u - extra_u * 2) * 2
    u = np.linspace(x_min - margin, x_max + margin, len_u)
    # Test locations: every grid point except the first.
    x_test = u[1:]
    #x_test = np.linspace(x_min, x_max, 20)

    # presumably index/weight pairs relating x to the grid u -- verify
    # against fast_gp.sparse_w
    idx_train, w_train = sparse_w(u, x)
    idx_test, w_test = sparse_w(u, x_test)

    # Symbolic inputs.
    t_idx_train = T.imatrix()
    t_w_train = T.matrix()
    t_idx_test = T.imatrix()
    t_w_test = T.matrix()
    t_gp_params = T.vector()
    t_indep_noise = T.scalar()
    t_ys = T.matrix()
    t_y = T.vector()

    cov_vec = CovVec(u, kernel, symbolic_kernel)

    def linear_op(zs):
        # cov_vec applied to zs with all other symbolic inputs held fixed.
        return cov_vec(t_idx_train, t_w_train, t_idx_test, t_w_test,
                       t_gp_params, t_indep_noise, zs)

    n_lanczos_basis = 10
    batch_size = 10
    cov_zs = lanczos(linear_op, t_ys, n_lanczos_basis, batch_size)

    post_mean = PosteriorMean(u, kernel, symbolic_kernel)
    # `mu` is rebound here from the NumPy prior mean to a symbolic expression.
    mu = post_mean(t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
                   t_indep_noise, t_y)

    # The mean is broadcast along a new leading axis and added to cov_zs.
    gp_samples = mu.dimshuffle('x', 0) + cov_zs
    gp_samples_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys
    ], gp_samples)

    len_test = len(x_test)
    y_test = np.random.normal(size=(batch_size, len_test))
    # (a, b) is passed for t_gp_params and c for t_indep_noise.
    gdraws = gp_samples_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                           y_test)
    print gdraws.shape

    # Scalar test value: samples weighted element-wise by a projection
    # matrix and summed.
    t_random_proj = T.matrix()
    val = (gp_samples * t_random_proj).sum()
    val_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ], val)
    # Gradient with respect to the GP parameters and the noise term.
    grad_val_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ],
                                  theano.grad(
                                      val,
                                      wrt=[t_gp_params, t_indep_noise],
                                      consider_constant=[
                                          t_idx_train, t_w_train, t_idx_test,
                                          t_w_test, t_y, t_ys, t_random_proj
                                      ]))
    # Gradient with respect to the projection matrix only.
    grad_val_fn1 = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ],
                                   theano.grad(val,
                                               wrt=[t_random_proj],
                                               consider_constant=[
                                                   t_idx_train, t_w_train,
                                                   t_idx_test, t_w_test, t_y,
                                                   t_ys
                                               ]))

    random_proj = np.random.rand(batch_size, len_test)

    # Time ten calls of each gradient function and print the elapsed seconds.
    t1 = time.time()
    for _ in xrange(10):
        grad_val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                    y_test, random_proj)
    t2 = time.time()
    print t2 - t1

    t1 = time.time()
    for _ in xrange(10):
        grad_val_fn1(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                     y_test, random_proj)
    t2 = time.time()
    print t2 - t1
    # Early exit: the code below this line never runs.
    return

    # (unreachable) numerical check of grad_val_fn against val_fn
    n_test = 10
    for _ in xrange(n_test):
        random_proj = np.random.rand(batch_size, len_test)

        print 'test grad'
        print val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                     y_test, random_proj)
        print grad_val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                          y_test, random_proj)

        def val_fn1(x):
            # Adapter taking the packed vector [a, b, c].
            a, b, c = x
            return val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                          y_test, random_proj)

        def grad_val_fn1(x):
            # Adapter returning the gradient as one flat array [a, b, c].
            a, b, c = x
            [a, b], c = grad_val_fn(idx_train, w_train, idx_test, w_test,
                                    (a, b), c, y, y_test, random_proj)
            return np.array([a, b, c])

        print scipy.optimize.check_grad(val_fn1, grad_val_fn1,
                                        np.array([a, b, c]))
    return

    # (unreachable) plot every sample draw together with the training data
    import pylab as pl
    pl.figure()
    for each_sample in gdraws:
        pl.plot(x_test, each_sample, '-', c='b', alpha=.5)
    pl.plot(x, y, 'o', c='r')
    pl.show()
# distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution # Note that we negate the objective, since most optimizers assume a minimization problem surr = -TT.mean( dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var) # Get the list of trainable parameters. params = policy.get_params(trainable=True) grads = theano.grad(surr, params) f_train = theano.function(inputs=[observations_var, actions_var, returns_var], outputs=None, updates=adam(grads, params, learning_rate=learning_rate), allow_input_downcast=True) for _ in range(n_itr): paths = [] for _ in range(N): observations = [] actions = []
def _angle_between_deg(vec_a, vec_b):
    """Return the angle in degrees between two flat vectors.

    Returns NaN when either vector has zero norm (the angle is undefined).
    The cosine is clipped to [-1, 1] so that rounding error cannot push it
    outside the domain of ``arccos``.
    """
    norm_a = numpy.linalg.norm(vec_a)
    norm_b = numpy.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return float('nan')
    cosine = numpy.dot(vec_a, vec_b) / norm_a / norm_b
    return numpy.arccos(numpy.clip(cosine, -1.0, 1.0)) / numpy.pi * 180


def _mean_loss_and_acc(eval_fn, dataset):
    """Average the (loss, acc) pairs eval_fn yields over dataset.iterate(True)."""
    losses = []
    accs = []
    for batch in dataset.iterate(True):
        loss, acc = eval_fn(*batch)
        losses.append(loss)
        accs.append(acc)
    return numpy.mean(losses), numpy.mean(accs)


def adam_opt(model, train_set, valid_set, model_save_dir, minibatch=64,
             valid_period=1, total_period=0, disp_period=1, n_iters=1,
             lr=0.001, beta1=0.1, beta2=0.001, epsilon=1e-8,
             gamma=1 - 1e-8):
    """Train ``model`` with the Adam optimizer (ICLR 2015).

    Each of the ``n_iters`` outer iterations sums the gradients returned for
    every item of ``train_set.iterate(True)``, divides the sum by
    ``minibatch`` and applies a single Adam update to ``model.params``.

    Args:
        model: object exposing ``params`` (Theano shared variables),
            ``inputs``, ``costs`` (``costs[0]`` is differentiated; the whole
            list is evaluated during validation and unpacked as
            ``loss, acc``) and ``save_to_file(dir, index)``.
        train_set: object whose ``iterate(True)`` yields argument tuples for
            the compiled functions.
        valid_set: same interface as ``train_set``, or ``None`` to skip the
            periodic evaluation (checkpoints are still written).
        model_save_dir: string prefix; file names are appended to it
            directly, without inserting a path separator.
        minibatch: divisor applied to the accumulated gradients.
        valid_period: checkpoint/evaluate every this many iterations.
        total_period: offset added to the checkpoint index.
        disp_period: print progress every this many iterations.
        n_iters: number of parameter updates to perform.
        lr: initial learning rate.
        beta1: weight given to the *new* gradient in the first-moment
            average (``m_t = beta1_t * g + (1 - beta1_t) * m``).
        beta2: weight given to the new squared gradient in the
            second-moment average.
        epsilon: constant added to ``sqrt(v_t)`` in the denominator.
        gamma: decay factor used to compute ``beta1_t`` from ``beta1``.

    Returns:
        None. A ``KeyboardInterrupt`` during training is caught and reported
        with a message instead of propagating.
    """
    # time.clock() was removed in Python 3.8; use perf_counter when it exists
    # and only fall back to clock() on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # Record the initial learning rate on disk.
    with open(model_save_dir + 'lr.txt', 'w') as lr_file:
        lr_file.write(str(lr))
    lr = theano.shared(numpy.array(lr).astype(theano.config.floatX))

    updates = []
    all_grads = theano.grad(model.costs[0], model.params)
    # The step counter starts at 1, so the first update already uses step_t = 2.
    step = theano.shared(numpy.float32(1))
    step_t = step + 1.
    fix1 = 1. - (1. - beta1)**step_t
    fix2 = 1. - (1. - beta2)**step_t
    beta1_t = 1 - (1 - beta1) * gamma**(step_t - 1)
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(model.params, all_grads):
        p_shape = p.get_value().shape
        m = theano.shared(numpy.zeros(p_shape, dtype=theano.config.floatX))
        v = theano.shared(numpy.zeros(p_shape, dtype=theano.config.floatX))
        m_t = (beta1_t * g) + ((1. - beta1_t) * m)
        v_t = (beta2 * g**2) + ((1. - beta2) * v)
        g_t = m_t / (T.sqrt(v_t) + epsilon)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((step, step_t))

    # Build a new list instead of appending to all_grads through an alias.
    grad_inputs = list(all_grads)
    grad_and_cost = grad_inputs + [model.costs[0]]
    train_grad_f = theano.function(model.inputs, grad_and_cost,
                                   on_unused_input='warn')
    # The gradient expressions themselves are the inputs of the update
    # function: averaged gradient values are fed back in their place.
    train_update_params_f = theano.function(grad_inputs, None,
                                            updates=updates)
    valid_f = None
    if valid_set is not None:
        valid_f = theano.function(model.inputs, model.costs,
                                  on_unused_input='warn')

    # create log file
    with open(model_save_dir + 'log.txt', 'w') as log_file:
        log_file.write('adam_optimizer\n')
        log_file.write('lr=%f, beta1=%f, beta2=%f, epsilon=%f, gamma=%f\n' %
                       (lr.get_value(), beta1, beta2, epsilon, gamma))

    print('... training with Adam optimizer')
    train_cost = []
    p_last = None
    delta_last = None
    t0 = timer()
    try:
        for u in range(n_iters):
            if u % 10 == 0:
                # Refresh lr from disk so it can be changed during training.
                # NOTE(review): the initial value is written to 'lr.txt' but
                # re-read from '_lr.txt' -- confirm whether the different
                # name is an intentional override file or a typo.
                try:
                    with open(model_save_dir + '_lr.txt', 'r') as lr_file:
                        lr.set_value(float(lr_file.readline().rstrip()))
                except IOError:
                    # The file is optional; keep the current learning rate.
                    pass

            # Accumulators shaped like each parameter's value.
            grads = [
                numpy.zeros(p.get_value(borrow=True).shape,
                            dtype=theano.config.floatX)
                for p in model.params
            ]
            mb_cost = []
            for batch in train_set.iterate(True):
                tmp = train_grad_f(*batch)
                new_grads = tmp[0:-1]
                mb_cost.append(tmp[-1])
                grads = [g1 + g2 for g1, g2 in zip(grads, new_grads)]
            grads = [g / numpy.array(minibatch) for g in grads]
            train_update_params_f(*grads)
            train_cost.append(numpy.mean(mb_cost))

            # output some information
            if u % disp_period == 0 and u > 0:
                p_now = numpy.concatenate(
                    [p.get_value().flatten() for p in model.params])
                # The reference point is reset to zero during the first few
                # display periods (and whenever it has not been set yet).
                if p_last is None or u < 4 * disp_period:
                    p_last = numpy.zeros_like(p_now)
                    delta_last = numpy.zeros_like(p_now)
                delta_now = p_now - p_last
                angle = _angle_between_deg(delta_now, delta_last)
                p_last = p_now
                delta_last = delta_now
                t1 = timer()
                print('period=%d, update=%d, mb_cost=[%.4f], |delta|=[%.2e], angle=[%.1f], lr=[%.6f], t=[%.2f]sec' %
                      (u // valid_period, u, numpy.mean(train_cost),
                       numpy.mean(abs(delta_now[0:10000])), angle,
                       lr.get_value(), (t1 - t0)))
                t0 = timer()
                train_cost = []

            if u % valid_period == 0 and u > 0:
                # Floor division keeps the checkpoint index an integer on
                # Python 3 as well (u is a multiple of valid_period here).
                model.save_to_file(model_save_dir,
                                   total_period + u // valid_period)
                # Without a validation set there is no compiled evaluation
                # function; only the checkpoint is written.
                if valid_f is not None:
                    valid_loss, valid_acc = _mean_loss_and_acc(valid_f,
                                                               valid_set)
                    train_loss, train_acc = _mean_loss_and_acc(valid_f,
                                                               train_set)
                    output_info = 'period=%i, valid loss=[%.4f], valid acc=[%.4f], train loss=[%.4f], train acc=[%.4f]' % \
                        (u // valid_period, valid_loss, valid_acc,
                         train_loss, train_acc)
                    print(output_info)
                    with open(model_save_dir + 'log.txt', 'a') as log_file:
                        log_file.write(output_info + '\n')
    except KeyboardInterrupt:
        print('Training interrupted.')