def __init__(self, random_state=None, low=0.0, high=1.0): super(Uniform, self).__init__(low=low, high=high, random_state=random_state, optimizer=None) # pdf self.pdf_ = T.switch( T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)), 0., 1. / (self.high - self.low)).ravel() self.make_(self.pdf_, "pdf") # -log pdf self.nnlf_ = T.switch( T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)), np.inf, T.log(self.high - self.low)).ravel() self.make_(self.nnlf_, "nnlf") # cdf self.cdf_ = T.switch( T.lt(self.X, self.low), 0., T.switch( T.lt(self.X, self.high), (self.X - self.low) / (self.high - self.low), 1.)).ravel() self.make_(self.cdf_, "cdf") # ppf self.ppf_ = self.p * (self.high - self.low) + self.low self.make_(self.ppf_, "ppf", args=[self.p])
def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                epsilon=1e-8, grads=None):
    """Adam updates with global-norm gradient clipping.

    :param loss: scalar cost (ignored when `grads` is supplied)
    :param lr: learning rate
    :param max_norm: cap on the global L2 gradient norm
    :param beta1, beta2: Adam moment decay rates
    :param epsilon: numerical-stability constant
    :param grads: optional precomputed gradients w.r.t. self.trainables
    :return: (norm, grads, updates) — symbolic gradient norm, clipped
        gradients, and (shared, new_value) update pairs.
    """
    # Gradients
    if grads is None:
        grads = tensor.grad(loss, self.trainables)

    # Clipping
    norm = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
    m = theanotools.clipping_multiplier(norm, max_norm)
    grads = [m*g for g in grads]

    # Safeguard against numerical instability: zero all gradients when the
    # norm is NaN/Inf, negative, or absurdly large.
    new_cond = tensor.or_(tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
                          tensor.or_(norm < 0, norm > 1e10))
    grads = [tensor.switch(new_cond, np.float32(0), g) for g in grads]

    # Safeguard against numerical instability
    #cond = tensor.or_(norm < 0, tensor.or_(tensor.isnan(norm), tensor.isinf(norm)))
    #grads = [tensor.switch(cond, np.float32(0), g) for g in grads]

    # New values: standard Adam with bias-corrected learning rate
    t = self.time + 1
    lr_t = lr*tensor.sqrt(1. - beta2**t)/(1. - beta1**t)
    means_t = [beta1*m + (1. - beta1)*g for g, m in zip(grads, self.means)]
    vars_t = [beta2*v + (1. - beta2)*tensor.sqr(g)
              for g, v in zip(grads, self.vars)]
    steps = [lr_t*m_t/(tensor.sqrt(v_t) + epsilon)
             for m_t, v_t in zip(means_t, vars_t)]

    # Updates
    updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
    updates += [(m, m_t) for m, m_t in zip(self.means, means_t)]
    updates += [(v, v_t) for v, v_t in zip(self.vars, vars_t)]
    updates += [(self.time, t)]
    return norm, grads, updates
def get_train(U_Ot, U_R, lenW, n_facts):
    """Build the training function for a memory-network style model.

    Feature maps phi_* embed word/message indices into a 3*lenW + 3 vector;
    s_Ot / sR score memory selection and response selection with the
    bilinear forms U_Ot / U_R.  Margin losses over all candidate memories
    and vocabulary entries are summed and minimized by plain SGD.
    NOTE(review): `alpha` (the learning rate) comes from an enclosing scope.
    """
    def phi_x1(x_t, L):
        # Input slot 1: embedding in the first third of the feature vector.
        return T.concatenate([L[x_t].reshape((-1,)), zeros((2*lenW,)), zeros((3,))], axis=0)
    def phi_x2(x_t, L):
        # Input slot 2: embedding in the middle third.
        return T.concatenate([zeros((lenW,)), L[x_t].reshape((-1,)), zeros((lenW,)), zeros((3,))], axis=0)
    def phi_y(x_t, L):
        # Candidate slot: embedding in the last third.
        return T.concatenate([zeros((2*lenW,)), L[x_t].reshape((-1,)), zeros((3,))], axis=0)
    def phi_t(x_t, y_t, yp_t, L):
        # The final 3 features encode the pairwise ordering of the indices.
        return T.concatenate([zeros(3*lenW,), T.stack(T.switch(T.lt(x_t,y_t), 1, 0), T.switch(T.lt(x_t,yp_t), 1, 0), T.switch(T.lt(y_t,yp_t), 1, 0))], axis=0)
    def s_Ot(xs, y_t, yp_t, L):
        # Memory scoring: sum over inputs of phi_x' U' U (phi_y - phi_y' + phi_t).
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_Ot.T), T.dot(U_Ot, (phi_y(y_t, L) - phi_y(yp_t, L) + phi_t(x_t, y_t, yp_t, L)))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()
    def sR(xs, y_t, L, V):
        # Response scoring against vocabulary embedding matrix V.
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_R.T), T.dot(U_R, phi_y(y_t, V))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()

    x_t = T.iscalar('x_t')
    m = [x_t] + [T.iscalar('m_o%d' % i) for i in xrange(n_facts)]
    f = [T.iscalar('f%d_t' % i) for i in xrange(n_facts)]
    r_t = T.iscalar('r_t')
    gamma = T.scalar('gamma')
    L = T.fmatrix('L') # list of messages
    V = T.fmatrix('V') # vocab
    r_args = T.stack(*m)

    cost_arr = [0] * 2 * (len(m)-1)
    updates_arr = [0] * 2 * (len(m)-1)
    for i in xrange(len(m)-1):
        # NOTE(review): T.eq(t, T.shape(L)-1) compares t against the whole
        # shape vector; T.shape(L)[0]-1 was probably intended — confirm.
        # NOTE(review): the lambdas close over the loop variable i (late
        # binding); they are consumed immediately by scan, so this works.
        cost_arr[2*i], updates_arr[2*i] = theano.scan(
            lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma - s_Ot(T.stack(*m[:i+1]), f[i], t, L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])
        cost_arr[2*i+1], updates_arr[2*i+1] = theano.scan(
            lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma + s_Ot(T.stack(*m[:i+1]), t, f[i], L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])

    # Margin loss over every vocabulary entry except the true response r_t.
    cost1, u1 = theano.scan(
        lambda r_bar, t: T.switch(T.eq(r_t, t), 0, T.largest(gamma - sR(r_args, r_t, L, V) + sR(r_args, t, L, V), 0)),
        sequences=[V, T.arange(T.shape(V)[0])])

    cost = cost1.sum()
    for c in cost_arr:
        cost += c.sum()

    g_uo, g_ur = T.grad(cost, [U_Ot, U_R])

    train = theano.function(
        inputs=[r_t, gamma, L, V] + m + f,
        outputs=[cost],
        updates=[(U_Ot, U_Ot-alpha*g_uo), (U_R, U_R-alpha*g_ur)])
    return train
def compute_updates(self, training_cost, params):
    """Compute parameter updates for `training_cost` w.r.t. `params`.

    Gradients are clipped to a global L2 norm of `self.cutoff`; when the
    norm is NaN/Inf each gradient is replaced by a 0.1*param decay pull.
    The clipped gradients are fed to the updater named by `self.updater`.

    :raises Exception: for 'sgd' (not implemented) and unknown updaters.
    """
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    # (consistency fix: use the `numpy` alias throughout instead of
    # mixing `numpy.float32` and `np.float32`)
    c = numpy.float32(self.cutoff)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, numpy.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        # Non-finite global norm: decay the parameter rather than apply
        # a broken gradient.
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p,
                                       g * normalization)))

    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates
def exe(self, mainloop):
    """Normalize mainloop gradients by batch size and clip by global norm.

    .. todo::

        WRITEME
    """
    grads = mainloop.grads
    """
    for p, g in grads.items():
        grads[p] = g / self.batch_size

    g_norm = 0.
    for g in grads.values():
        g_norm += (g**2).sum()
    """
    g_norm = 0.
    # Average each gradient over the batch and accumulate the squared
    # global norm in the same pass.
    for p, g in grads.items():
        g /= self.batch_size
        grads[p] = g
        g_norm += (g**2).sum()
    # NaN/Inf norm -> replace every gradient with a 0.1*param decay pull.
    not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
    g_norm = T.sqrt(g_norm)
    # Rescale so the global norm never exceeds self.scaler.
    scaler = self.scaler / T.maximum(self.scaler, g_norm)
    for p, g in grads.items():
        grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
    mainloop.grads = grads
def minimize(self, loss, momentum, rescale):
    """Build RMSProp-with-momentum update pairs for `loss`.

    :param loss: scalar cost to minimize
    :param momentum: momentum coefficient
    :param rescale: cap on the global L2 gradient norm
    :return: list of (shared_variable, new_value) update pairs covering
        the running squares/averages, momentum memories and parameters.
    """
    super(RMSPropOptimizer, self).minimize(loss)
    grads = self.gradparams
    # BUG FIX: the original computed T.sqrt(sum of squares) here and then
    # took T.sqrt of the result again below, so clipping compared `rescale`
    # against sqrt(norm) instead of the norm.  Keep the sum of squares and
    # take a single square root.
    grad_sqr_sum = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_sqr_sum), T.isinf(grad_sqr_sum))
    grad_norm = T.sqrt(grad_sqr_sum)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    params = self.params
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite gradients decay the parameter instead.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # Centered RMS of the gradient, floored for numerical stability.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        # Nesterov-style lookahead: `memory` holds the velocity; `update2`
        # is the actual parameter step.
        update = momentum * memory - self.lr * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * self.lr * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def mcmc(ll, *frvs):
    """One Hamiltonian Monte Carlo proposal step (scan body).

    `observations`, `free_RVs`, `free_RVs_prop`, `epsilon`, `U` and
    `full_log_likelihood` come from the enclosing scope.  Returns the
    accepted (-loglik, samples) and a scan `until(accept)` stop rule.
    """
    full_observations = dict(observations)
    full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))

    loglik = -full_log_likelihood(full_observations)

    proposals = free_RVs_prop
    # Hamiltonian = kinetic energy of the momenta + potential (-loglik).
    H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

    # -- this should be an inner loop
    g = []
    g.append(tensor.grad(loglik, frvs))

    # Leapfrog: half momentum step, full position step, half momentum step.
    proposals = [(p - epsilon*gg[0]/2.) for p, gg in zip(proposals, g)]

    rvsp = [(rvs + epsilon*rvp) for rvs, rvp in zip(frvs, proposals)]

    full_observations = dict(observations)
    full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
    new_loglik = -full_log_likelihood(full_observations)

    gnew = []
    gnew.append(tensor.grad(new_loglik, rvsp))
    proposals = [(p - epsilon*gn[0]/2.) for p, gn in zip(proposals, gnew)]
    # --

    Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

    dH = Hnew - H
    # Metropolis accept/reject on the change in Hamiltonian.
    accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

    return [tensor.switch(accept, -new_loglik, ll)] + \
        [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
        {}, theano.scan_module.until(accept)
def get_output_for(self, input, deterministic=False, **kwargs):
    """Return a binary dropout mask (this layer outputs the mask itself).

    The mask keeps units with probability ``self.retain``, is OR-ed with
    ``self.previous_mask`` (previously-kept units stay kept), and is
    broadcast over ``self.shared_axes``.  At test time (`deterministic`)
    or when ``p == 0`` an all-ones mask is returned.
    """
    if deterministic or self.p == 0:
        return T.ones_like(self.retain, dtype=input.dtype)
    else:
        # Using theano constant to prevent upcasting
        # one = T.constant(1)
        # retain_prob = one - self.p
        # if self.rescale:
        #     input /= retain_prob

        # use nonsymbolic shape for dropout mask if possible
        mask_shape = self.input_shape
        if any(s is None for s in mask_shape):
            mask_shape = input.shape

        # apply dropout, respecting shared axes
        if self.shared_axes:
            # Normalize negative axis indices before comparing.
            shared_axes = tuple(a if a >= 0 else a + input.ndim
                                for a in self.shared_axes)
            mask_shape = tuple(1 if a in shared_axes else s
                               for a, s in enumerate(mask_shape))
        mask = self._srng.binomial(mask_shape, p=self.retain,
                                   dtype=input.dtype)
        # Keep any unit that was kept by the previous mask.
        mask = T.or_(mask, self.previous_mask)
        if self.shared_axes:
            bcast = tuple(bool(s == 1) for s in mask_shape)
            mask = T.patternbroadcast(mask, bcast)
        return mask
def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4,
                           alpha=0.9, epsilon=1e-4, chi=0.95):
    """
    Alex Graves' RMSProp [1]_.

    .. math ::
        n_{i} &= \chi * n_{i-1} + (1 - \chi) * grad^{2}\\
        g_{i} &= \chi * g_{i-1} + (1 - \chi) * grad\\
        \Delta_{i} &= \alpha * \Delta_{i-1} - learning\_rate * grad /
            sqrt(n_{i} - g_{i}^{2} + \epsilon)\\
        w_{i} &= w_{i-1} + \Delta_{i}

    References
    ----------
    .. [1] Graves, Alex.
        "Generating Sequences With Recurrent Neural Networks", p.23
        arXiv:1308.0850
    """
    updates = []
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite global norm: decay the parameter instead of stepping.
        grad = T.switch(not_finite, 0.1 * param, grad)
        old_square = self.running_square_[n]
        old_avg = self.running_avg_[n]
        old_memory = self.memory_[n]
        # Running estimates of the 2nd and 1st gradient moments.
        new_square = chi * old_square + (1. - chi) * grad ** 2
        new_avg = chi * old_avg + (1. - chi) * grad
        # Step scaled by the centered RMS (variance estimate).
        new_memory = alpha * old_memory - learning_rate * grad / T.sqrt(new_square - \
            new_avg ** 2 + epsilon)
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((old_memory, new_memory))
        updates.append((param, param + new_memory))
    return updates
def exe(self, mainloop):
    """Scale mainloop gradients by batch size; optionally clip and
    NaN-guard them (controlled by ``self.check_nan``).

    .. todo::

        WRITEME
    """
    grads = mainloop.grads
    g_norm = 0.
    # Average over the batch and accumulate the squared global norm.
    for p, g in grads.items():
        g /= T.cast(self.batch_size, dtype=theano.config.floatX)
        grads[p] = g
        g_norm += (g**2).sum()
    if self.check_nan:
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
    g_norm = T.sqrt(g_norm)
    # Rescale so the global norm never exceeds self.scaler.
    scaler = self.scaler / T.maximum(self.scaler, g_norm)
    if self.check_nan:
        # NaN/Inf norm -> decay parameters instead of applying gradients.
        for p, g in grads.items():
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
    else:
        for p, g in grads.items():
            grads[p] = g * scaler
    mainloop.grads = grads
def compute_cost_log_in_parallel(original_rnn_outputs, labels, func, x_ends, y_ends):
    """Batched forward-pass sequence cost in log space (CTC-style lattice).

    `func` combines log-probabilities over transition choices (presumably
    log-sum-exp for the full forward cost or max for Viterbi — confirm
    against callers).  `shift_matrix` / `log_shift_matrix` come from the
    enclosing module.
    """
    # Skip-transitions are forbidden (log(0) = -inf) where the label is
    # blank or repeats the label two positions back.
    mask = T.log(1 - T.or_(T.eq(labels, T.zeros_like(labels)),
                           T.eq(labels, shift_matrix(labels, 2))))

    # Start with all mass on the first label state.
    initial_state = T.log(T.zeros_like(labels))
    initial_state = T.set_subtensor(initial_state[:, 0], 0)

    def select_probabilities(rnn_outputs, label):
        return rnn_outputs[:, label]

    rnn_outputs, _ = theano.map(select_probabilities, [original_rnn_outputs, labels])
    rnn_outputs = T.log(rnn_outputs.dimshuffle((1, 0, 2)))

    def forward_step(probabilities, last_probabilities):
        # Three choices per step: stay, advance one, advance two (masked).
        all_forward_probabilities = T.stack(
            last_probabilities + probabilities,
            log_shift_matrix(last_probabilities, 1) + probabilities,
            log_shift_matrix(last_probabilities, 2) + probabilities + mask,
        )
        result = func(all_forward_probabilities, 0)
        return result

    forward_probabilities, _ = theano.scan(fn=forward_step,
                                           sequences=rnn_outputs,
                                           outputs_info=initial_state)
    forward_probabilities = forward_probabilities.dimshuffle((1, 0, 2))

    def compute_cost(forward_probabilities, x_end, y_end):
        # Negative log-mass of finishing in either of the last two states
        # at the final (unpadded) time step.
        return -func(forward_probabilities[x_end-1, y_end-2:y_end])

    return theano.map(compute_cost, [forward_probabilities, x_ends, y_ends])[0]
def get_gradients(self, model, data, ** kwargs):
    """Gradients of the cost w.r.t. model params, optionally norm-clipped.

    :return: (OrderedDict param -> gradient, empty OrderedDict of updates)
    """
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))
    if self.gradient_clipping:
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        # From here `norm_gs` is reused as the rescaling factor that caps
        # the global norm at self.max_magnitude.
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs,
                           1.)
        for param, grad in gradients.items():
            # Non-finite norm: decay the parameter instead of stepping.
            gradients[param] = T.switch(not_finite,
                                        .1 * param,
                                        grad * norm_gs)
    updates = OrderedDict()
    return gradients, updates
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping.

    Note the decay convention: b1/b2 here play the role of (1 - beta1) /
    (1 - beta2) in the usual Adam formulation.

    :return: (updates, norm) — update pairs and the pre-clip gradient norm
        value produced by `norm_gs` (squared; sqrt is taken below).
    """
    updates = []
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    # Rescale factor that caps the global norm at max_magnitude.
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    # Bias-correction terms (note the 1-b convention above).
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        # Non-finite gradients decay the parameter instead.
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Sample a normal(avg, std) truncated to [lbound, ubound] via the
    inverse-CDF method.

    Alternative Method:
    sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u)
    """
    def Phi(x):
        # Gaussian CDF with mean `avg` and std `std`.
        erfarg = (x - avg) / (std * SQRT2)
        rval = 0.5 * (1. + T.erf(erfarg))
        return rval.astype(dtype)

    def Phi_inv(y, eps=3e-8):
        """ eps was calibrated for cublas.erfinv using float32 """
        temp = 2. * y - 1.
        # Keep erfinv's argument strictly inside (-1, 1).
        erfinv_input = T.clip(temp, -1+eps, 1-eps)
        rval = avg + std * SQRT2 * T.erfinv(erfinv_input)
        return rval.astype(dtype)

    # center lower and upper bounds based on mean
    u = theano_rng.uniform(size=size, dtype=dtype)

    # Inverse CDF method. When method becomes numerically unstable, we simply
    # return the bounds based on whether avg < lbound, or ubound < avg.
    cdf_range = Phi(ubound) - Phi(lbound)
    sample = T.switch(
        T.or_(
            T.lt(cdf_range, 3e-8),
            T.gt(cdf_range, 1-3e-8)),
        T.switch(
            T.lt(avg, lbound),
            lbound,
            ubound),
        Phi_inv(Phi(lbound) + u * cdf_range))
    return sample
def truncated_normal(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Draw samples from normal(avg, std) truncated to [lbound, ubound].

    Uses the inverse-CDF method:
        sample = cdf_inv(cdf(lbound) + u * (cdf(ubound) - cdf(lbound)))
    When numerical issues push a sample outside the bounds, the bound
    nearest to the mean's side is returned instead:
      - avg >= ubound -> ubound
      - otherwise     -> lbound
    """
    scaled_std = std * SQRT2

    def cdf(x):
        # Gaussian CDF with mean `avg` and std `std`.
        return (0.5 * (1. + T.erf((x - avg) / scaled_std))).astype(dtype)

    def cdf_inv(p):
        # Inverse Gaussian CDF; clipping keeps erfinv inside (-1, 1).
        clipped = T.clip(2. * p - 1., -1. + 1e-6, 1. - 1e-6)
        return (avg + scaled_std * T.erfinv(clipped)).astype(dtype)

    # center lower and upper bounds based on mean
    u = theano_rng.uniform(size=size, dtype=dtype)
    low_cdf = cdf(lbound)
    cdf_range = cdf(ubound) - low_cdf
    sample = cdf_inv(low_cdf + u * cdf_range)

    out_of_bounds = T.or_(sample < lbound, sample > ubound)
    fallback = T.switch(avg >= ubound, ubound, lbound)
    return T.switch(out_of_bounds, fallback, sample)
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping (variant of `adamgc` with a
    different b2 default).

    b1/b2 follow the (1 - beta) convention of `adamgc`.

    :return: (updates, norm) — update pairs and the pre-clip gradient norm
        value produced by `norm_gs` (squared; sqrt is taken below).
    """
    updates = []
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    # Rescale factor that caps the global norm at max_magnitude.
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    # Bias-correction terms.
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        # Non-finite gradients decay the parameter instead.
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  #*p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
def theano_metrics(y_pred, y_true, n_classes, void_labels): """ Returns the intersection I and union U (to compute the jaccard I/U) and the accuracy. :param y_pred: tensor of predictions. shape (b*0*1, c) with c = n_classes :param y_true: groundtruth, shape (b,0,1) or (b,c,0,1) with c=1 :param n_classes: int :param void_labels: list of indexes of void labels :return: return tensors I and U of size (n_classes), and scalar acc """ # Put y_pred and y_true under the same shape y_true = T.flatten(y_true) y_pred = T.argmax(y_pred, axis=1) # We use not_void in case the prediction falls in the void class of the groundtruth for i in range(len(void_labels)): if i == 0: not_void = T.neq(y_true, void_labels[i]) else: not_void = not_void * T.neq(y_true, void_labels[i]) I = T.zeros(n_classes) U = T.zeros(n_classes) for i in range(n_classes): y_true_i = T.eq(y_true, i) y_pred_i = T.eq(y_pred, i) I = T.set_subtensor(I[i], T.sum(y_true_i * y_pred_i)) U = T.set_subtensor(U[i], T.sum(T.or_(y_true_i, y_pred_i) * not_void)) accuracy = T.sum(I) / T.sum(not_void) return I, U, accuracy
def updates(self, cost, params, learning_rate=0.1, momentum=0.95, rescale=5.):
    """RMSProp-with-momentum update pairs for `params` minimizing `cost`.

    Gradients are rescaled so their global L2 norm is at most `rescale`;
    non-finite gradients are replaced by a 0.1*param decay pull.

    :return: list of (shared_variable, new_value) pairs covering the
        running squares/averages, momentum memories and parameters.
    """
    grads = T.grad(cost, params)
    # BUG FIX: the original took T.sqrt of the summed squares and then
    # T.sqrt again, so clipping compared `rescale` against sqrt(norm)
    # instead of the norm itself.
    grad_sqr_sum = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_sqr_sum), T.isinf(grad_sqr_sum))
    grad_norm = T.sqrt(grad_sqr_sum)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite gradients decay the parameter instead.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # Centered RMS, floored to avoid division blow-ups.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        # Nesterov-style lookahead step.
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def abs(x, axis=0):
    """
    Takes the matrix/vector x and finds the absolute along the axis.

    NOTE: this shadows the builtin ``abs`` and actually computes an
    L2 norm (sqrt of summed squares) along ``axis``, not an elementwise
    absolute value.

    :param x: T.matrix
    :return: Absolute along the given axis. T.vector
    """
    # Guard: only 1-D or 2-D inputs are supported.
    checked = assert_op(x, T.or_(T.eq(x.ndim, 2), T.eq(x.ndim, 1)))
    summed_squares = T.sum(T.sqr(checked), axis)
    return T.sqrt(summed_squares)
def weighted_thresholded_binary_cross_entropy(preds, targets, imbalance_factor, lower_threshold):
    """Weighted binary cross-entropy with confident negatives masked out.

    A loss term is kept only where the prediction exceeds
    ``lower_threshold`` or the target is positive; predictions at or
    below the threshold with no target contribute nothing.
    """
    raw_loss = weighted_binary_cross_entropy(
        preds, targets, imbalance_factor=imbalance_factor,)
    # preds that are below 0.2 where there is no target, are ignored
    keep_mask = T.or_(T.gt(preds, lower_threshold), T.eq(targets, 1))
    return raw_loss * keep_mask
def find_right_bound(prev_func_output, step, maxstep):
    """Scan step that doubles `step` while f keeps decreasing (bracketing).

    Closes over `f`; relies on scan's `until` rule to stop once the
    function starts increasing or the step exceeds `maxstep`.
    """
    func_output = f(step)
    is_output_decrease = T.gt(prev_func_output, func_output)
    # Keep doubling while the function decreases, capped at maxstep.
    step = ifelse(is_output_decrease, T.minimum(2.0 * step, maxstep), step)
    is_output_increse = T.lt(prev_func_output, func_output)
    stoprule = theano.scan_module.until(T.or_(is_output_increse, step > maxstep))
    return [func_output, step], stoprule
def filter_stop_case(d, filter_vector=None):
    """Zero out rows of ``d`` that exactly match the module-level
    ``stop_case`` row.

    :param d: matrix whose rows are compared against ``stop_case``
    :param filter_vector: optional per-row flags OR-ed into the match mask
        (rows already filtered stay filtered)
    :return: (filtered ``d`` cast to float32, per-row match mask ``f``)
    """
    eq = T.eq(d, stop_case)
    # A row matches only if every element matches.
    f = T.prod(eq, axis=1)
    # BUG FIX: `if filter_vector:` asked for the truth value of a symbolic
    # tensor (undefined for Theano variables); the intent is a None-check.
    if filter_vector is not None:
        f = T.or_(filter_vector, f)
    # increment new overlapped history
    f_inv = T.abs_(f - 1)
    d_f = d.T * f_inv.T
    d_f = T.cast(d_f.T, 'float32')
    return d_f, f
def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01,
              gamma1=-1., poos=0.23, eps=1e-6):
    '''Expects a binary class matrix instead of a vector of scalar classes.

    Semi-supervised objective: cross-entropy on in-set (labeled) samples,
    plus optional entropy (alpha), batch-prior (beta) and out-of-set
    binary-classifier (gamma/gamma1) terms.  Label 0 and labels > Q are
    treated as out-of-set or unlabeled.
    '''
    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)

    # scale preds so that the class probas of each sample sum to 1
    y_pred += eps
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set

    # we want to reduce cross entrophy of labeled data
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(
        y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    if alpha:
        # Entropy regularizer on the unlabeled/out-of-set samples.
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per labeled example
        cost0 += alpha*cost1

    # we want to increase the average entrophy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, np.float32(1) - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(
            y_pred_avg0.reshape((1, -1)), P - dbeta)[0]  # [None,:]
        # ignore cost2 if no samples
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)
        cost0 += beta*cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:, 0], eps, np.float32(1) - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos*y0, T.log(y_pred0)) \
                    - T.dot(np.float32(1) - poos*y0.T,
                            T.log(np.float32(1) - y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma*cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos*y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos)*y0,
                            T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost31 = - T.dot(y1, T.log(np.float32(1) - y_pred0))
            # BUG FIX: the original divided cost3 by y1sum a second time
            # and left cost31 unnormalized; cost31 is the in-set term and
            # must be averaged over y1sum.
            cost31 /= y1sum
            cost0 += gamma*cost3 + gamma1*cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos*y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos)*y0,
                            T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost0 += gamma*cost3
    return cost0
def dtw(i, q_p, b_p, Q, D, inf):
    """One dynamic-time-warping recursion step (scan body).

    Chooses the cheaper of staying in the current state (`q_p`) or moving
    forward (`Q[i-1]`), adds the local distance `D[i]`, and applies a
    large penalty on the first position of a non-initial segment.
    `n0` and `big` come from the enclosing scope.
    """
    i0 = T.eq(i, 0)
    # inf = T.cast(1e10,'float32') * T.cast(T.switch(T.eq(self.n,0), T.switch(T.eq(i,0), 0, 1), 1), 'float32')
    # BUG FIX: the original used T.neg(n0), which is arithmetic negation
    # (-n0), not logical NOT — T.and_(-n0, i0) reduces to (n0 AND i0),
    # the opposite of the apparent intent ((NOT n0) AND i0).
    penalty = T.switch(T.and_(T.eq(n0, 0), i0), big, T.constant(0.0, 'float32'))
    loop = T.constant(0.0, 'float32') + q_p
    forward = T.constant(0.0, 'float32') + T.switch(T.or_(n0, i0), 0, Q[i - 1])
    opt = T.stack([loop, forward])
    k_out = T.cast(T.argmin(opt, axis=0), 'int32')
    return opt[k_out, T.arange(opt.shape[1])] + D[i] + penalty, k_out
def jaccard_similarity(y_true, y_predicted):
    """
    y_true: tensor ({1, 0})
    y_predicted: tensor ({1, 0})

    Jaccard similarity = |intersection| / |union| over the last axis.
    note - we round predicted because float probabilities would not work

    BUG FIX: the original summed T.neq(y_true, y_predicted) over the
    union, which is the Jaccard *distance* (1 - similarity) and
    contradicts the function name; callers wanting a distance should use
    1 - jaccard_similarity(...).
    """
    y_predicted = T.round(y_predicted).astype(theano.config.floatX)
    either_nonzero = T.or_(T.neq(y_true, 0), T.neq(y_predicted, 0))
    both_nonzero = T.and_(T.neq(y_true, 0), T.neq(y_predicted, 0))
    return both_nonzero.sum(axis=-1, dtype=theano.config.floatX) \
        / either_nonzero.sum(axis=-1, dtype=theano.config.floatX)
def step_clipping(params, gparams, scale=1.0):
    """Rescale gradients whose global L2 norm exceeds ``scale``.

    When the norm is NaN/Inf, each gradient is replaced with
    0.1 * param — a gentle decay pull instead of a broken step.
    """
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    # Below the threshold the gradients pass through unchanged.
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)

    params_clipping = []
    for param, gparam in izip(params, gparams):
        rescaled = gparam * multiplier
        params_clipping.append(T.switch(notfinite, param * 0.1, rescaled))
    return params_clipping
def __init__(self, low=0.0, high=1.0):
    """Constructor.

    Parameters
    ----------
    * `low` [float]:
        The lower bound.

    * `high` [float]:
        The upper bound
    """
    super(Uniform, self).__init__(low=low, high=high)

    # pdf: 1/(high-low) inside [low, high), 0 outside
    self.pdf_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        0.,
        1. / (self.high - self.low)).ravel()
    self._make(self.pdf_, "pdf")

    # -log pdf: log(high-low) on the support, +inf outside
    self.nll_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        np.inf,
        T.log(self.high - self.low)).ravel()
    self._make(self.nll_, "nll")

    # cdf: 0 below low, linear ramp on [low, high), 1 above
    self.cdf_ = T.switch(
        T.lt(self.X, self.low), 0.,
        T.switch(
            T.lt(self.X, self.high),
            (self.X - self.low) / (self.high - self.low),
            1.)).ravel()
    self._make(self.cdf_, "cdf")

    # ppf: inverse cdf, linear interpolation from low to high
    self.ppf_ = self.p * (self.high - self.low) + self.low
    self._make(self.ppf_, "ppf", args=[self.p])
def _step(
    i, pkm1, pkm2, qkm1, qkm2,
    k1, k2, k3, k4, k5, k6, k7, k8, r
):
    """One double-iteration of a continued-fraction evaluation (scan body).

    Maintains the two most recent numerators (pkm1, pkm2), denominators
    (qkm1, qkm2), the eight recurrence coefficients k1..k8 and the current
    convergent `r`.  Stops (scan `until`) once `r` changes by less than a
    relative THRESH.  `x`, `zero`, `one`, `two`, `k26update`, `BIG`,
    `BIGINV` and `THRESH` come from the enclosing scope.
    """
    # Even term of the continued fraction.
    xk = -(x * k1 * k2) / (k3 * k4)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk

    # Odd term.
    xk = (x * k5 * k6) / (k7 * k8)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk

    old_r = r
    # Keep the previous convergent when the denominator hit zero.
    r = tt.switch(tt.eq(qk, zero), r, pk/qk)

    k1 += one
    k2 += k26update
    k3 += two
    k4 += two
    k5 += one
    k6 -= k26update
    k7 += two
    k8 += two

    # Rescale the partial terms to avoid float overflow/underflow.
    big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
    biginv_cond = tt.or_(
        tt.lt(tt.abs_(qk), BIGINV),
        tt.lt(tt.abs_(pk), BIGINV)
    )

    pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
    pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
    qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
    qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)

    pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
    pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
    qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
    qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)

    return ((pkm1, pkm2, qkm1, qkm2,
             k1, k2, k3, k4, k5, k6, k7, k8, r),
            until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))))
def __init__(self, x, xr, mask, L_enc, pdrop, args):
    """Bidirectional pyramidal GRU encoder.

    Embeds the forward (`x`) and reversed (`xr`) index sequences with
    `L_enc`, runs a forward and a backward GRU at each level, sums their
    outputs, and halves the time dimension between levels via `Downscale`.
    `self.routs` keeps the full hidden sequence of every level;
    `self.hs` is the top level (used for attention).
    """
    # NOTE shape[1] is batch size since shape[0] is seq length
    outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
    flayers = list()
    blayers = list()
    # Embedding lookups for the forward and reversed sequences.
    fsubset = L_enc[x.flatten()]
    bsubset = L_enc[xr.flatten()]
    finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    fseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    bseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim,
                         outputs_info, args, suffix="fenc0")
    binplayer = GRULayer(
        binp.astype(floatX), mask, bseqmask, args.rnn_dim, outputs_info,
        args, suffix="benc0", backwards=True
    )
    flayers.append(finplayer)
    blayers.append(binplayer)
    self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
    self.routs.append(finplayer.out + binplayer.out)
    downs = []
    for k in xrange(1, args.rlayers):
        # concatenate consecutive steps in the sequence (which are
        # downscaled to half from the previous layer)
        d = Downscale(self.routs[-1], args.rnn_dim, suffix="ds%d" % k)
        downs.append(d)
        inp = d.out
        # A downscaled step is valid if either of its two source steps was.
        twocols = mask.T.reshape([-1, 2])
        mask = T.or_(twocols[:, 0], twocols[:, 1]).reshape([mask.shape[1], -1]).T
        fseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        bseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        flayer = GRULayer(
            Dropout(inp, pdrop).out, mask, fseqmask, args.rnn_dim,
            outputs_info, args, suffix="fenc%d" % k
        )
        blayer = GRULayer(
            Dropout(inp, pdrop).out, mask, bseqmask, args.rnn_dim,
            outputs_info, args, suffix="benc%d" % k, backwards=True,
        )
        self.routs.append(flayer.out + blayer.out)
        flayers.append(flayer)
        blayers.append(blayer)
    self.hs = self.routs[-1]  # for attention
    olayer = LayerWrapper(self.routs)
    rlayers = flayers + blayers
    # NOTE careful not to assume rlayers = # layers in all cases
    # undo the temporary hack
    super(BiPyrRNNEncoder, self).__init__(rlayers, olayer, downscales=downs)
def shadow(self, points, lights):
    """
    Returns whether points are in shadow of this object.
    See: http://en.wikipedia.org/wiki/Line-sphere_intersection
    """
    # vector from points to our center
    y = points
    # Projection of each point onto the reversed light direction.
    x = T.tensordot(y, -1 * lights[0].normed_dir(), 1)
    # Discriminant of the line-sphere intersection; >= 0 means shadow.
    decider = T.sqr(x) - T.sum(T.mul(y, y), 2) + 1
    # NaN or non-positive discriminant: the shadow ray misses the sphere.
    ray_misses = T.or_(T.isnan(decider), decider <= 0)
    hit_distance = -x - T.sqrt(decider)
    return T.switch(ray_misses, -1, hit_distance)
def IoU_flatt(y_true, y_pred):
    '''Expects a binary class matrix instead of a vector of scalar classes.

    Computes per-class intersection (I) and union (U) plus overall
    accuracy, handling both Theano ('th') and TensorFlow dim orderings.
    `dim_ordering`, `void_labels`, `n_classes` and the backends (K, T, tf)
    come from the enclosing scope.
    '''
    if dim_ordering == 'th':
        y_pred = K.permute_dimensions(y_pred, (0, 2, 3, 1))

    shp_y_pred = K.shape(y_pred)
    y_pred = K.reshape(y_pred, (shp_y_pred[0]*shp_y_pred[1]*shp_y_pred[2],
                       shp_y_pred[3]))  # go back to b01,c

    # shp_y_true = K.shape(y_true)
    y_true = K.cast(K.flatten(y_true), 'int32')  # b,01 -> b01
    y_pred = K.argmax(y_pred, axis=-1)

    # We use not_void in case the prediction falls in the void class of
    # the groundtruth (chained products = logical AND over void labels)
    for i in range(len(void_labels)):
        if i == 0:
            not_void = K.not_equal(y_true, void_labels[i])
        else:
            not_void = not_void * K.not_equal(y_true, void_labels[i])

    sum_I = K.zeros((1,), dtype='float32')

    out = {}
    for i in range(n_classes):
        y_true_i = K.equal(y_true, i)
        y_pred_i = K.equal(y_pred, i)

        if dim_ordering == 'th':
            # Theano backend: boolean tensors multiply/OR directly.
            I_i = K.sum(y_true_i * y_pred_i)
            U_i = K.sum(T.or_(y_true_i, y_pred_i) * not_void)
            # I = T.set_subtensor(I[i], I_i)
            # U = T.set_subtensor(U[i], U_i)
            sum_I = sum_I + I_i
        else:
            # TensorFlow backend: logical ops, then cast for arithmetic.
            U_i = K.sum(K.cast(tf.logical_and(tf.logical_or(y_true_i, y_pred_i),
                                              not_void), 'float32'))
            y_true_i = K.cast(y_true_i, 'float32')
            y_pred_i = K.cast(y_pred_i, 'float32')
            I_i = K.sum(y_true_i * y_pred_i)
            sum_I = sum_I + I_i
        out['I'+str(i)] = I_i
        out['U'+str(i)] = U_i

    if dim_ordering == 'th':
        accuracy = K.sum(sum_I) / K.sum(not_void)
    else:
        accuracy = K.sum(sum_I) / tf.reduce_sum(tf.cast(not_void, 'float32'))
    out['acc'] = accuracy
    return out
def masked_categorical_crossentropy(output, target, mask, from_logits=False):
    """Categorical cross-entropy with selected target classes zeroed out.

    :param output: predicted distribution (or logits if ``from_logits``)
    :param target: one-hot target tensor (3-D: indexed as [:, :, class])
    :param mask: class index/indices (last axis) whose positions are
        ignored, in addition to class 0

    NOTE(review): the result is wrapped in printing.Print with a debug
    callback, so the objective is printed on every evaluation — looks
    like leftover debugging; confirm before using in production.
    """
    if from_logits:
        output = T.nnet.softmax(output)
    else:
        # scale preds so that the class probas of each sample sum to 1
        output /= output.sum(axis=-1, keepdims=True)
    # avoid numerical instability with _EPSILON clipping
    output = T.clip(output, _EPSILON, 1.0 - _EPSILON)
    objective = -T.sum(target * T.log(output), axis=output.ndim - 1)
    # Zero the loss wherever the target is the masked class or class 0.
    objective = T.set_subtensor(
        objective[T.or_(T.eq(target[:, :, mask], 1),
                        T.eq(target[:, :, 0], 1)).nonzero()],
        0.0)
    return printing.Print('Objective', global_fn=_debug_fn)(objective)
def gradient_descent(self, loss):
    """Momentum GD with gradient clipping."""
    grad = T.grad(loss, self.params)
    # NOTE(review): velocities are stored as symbolic expressions in a
    # plain Python list (not shared variables), rebuilt on every call —
    # confirm callers build this graph only once.
    self.momentum_velocity_ = [0.] * len(grad)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
    updates = OrderedDict()
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    # Clip the global norm at 5.0.
    scaling_den = T.maximum(5.0, grad_norm)
    for n, (param, grad) in enumerate(zip(self.params, grad)):
        # Non-finite norm: decay the parameter instead of stepping.
        grad = T.switch(not_finite, 0.1 * param, grad * (5.0 / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = self.momentum * velocity - self.learning_rate * grad
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def zoom_itertion_step(x_low, y_low, y_deriv_low, x_high, y_high,
                       x_recent, y_recent, x_star):
    """One iteration of the line-search `zoom` stage (scan body).

    Shrinks the bracket [x_low, x_high] around a step length satisfying
    the strong Wolfe conditions; `f`, `f_deriv`, `y0`, `y_deriv_0`,
    `c1`, `c2` and `zero` come from the enclosing scope.
    NOTE(review): "itertion" typo in the name is kept — renaming would
    break callers.
    """
    # Candidate from cubic interpolation over the current bracket.
    x_new = cubic_minimizer(x_low, y_low, y_deriv_low,
                            x_high, y_high, x_recent, y_recent)
    y_new = f(x_new)
    y_deriv_new = f_deriv(x_new)

    # Stop when x_new satisfies sufficient decrease and the curvature
    # condition (strong Wolfe).
    stop_loop_rule = sequential_and(
        y_new <= y0 + c1 * x_new * y_deriv_0,
        y_new < y_low,
        abs(y_deriv_new) <= -c2 * y_deriv_0,
    )

    # Sufficient-decrease violated (or no improvement over the low end).
    condition1 = T.or_(
        y_new > y0 + c1 * x_new * y_deriv_0,
        y_new >= y_low
    )
    # Derivative points toward the high end: flip the bracket.
    condition2 = y_deriv_new * (x_high - x_low) >= zero

    y_recent, x_recent, x_high, y_high = ifelse(
        condition1,
        [y_high, x_high, x_new, y_new],
        ifelse(
            condition2,
            [y_high, x_high, x_low, y_low],
            [y_low, x_low, x_high, y_high],
        )
    )
    x_low, y_low, y_deriv_low = ifelse(
        condition1,
        [x_low, y_low, y_deriv_low],
        [x_new, y_new, y_deriv_new],
    )
    x_star = x_new

    return (
        [
            x_low, y_low, y_deriv_low, x_high, y_high,
            y_recent, x_recent, x_star
        ],
        theano.scan_module.scan_utils.until(stop_loop_rule)
    )
def search_iteration_step(x_previous, x_current, y_previous, y_current,
                          y_deriv_previous, is_first_iteration, x_star):
    """One bracketing iteration of a Wolfe line search (scan body).

    Doubles the trial step until a Wolfe condition triggers, then calls
    `zoom` on the bracketing interval.  `f`, `f_deriv`, `y0`,
    `y_deriv_0`, `c1`, `c2`, `zero`, `theano_false`, `zoom`,
    `sequential_or` and `bitwise_not` come from the enclosing scope.
    """
    y_deriv_current = f_deriv(x_current)
    # Candidate for the next (doubled) step.
    x_new = x_current * asfloat(2)
    y_new = f(x_new)

    # Sufficient decrease failed, or f stopped decreasing (not on the
    # very first iteration): the minimum is bracketed behind us.
    condition1 = T.or_(
        y_current > (y0 + c1 * x_current * y_deriv_0),
        T.and_(
            y_current >= y_previous,
            bitwise_not(is_first_iteration),
        ))
    # Curvature condition satisfied: x_current is acceptable.
    condition2 = T.abs_(y_deriv_current) <= -c2 * y_deriv_0
    # Positive derivative: bracket with the roles reversed.
    condition3 = y_deriv_current >= zero

    x_star = ifelse(
        condition1,
        zoom(x_previous, x_current, y_previous, y_current,
             y_deriv_previous, f, f_deriv, y0, y_deriv_0, c1, c2),
        ifelse(
            condition2,
            x_current,
            ifelse(
                condition3,
                zoom(x_current, x_previous, y_current, y_previous,
                     y_deriv_current, f, f_deriv, y0, y_deriv_0, c1, c2),
                x_new,
            ),
        ),
    )
    y_deriv_previous_new = ifelse(condition1, y_deriv_previous,
                                  y_deriv_current)
    is_any_condition_satisfied = sequential_or(condition1, condition2,
                                               condition3)
    y_current_new = ifelse(is_any_condition_satisfied, y_current, y_new)

    return ([
        x_current, x_new, y_current, y_current_new,
        y_deriv_previous_new, theano_false, x_star
    ], theano.scan_module.scan_utils.until(
        sequential_or(
            T.eq(x_new, zero),
            is_any_condition_satisfied,
        )))
def rmsprop(cost, params, learning_rate, momentum=0.5, rescale=5.):
    """Build RMSProp-with-momentum update rules for `params` w.r.t. `cost`.

    Gradients are rescaled so their global L2 norm does not exceed
    `rescale`, and replaced with 0.1 * param when that norm is NaN/Inf.
    Returns a list of (shared_variable, new_value) update pairs.
    """
    grads = T.grad(cost=cost, wrt=params)

    # Per-parameter accumulators: running mean of squared gradients,
    # running mean of gradients, and the momentum memory.
    running_square_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]
    running_avg_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]
    memory_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]

    # Global gradient norm.  BUG FIX: the previous version applied T.sqrt
    # twice (once on the sum of squares and once more on the result),
    # producing (sum g^2) ** 0.25 instead of the L2 norm.
    grad_sqr_sum = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_sqr_sum), T.isinf(grad_sqr_sum))
    grad_norm = T.sqrt(grad_sqr_sum)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4

    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Rescale the gradient; fall back to 0.1 * param when non-finite.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = running_avg_[n]
        new_avg = combination_coeff * old_avg + (1. - combination_coeff) * grad
        # RMS of the centered gradient, floored to avoid division blow-up.
        rms_grad = T.sqrt(new_square - new_avg**2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = memory_[n]
        # Momentum step (`update`) and the lookahead form applied to the
        # parameter itself (`update2`).
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def _step(i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r):
    """One scan step of a continued-fraction evaluation (two CF terms per
    step, with overflow/underflow rescaling).

    `i` is the scan iteration index (unused).  Free variables (`x`, `zero`,
    `one`, `two`, `k26update`, `BIG`, `BIGINV`, `THRESH`, `until`) come
    from the enclosing scope.  NOTE(review): the k-coefficient pattern
    looks like the incomplete-beta continued fraction — confirm against
    the caller.
    """
    # First (odd) continued-fraction term.
    xk = -(x * k1 * k2) / (k3 * k4)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk
    # Second (even) continued-fraction term.
    xk = (x * k5 * k6) / (k7 * k8)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk
    old_r = r
    # Update the convergent only when the denominator is non-zero.
    r = tt.switch(tt.eq(qk, zero), r, pk / qk)
    # Advance the term coefficients for the next iteration.
    k1 += one
    k2 += k26update
    k3 += two
    k4 += two
    k5 += one
    k6 -= k26update
    k7 += two
    k8 += two
    # Rescale numerator/denominator recurrences to keep them in range.
    big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
    biginv_cond = tt.or_(tt.lt(tt.abs_(qk), BIGINV),
                         tt.lt(tt.abs_(pk), BIGINV))
    pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
    pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
    qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
    qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)
    pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
    pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
    qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
    qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)
    # Terminate when successive convergents agree to relative tol THRESH.
    return (
        (pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r),
        until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))),
    )
def get_clip_rmsprop_updates(self, params, cost, learning_rate, momentum,
                             rescale=5.):
    """RMSProp updates with gradient rescaling, a NaN/Inf safeguard and
    momentum.  Returns an OrderedDict mapping param -> new value.

    Fixes two defects of the previous version:
    * the lazy-initialization guard tested `running_average_`, an
      attribute that was never created, so all accumulators were reset on
      every call;
    * the gradient norm was square-rooted twice, yielding
      (sum g^2) ** 0.25 instead of the L2 norm.
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    # Lazily initialize accumulator state, carried on self across calls.
    if not hasattr(self, "running_square_"):
        self.running_square_ = [0.] * len(gparams)
        self.running_avg_ = [0.] * len(gparams)
        self.updates_storage_ = [0.] * len(gparams)
    if not hasattr(self, "momentum_velocity_"):
        self.momentum_velocity_ = [0.] * len(gparams)

    # Gradient clipping: rescale by the global L2 norm.
    grad_sqr_sum = sum(T.sqr(g).sum() for g in gparams)
    not_finite = T.or_(T.isnan(grad_sqr_sum), T.isinf(grad_sqr_sum))
    grad_norm = T.sqrt(grad_sqr_sum)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    for n, (param, gparam) in enumerate(zip(params, gparams)):
        # Rescaled gradient; falls back to 0.1 * param when non-finite.
        gparam = T.switch(not_finite, 0.1 * param,
                          gparam * (scaling_num / scaling_den))
        combination_coeff = 0.9
        minimum_grad = 1e-4
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(gparam)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * gparam
        # RMS of the centered gradient, floored to avoid division blow-up.
        rms_grad = T.sqrt(new_square - new_avg**2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        velocity = self.momentum_velocity_[n]
        update_step = momentum * velocity - learning_rate * (gparam / rms_grad)
        self.running_square_[n] = new_square
        self.running_avg_[n] = new_avg
        self.updates_storage_[n] = update_step
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams):
    """Clip gradients to a maximum global norm and neutralize NaN/Inf.

    When the squared global norm exceeds `clip_c_shared ** 2`, gradients
    are rescaled to norm `clip_c_shared`; non-finite gradients are replaced
    by 0.1 * param.  Returns (gradients, global_norm).  Clipping is skipped
    entirely when the shared clip constant is not positive.
    """
    sq_norm = 0.
    for grad in grads:
        sq_norm += (grad * grad).sum()
    bad_norm = tensor.or_(tensor.isnan(sq_norm), tensor.isinf(sq_norm))

    if clip_c_shared.get_value() <= 0.:
        return grads, tensor.sqrt(sq_norm)

    clipped = []
    for grad, param in zip(grads, itemlist(mt_tparams)):
        rescaled = tensor.switch(sq_norm > (clip_c_shared * clip_c_shared),
                                 grad / tensor.sqrt(sq_norm) * clip_c_shared,
                                 grad)
        clipped.append(
            tensor.switch(bad_norm, np.float32(0.1) * param, rescaled))
    return clipped, tensor.sqrt(sq_norm)
def distance(self, rayField):
    """
    Return, per ray, the distance at which a hit occurs.
    Rays that miss (non-positive or NaN discriminant) get inf.
    """
    local_rays = self.w2o(rayField)
    proj = T.tensordot(local_rays.rays, local_rays.origin, 1)
    ray_sqnorm = T.sum(local_rays.rays * local_rays.rays, axis=2)
    disc = self._hit(local_rays.rays, local_rays.origin)
    root = T.sqrt(disc)
    # Two roots of the quadratic; keep the nearer one.
    near_hit = (-proj - root) / ray_sqnorm
    far_hit = (-proj + root) / ray_sqnorm
    nearest = T.minimum(near_hit, far_hit)
    miss = T.or_(disc <= 0, T.isnan(disc))
    return T.switch(miss, float('inf'), nearest)
def compute_updates(self, training_cost, params):
    """Build parameter updates for `training_cost`.

    Gradients are clipped to global norm `self.cutoff`, replaced by
    0.1 * param when the norm is NaN/Inf, optionally masked so pretrained
    word embeddings stay fixed, then handed to the configured updater.

    Raises Exception for unimplemented or unknown updaters.

    Cleanups vs. the previous version: removed the dead `updates = {}`
    initializer (it was always overwritten) and normalized the mixed
    `numpy` / `np` alias usage to `numpy`.
    """
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff: global-norm clipping with a NaN/Inf safeguard.
    c = numpy.float32(self.cutoff)
    clip_grads = []
    norm_gs = T.sqrt(sum(T.sum(g**2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs,
                             numpy.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p,
                                       g * normalization)))
    grads = OrderedDict(clip_grads)

    if self.initialize_from_pretrained_word_embeddings and self.fix_pretrained_word_embeddings:
        # Keep pretrained word embeddings fixed
        logger.debug("Will use mask to fix pretrained word embeddings")
        grads[self.language_model.W_emb] = grads[
            self.language_model.
            W_emb] * self.language_model.W_emb_pretrained_mask
    else:
        logger.debug("Will train all word embeddings")

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")
    return updates
def sgdmgc(cost, params, lr=1.0, alpha=0.1, max_magnitude=5.0, infDecay=0.1):
    """SGD with momentum and gradient clipping.

    Gradients are rescaled to `max_magnitude` when their global norm
    exceeds it, and replaced with `infDecay * param` when the norm is
    NaN/Inf.  Returns (updates, squared_gradient_norm).
    """
    grads = T.grad(cost=cost, wrt=params)
    sq_norm = norm_gs(params, grads)
    gnorm = T.sqrt(sq_norm)
    bad_grad = T.or_(T.isnan(gnorm), T.isinf(gnorm))
    rescale = T.switch(T.ge(gnorm, max_magnitude),
                       max_magnitude / gnorm, 1.)

    updates = []
    for param, grad in zip(params, grads):
        velocity = shared(param.get_value() * 0.)
        safe_grad = T.switch(bad_grad, infDecay * param, grad * rescale)
        new_velocity = velocity * (1.0 - alpha) - alpha * lr * safe_grad
        updates.append((velocity, new_velocity))
        updates.append((param, param + new_velocity))
    return updates, sq_norm
def get_action_results(self, last_states, actions, **kwargs):
    """Environment tick: apply `actions` to `last_states` and return
    (new_state, observation).

    State layout per row: one flag per action marking "already tried this
    session", plus a final column marking "session ended".
    """
    #unpack state and action
    last_state = check_list(last_states)[0]
    action = check_list(actions)[0]

    #state is a boolean vector: whether or not i-th action
    #was tried already during this session
    #last output[:,end_code] always remains 1 after first being triggered

    #whether session was active before tick
    session_active = T.eq(last_state[:, -1], 0)

    #whether session was terminated by the end of this tick
    session_terminated = T.or_(T.eq(session_active, 0),
                               in1d(action, self.terminal_action_ids))

    # Mark the chosen action as tried and refresh the "ended" flag.
    batch_range = T.arange(action.shape[0])
    state_after_action = T.set_subtensor(last_state[batch_range, action], 1)
    state_after_action = T.set_subtensor(state_after_action[:, -1],
                                         session_terminated)

    # Inactive sessions keep their previous state untouched.
    new_state = T.switch(session_active.reshape([-1, 1]),
                         state_after_action, last_state)

    #if allowed to see attribute
    observed_attrs = T.switch(
        state_after_action[:, :self.attributes.shape[1]], self.attributes,
        -1)

    observation = T.concatenate(
        [
            observed_attrs,  #float32[batch,1] response
            T.extra_ops.to_one_hot(
                action, self.joint_data.shape[1]),  #what action was commited
            session_terminated.reshape(
                [-1, 1]),  # whether session is terminated by now
        ],
        axis=1)
    return new_state, observation
def clip_gradients_norm(gradients, threshold, parameters, fix_nan=False):
    """Scale every gradient by max(global_norm, threshold).

    With `fix_nan`, a NaN/Inf global norm replaces each gradient by a
    pseudogradient 0.1 * parameter (printed via `Print` for visibility).
    Returns the list of rescaled gradients.
    """
    squared_terms = T.concatenate([T.sqr(g.flatten()) for g in gradients])
    global_norm = T.sqrt(squared_terms.sum())
    divisor = T.maximum(global_norm, threshold)
    nan_mask = (T.or_(T.isnan(global_norm), T.isinf(global_norm))
                if fix_nan else None)

    result = []
    for idx, grad in enumerate(gradients):
        if fix_nan:
            pseudo = 0.1 * parameters[idx]
            pseudo = Print(
                "NaN detected! Fixing with pseudogradient with mean:",
                ["mean"])(pseudo)
            result.append(T.switch(nan_mask, pseudo, grad / divisor))
        else:
            result.append(grad / divisor)
    return result
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8,
            max_magnitude=5.0, infDecay=0.1):
    """Adam with global-gradient-norm clipping and a NaN/Inf safeguard.

    Gradients are rescaled to `max_magnitude` when the global norm exceeds
    it, and replaced by `infDecay * param` when the norm is non-finite.
    Returns (updates, squared_gradient_norm).
    """
    grads = T.grad(cost, params)
    sq_norm = norm_gs(params, grads)
    gnorm = T.sqrt(sq_norm)
    bad_grad = T.or_(T.isnan(gnorm), T.isinf(gnorm))
    rescale = T.switch(T.ge(gnorm, max_magnitude),
                       max_magnitude / gnorm, 1.)

    # Shared step counter and Adam bias-correction factors.
    step = shared(floatX(0.))
    step_next = step + 1.
    fix1 = 1. - (1. - b1)**step_next
    fix2 = 1. - (1. - b2)**step_next
    lr_t = lr * (T.sqrt(fix2) / fix1)

    updates = []
    for param, grad in zip(params, grads):
        grad = T.switch(bad_grad, infDecay * param, grad * rescale)
        m = shared(param.get_value() * 0.)
        v = shared(param.get_value() * 0.)
        m_next = (b1 * grad) + ((1. - b1) * m)
        v_next = (b2 * T.sqr(grad)) + ((1. - b2) * v)
        direction = m_next / (T.sqrt(v_next) + e)
        updates.append((m, m_next))
        updates.append((v, v_next))
        updates.append((param, param - (lr_t * direction)))
    updates.append((step, step_next))
    return updates, sq_norm
def th_el_nearestd(x1: tt.TensorVariable, x2: tt.TensorVariable,
                   mu: tt.TensorConstant,
                   Ainv: tt.TensorConstant) -> tt.TensorVariable:
    # See numpy version for info/comments.
    # Back-projects both segment endpoints, finds the closest point of each
    # segment to the origin in that space, and returns the minimum distance
    # over the K segments for each of the N rows.
    D = x1.shape[1]
    x1g = th_el_backproject_all(x1, mu, Ainv)  # (N, K, D)
    x2g = th_el_backproject_all(x2, mu, Ainv)
    x1gf = x1g.reshape((-1, D))
    x2gf = x2g.reshape((-1, D))
    diff = x2gf - x1gf
    # Projection parameter t of the origin onto each segment.
    num = -matmul(x1gf.dimshuffle(0, 'x', 1),
                  diff.dimshuffle(0, 1, 'x')).squeeze()
    # num = -np.matmul(x1gf[..., None, :], diff[..., :, None]).squeeze()
    den = matmul(diff.dimshuffle(0, 'x', 1),
                 diff.dimshuffle(0, 1, 'x')).squeeze()
    # den = np.matmul(diff[..., None, :], diff[..., :, None]).squeeze()
    t = num / den  # type: tt.TensorVariable
    # Classify the closest point: before the start (tneg), past the end
    # (tbig), or on the interior of the segment (tin).
    tneg = t < 0
    tbig = t > tt.sqrt(den)
    # tbig = t > np.sqrt(den)
    tout = tt.or_(tneg, tbig)
    # tout = np.logical_or(tneg, tbig)
    tin = ~tout
    # tin = np.logical_not(tout)
    d_ = x1gf + diff * t.dimshuffle(0, 'x')
    # Distance via the interior point, or via the relevant endpoint; the
    # boolean masks make exactly one of the three terms non-zero per row.
    dpoa = tt.sqrt(tt.sum(d_ * d_, axis=1, keepdims=True)).squeeze() * tin
    # dpoa = np.linalg.norm(x1gf + diff * t[:, None], axis=1) * tin
    dx1 = tt.sqrt(tt.sum(x1gf * x1gf, axis=1, keepdims=True)).squeeze() * tneg
    # dx1 = np.linalg.norm(x1gf, axis=1) * tneg
    dx2 = tt.sqrt(tt.sum(x2gf * x2gf, axis=1, keepdims=True)).squeeze() * tbig
    # dx2 = np.linalg.norm(x2gf, axis=1) * tbig
    d = dpoa + dx1 + dx2
    d = d.reshape(x1g.shape[:-1])  # (N, K)
    return tt.min(d, axis=1)  # (N,)
def mcmc(ll, *frvs):
    """One MCMC scan step over the free random variables `frvs`.

    Performs what appears to be a single leapfrog update (gradient
    half-step, position step, gradient half-step) followed by a
    Metropolis accept/reject on the change in Hamiltonian — i.e. one
    HMC-style proposal; TODO confirm against the caller.  Free variables
    (`observations`, `free_RVs`, `free_RVs_prop`, `full_log_likelihood`,
    `epsilon`, `U`) come from the enclosing scope.
    """
    full_observations = dict(observations)
    full_observations.update(
        dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))
    # NOTE: `loglik` is the NEGATIVE log-likelihood (energy).
    loglik = -full_log_likelihood(full_observations)

    proposals = free_RVs_prop
    # Hamiltonian = kinetic (momentum) term + potential (energy) term.
    H = tensor.add(*[tensor.sum(tensor.sqr(p))
                     for p in proposals]) / 2. + loglik

    # -- this should be an inner loop
    # Momentum half-step, then full position step.
    g = []
    g.append(tensor.grad(loglik, frvs))
    proposals = [(p - epsilon * gg[0] / 2.) for p, gg in zip(proposals, g)]

    rvsp = [(rvs + epsilon * rvp) for rvs, rvp in zip(frvs, proposals)]

    full_observations = dict(observations)
    full_observations.update(
        dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
    new_loglik = -full_log_likelihood(full_observations)

    # Second momentum half-step at the proposed position.
    gnew = []
    gnew.append(tensor.grad(new_loglik, rvsp))
    proposals = [(p - epsilon * gn[0] / 2.)
                 for p, gn in zip(proposals, gnew)]
    # --
    Hnew = tensor.add(*[tensor.sum(tensor.sqr(p))
                        for p in proposals]) / 2. + new_loglik

    # Metropolis test: always accept downhill moves, accept uphill moves
    # with probability exp(-dH) (U is a uniform random draw).
    dH = Hnew - H
    accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

    return [tensor.switch(accept, -new_loglik, ll)] + \
        [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
        {}, theano.scan_module.until(accept)
def compute_updates(training_cost, params, config):
    """Clip gradients to unit global norm (with a NaN/Inf safeguard) and
    build Adam updates with `config.learning_rate`."""
    grads = OrderedDict(zip(params, T.grad(training_cost, params)))

    # Clip stuff
    cutoff = np.float32(1.)
    global_norm = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    scale = T.switch(T.ge(global_norm, cutoff), cutoff / global_norm,
                     np.float32(1.))
    bad_norm = T.or_(T.isnan(global_norm), T.isinf(global_norm))

    safe_grads = []
    for param, grad in grads.items():
        safe_grads.append((param,
                           T.switch(bad_norm, np.float32(.1) * param,
                                    grad * scale)))
    grads = OrderedDict(safe_grads)

    # Use the Adam gradient-update strategy.
    updates = Adam(grads, config.learning_rate)
    return updates
def adam(self, cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    """Adam updates with a total-norm constraint of 10 and a NaN/Inf
    gradient safeguard.  Returns an OrderedDict of updates, including a
    shared step counter."""
    grads = T.grad(cost=cost, wrt=params)
    grads = total_norm_constraint(grads, 10)
    gnorm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    bad_grad = T.or_(T.isnan(gnorm), T.isinf(gnorm))

    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()
    t = t_prev + 1
    # Bias-corrected step size.
    a_t = learning_rate * T.sqrt(1 - beta2**t) / (1 - beta1**t)

    for param, grad in zip(params, grads):
        # Non-finite gradients fall back to 0.1 * param.
        grad = T.switch(bad_grad, 0.1 * param, grad)
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        m_t = beta1 * m_prev + (1 - beta1) * grad
        v_t = beta2 * v_prev + (1 - beta2) * grad**2
        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - a_t * m_t / (T.sqrt(v_t) + epsilon)
    updates[t_prev] = t
    return updates
def anneal_learning_rate(lr, t, method='half-life', **kwargs):
    """Attach a decay schedule to the shared learning rate `lr` via its
    `default_update`, expressed in the iteration counter `t`.

    Supported methods: 'half-life', 'step', 'exponential', 'linear',
    'inverse'.  Raises TypeError when `lr` is not a shared variable and
    ValueError for missing options or an unknown method.
    """
    shared_types = (T.sharedvar.ScalarSharedVariable,
                    T.sharedvar.TensorSharedVariable)
    if not isinstance(lr, shared_types):
        raise TypeError('lr must be a shared variable, got %s.' % type(lr))

    base_lr = lr.get_value()
    if method == 'half-life':
        num_iters = kwargs.pop('num_iters', None)
        decay = kwargs.pop('decay', .1)
        if num_iters is None:
            raise ValueError('num_iters must be provided.')
        # Decay once at the halfway mark and once at three-quarters.
        hit = T.cast(
            T.or_(T.eq(t, num_iters // 2), T.eq(t, 3 * num_iters // 4)),
            theano.config.floatX)
        lr.default_update = lr * decay * hit + (1. - hit) * lr
    elif method == 'step':
        step = kwargs.pop('step', None)
        decay = kwargs.pop('decay', .5)
        if step is None:
            raise ValueError('step must be provided.')
        # Decay every `step` iterations.
        hit = T.cast(T.eq(T.mod(t, step), 0), theano.config.floatX)
        lr.default_update = lr * decay * hit + (1. - hit) * lr
    elif method == 'exponential':
        decay = kwargs.pop('decay', 1e-4)
        t = T.cast(t, theano.config.floatX)
        lr.default_update = base_lr * T.exp(-decay * t)
    elif method == 'linear':
        num_iters = kwargs.pop('num_iters', None)
        if num_iters is None:
            raise ValueError('num_iters must be provided.')
        t = T.cast(t, theano.config.floatX)
        lr.default_update = base_lr * (
            1. - t / np.cast[theano.config.floatX](num_iters))
    elif method == 'inverse':
        decay = kwargs.pop('decay', .01)
        t = T.cast(t, theano.config.floatX)
        lr.default_update = base_lr / (1. + decay * t)
    else:
        raise ValueError('Unknown annealing method.')
def get_clip_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate,
                         momentum, rescale=5.):
    """Momentum SGD updates with gradient rescaling and a NaN/Inf
    safeguard.  Returns an OrderedDict mapping param -> new value.

    BUG FIX: the previous version applied T.sqrt twice to the sum of
    squared gradients, producing (sum g^2) ** 0.25 instead of the L2 norm
    used for clipping.
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    # Lazily initialize momentum state, carried on self across calls.
    if not hasattr(self, "momentum_velocity_"):
        self.momentum_velocity_ = [0.] * len(gparams)

    # Gradient clipping: rescale by the global L2 norm.
    grad_sqr_sum = sum(T.sqr(g).sum() for g in gparams)
    not_finite = T.or_(T.isnan(grad_sqr_sum), T.isinf(grad_sqr_sum))
    grad_norm = T.sqrt(grad_sqr_sum)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    for n, (param, gparam) in enumerate(zip(params, gparams)):
        # clip gradient directly, not momentum etc.
        gparam = T.switch(not_finite, 0.1 * param,
                          gparam * (scaling_num / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = momentum * velocity - learning_rate * gparam
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def clip_gradient(self, params, gparams, scalar=5, check_nanF=True):
    """
    Rescale `gparams` in place so their global L2 norm is at most
    `scalar`; with `check_nanF`, non-finite gradients are replaced by
    0.1 * param.  Returns the (mutated) gradient list.
    """
    count = len(gparams)
    # Global squared gradient norm.
    g_norm = 0.
    for gparam in gparams:
        g_norm += (gparam**2).sum()
    if check_nanF:
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
    g_norm = T.sqrt(g_norm)
    # Rescaling factor: 1 when already within bounds.
    factor = scalar / T.maximum(scalar, g_norm)
    if check_nanF:
        for i in xrange(count):
            gparams[i] = T.switch(not_finite, 0.1 * params[i],
                                  gparams[i] * factor)
    else:
        for i in xrange(count):
            gparams[i] = gparams[i] * factor
    return gparams
def to_weights(d, m, p, prior): hid_inp = self.dwe[d, :] # mw x ms x hd if self.is_lstm or self.is_gru: logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m) mask = mk.dimshuffle(0, 'x', 'x') l2 = logit * mask # mw x ms x mw l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x') w1 = T.switch(T.isnan(w0), 0, w0) else: if self.lm_mode == 'diag': B = hid_inp * Wt.dimshuffle('x', 'x', 0) tmp = T.tensordot(B, B.T, axes = 1) elif self.lm_mode == 'iden': logit = T.tensordot(self.dwe[d, :], self.dwe.T, axes=1)[:,:,d] # mw x ms x mw x ms cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0) # 1 x 1 x mw logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt # mw x ms x mw logit = T.exp(10*T.switch(T.isnan(logit), 0, logit)) # mw x ms x mw logit = T.prod(logit, axis=2) * prior # mw x ms sm = T.sum(logit * m, axis=1, keepdims=True) # mw x 1 logit = (logit * m) / sm # mw x ms return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit) else: tmp = T.tensordot(T.dot(hid_inp, self.params['Wt']), hid_inp.T, axes=1) # mw x ms x ms x mw tmp = T.exp(tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms tmp = tmp * m.dimshuffle('x', 'x', 0, 1) nrm = T.sum(tmp, axis=3) tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x') tmp = T.switch(T.isnan(tmp), 0, tmp) mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m) tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms tmp = tmp * prior tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x') w1 = T.switch(T.isnan(tmp), 0, tmp) return w1
def get_idx(q_nbrs, q_mem):
    """Pick, per query, a memory index whose label matches the query.

    Neighbour hits are preferred (and mapped back to full memory
    indices via `nbrs`); otherwise a (possibly random) memory slot is
    chosen.  Returns (idx, match_mask) where the boolean mask says
    whether any matching label was found at all.
    """
    # Was a matching label found among the neighbours / in all of memory?
    match_in_nbrs = T.any(q_nbrs, axis=1)
    match_in_mem = T.any(q_mem, axis=1)
    match_anywhere = T.or_(match_in_nbrs, match_in_mem)

    # Prefer a neighbour hit, translated to its full memory index;
    # otherwise fall back to choosing from memory directly.
    batch_rows = T.arange(nbrs.shape[0])
    chosen = T.switch(
        match_in_nbrs,
        nbrs[batch_rows, tensor_choose_k(q_nbrs, self.rng, k=1)],
        tensor_choose_k(q_mem, self.rng, k=1, random=True))

    return (chosen, match_anywhere)
def update_s(s, alphas, scorematrix, queryseq, blank, t):
    """Forward-variable update for label position `s` at time `t` in a
    CTC-style alpha recursion.

    Even `s` correspond to the blank symbol; odd `s` correspond to label
    `queryseq[(s - 1) // 2]`.  The skip transition from `s - 2` is only
    allowed for label positions that are neither the first label nor a
    repeat of the previous label.
    """
    l = (s - 1) // 2  # index of the real label when s is odd
    alphas = ifelse(
        tensor.eq(s % 2, 0),
        # Blank positions: transitions from s and s-1 only.
        ifelse(tensor.eq(s, 0),
               tensor.set_subtensor(
                   alphas[s, t], alphas[s, t - 1] * scorematrix[blank, t]),
               tensor.set_subtensor(
                   alphas[s, t],
                   (alphas[s, t - 1] + alphas[s - 1, t - 1]) *
                   scorematrix[blank, t]),
               name='for_blank_symbol'),
        # Label positions: add the skip from s-2 unless s == 1 or the label
        # repeats the previous one.
        ifelse(tensor.or_(tensor.eq(s, 1),
                          tensor.eq(queryseq[l], queryseq[l - 1])),
               tensor.set_subtensor(
                   alphas[s, t],
                   (alphas[s, t - 1] + alphas[s - 1, t - 1]) *
                   scorematrix[queryseq[l], t]),
               tensor.set_subtensor(
                   alphas[s, t],
                   (alphas[s, t - 1] + alphas[s - 1, t - 1] +
                    alphas[s - 2, t - 1]) * scorematrix[queryseq[l], t]),
               name='for_same_label_twice'))
    return alphas
def truncated_normal(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Sample a normal(avg, std) truncated to [lbound, ubound] by
    inverse-CDF sampling; when the mean lies far outside the interval the
    nearest bound is returned instead.  `SQRT2` comes from the enclosing
    scope."""

    def cdf(v):
        # Normal CDF evaluated at v.
        z = (v - avg) / (std * SQRT2)
        return (0.5 * (1. + T.erf(z))).astype(dtype)

    def inv_cdf(c):
        # Inverse normal CDF, with the erfinv argument clipped away from
        # +/-1 for numerical safety.
        clipped = T.clip(2. * c - 1., -1. + 1e-6, 1. - 1e-6)
        return (avg + std * SQRT2 * T.erfinv(clipped)).astype(dtype)

    # Map a uniform draw through the CDF range of the bounds.
    u = theano_rng.uniform(size=size, dtype=dtype)
    low_cdf = cdf(lbound)
    sample = inv_cdf(low_cdf + u * (cdf(ubound) - low_cdf))

    # Out-of-range samples (possible when avg is far outside the bounds)
    # collapse to the nearest bound.
    return T.switch(T.or_(sample < lbound, sample > ubound),
                    T.switch(avg >= ubound, ubound, lbound),
                    sample)
def __init__(self, nc, nf, kwargs):
    # Build a (bi-directional) deep RNN tagger with `nc` output classes and
    # `nf` input features; all hyper-parameters arrive via `kwargs` and are
    # copied onto self from RDNN.param_names.
    assert nf; assert nc
    self.kwargs = extract_rnn_params(kwargs)
    for pname in RDNN.param_names:
        setattr(self, pname, kwargs[pname])
    self.lr = theano.shared(np.array(self.lr, dtype='float32'), allow_downcast=True)
    self.gclip = False if self.gclip == 0 else self.gclip # mysteriously, we need this line
    # One activation spec per hidden layer; layer type is the part after '-'.
    self.activation = [self.activation] * len(self.n_hidden)
    self.deep_ltypes = [act_str.split('-')[1] for act_str in self.activation]
    self.opt = getattr(lasagne.updates, self.opt)
    ldepth = len(self.n_hidden)

    # network
    default_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform())
    forget_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(self.fbias))
    """default_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal())
    forget_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(self.fbias))"""

    l_in = lasagne.layers.InputLayer(shape=(None, None, nf))
    logging.debug('l_in: {}'.format(lasagne.layers.get_output_shape(l_in)))
    N_BATCH_VAR, MAX_SEQ_LEN_VAR, _ = l_in.input_var.shape # symbolic ref to input_var shape
    # l_mask = lasagne.layers.InputLayer(shape=(N_BATCH_VAR, MAX_SEQ_LEN_VAR))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    logging.debug('l_mask: {}'.format(lasagne.layers.get_output_shape(l_mask)))

    curlayer = l_in
    # Optional linear embedding of the raw input features.
    if self.emb:
        l_reshape = lasagne.layers.ReshapeLayer(l_in, (-1, nf))
        logging.debug('l_reshape: {}'.format(lasagne.layers.get_output_shape(l_reshape)))
        l_emb = lasagne.layers.DenseLayer(l_reshape, num_units=self.emb, nonlinearity=None, b=None)
        logging.debug('l_emb: {}'.format(lasagne.layers.get_output_shape(l_emb)))
        l_emb = lasagne.layers.ReshapeLayer(l_emb, (N_BATCH_VAR, MAX_SEQ_LEN_VAR, self.emb))
        logging.debug('l_emb: {}'.format(lasagne.layers.get_output_shape(l_emb)))
        curlayer = l_emb
    # Input dropout.
    if self.drates[0] > 0:
        l_in_drop = lasagne.layers.DropoutLayer(curlayer, p=self.drates[0])
        logging.debug('l_drop: {}'.format(lasagne.layers.get_output_shape(l_in_drop)))
        curlayer = l_in_drop

    self.layers = [curlayer]
    self.blayers = []
    # Stack of bi-directional recurrent layers, one per entry in n_hidden.
    for level, ltype, n_hidden in zip(range(1,ldepth+1), self.deep_ltypes, self.n_hidden):
        prev_layer = self.layers[level-1]
        if ltype in ['relu','lrelu', 'relu6', 'elu']:
            # Plain recurrent layer with a rectifier-family nonlinearity and
            # identity hidden-to-hidden initialization (IRNN-style).
            LayerType = lasagne.layers.RecurrentLayer
            if ltype == 'relu': nonlin = lasagne.nonlinearities.rectify
            elif ltype == 'lrelu': nonlin = lasagne.nonlinearities.leaky_rectify
            elif ltype == 'relu6': nonlin = lambda x: T.min(lasagne.nonlinearities.rectify(x), 6)
            elif ltype == 'elu': nonlin = lambda x: T.switch(x >= 0, x, T.exp(x) - 1)
            l_forward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate,
                                  W_hid_to_hid=Identity(), W_in_to_hid=lasagne.init.GlorotUniform(gain='relu'), nonlinearity=nonlin)
            l_backward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate,
                                   W_hid_to_hid=Identity(), W_in_to_hid=lasagne.init.GlorotUniform(gain='relu'), nonlinearity=nonlin, backwards=True)
        elif ltype == 'lstm':
            LayerType = lasagne.layers.LSTMLayer
            l_forward = LayerType(prev_layer, n_hidden, ingate=default_gate(), forgetgate=forget_gate(), outgate=default_gate(),
                                  mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate)
            l_backward = LayerType(prev_layer, n_hidden, ingate=default_gate(), forgetgate=forget_gate(), outgate=default_gate(),
                                   mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate, backwards=True)
        elif ltype == 'gru':
            LayerType = lasagne.layers.GRULayer
            l_forward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate)
            l_backward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate, backwards=True)
        logging.debug('l_forward: {}'.format(lasagne.layers.get_output_shape(l_forward)))
        logging.debug('l_backward: {}'.format(lasagne.layers.get_output_shape(l_backward)))

        # Merge the two directions, then apply per-level dropout.
        if self.fbmerge == 'concat':
            l_fbmerge = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
        elif self.fbmerge == 'sum':
            l_fbmerge = lasagne.layers.ElemwiseSumLayer([l_forward, l_backward])
        logging.debug('l_fbmerge: {}'.format(lasagne.layers.get_output_shape(l_fbmerge)))
        if self.drates[level] > 0:
            l_fbmerge = lasagne.layers.DropoutLayer(l_fbmerge, p=self.drates[level])
        self.blayers.append((l_forward, l_backward))
        self.layers.append(l_fbmerge)

    # Optionally feed the (possibly embedded) input straight to the output.
    l_fbmerge = lasagne.layers.ConcatLayer([l_fbmerge, curlayer], axis=2) if self.in2out else l_fbmerge

    # Output head: recurrent (recout 1), bi-recurrent averaged (recout 2),
    # or a plain dense softmax over flattened timesteps (otherwise).
    if self.recout == 1:
        logging.info('using recout:%d.'%self.recout)
        l_out = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                                              W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax)
        # W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=lasagne.nonlinearities.softmax) CHANGED
        logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))
    elif self.recout == 2:
        logging.info('using recout:%d.'%self.recout)
        l_fout = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                                               W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax)
        l_bout = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                                               W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax, backwards=True)
        l_out = lasagne.layers.ElemwiseSumLayer([l_fout, l_bout], coeffs=0.5)
        # l_out = LogSoftMerge([l_fout, l_bout])
        logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))
    else:
        l_reshape = lasagne.layers.ReshapeLayer(l_fbmerge, (-1, self.n_hidden[-1]*(2 if self.fbmerge=='concat' else 1)))
        logging.debug('l_reshape: {}'.format(lasagne.layers.get_output_shape(l_reshape)))
        l_rec_out = lasagne.layers.DenseLayer(l_reshape, num_units=nc, nonlinearity=log_softmax)
        logging.debug('l_rec_out: {}'.format(lasagne.layers.get_output_shape(l_rec_out)))
        l_out = lasagne.layers.ReshapeLayer(l_rec_out, (N_BATCH_VAR, MAX_SEQ_LEN_VAR, nc))
        logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))
        self.l_soft_out = l_rec_out
    self.output_layer = l_out

    target_output = T.tensor3('target_output')
    out_mask = T.tensor3('mask')

    """ def cost(output): return -T.sum(out_mask*target_output*T.log(output))/T.sum(out_mask) """
    def cost(output): # expects log softmax output
        # Masked negative log-likelihood, normalized by the mask mass.
        return -T.sum(out_mask*target_output*output)/T.sum(out_mask)

    cost_train = cost(lasagne.layers.get_output(l_out, deterministic=False))
    cost_eval = cost(lasagne.layers.get_output(l_out, deterministic=True))
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    logging.debug(all_params)
    # NOTE(review): due to lambda/conditional precedence this parses as
    # lambda: (l_out.get_params() if self.recout == 0 else (lambda: ...)),
    # i.e. the non-recout-0 call returns a nested lambda — probably the
    # intent was (lambda: ...) if ... else (lambda: ...).  Left as-is.
    self.recout_hid2hid = lambda : l_out.get_params() if self.recout == 0 else lambda : l_out.get_params()[-1].get_value()

    # Gradient norm constraint plus a NaN/Inf safeguard (fall back to
    # 0.01 * param when the total norm is non-finite).
    grads = T.grad(cost_train, all_params)
    all_grads, total_norm = lasagne.updates.total_norm_constraint(grads, self.norm, return_norm=True)
    #all_grads.append(grads[-2])
    #all_grads.append(grads[-1])
    all_grads = [T.switch(T.or_(T.isnan(total_norm), T.isinf(total_norm)), p*0.01 , g) for g,p in zip(all_grads, all_params)]

    # Optional annealed gradient noise (std = nu / (1 + step) ** gamma).
    if self.gnoise:
        from theano.tensor.shared_randomstreams import RandomStreams
        srng = RandomStreams(seed=1234)
        e_prev = theano.shared(lasagne.utils.floatX(0.))
        nu = 0.01
        gamma = 0.55
        gs = [g + srng.normal(T.shape(g), std=(nu / ((1 + e_prev)**gamma))) for g in all_grads]
        updates = self.opt(gs, all_params, self.lr, self.eps)
        updates[e_prev] = e_prev + 1
    else:
        updates = self.opt(all_grads, all_params, self.lr, self.eps)

    logging.info("Compiling functions...")
    self.train_model = theano.function(inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask],
                                       outputs=cost_train, updates=updates, allow_input_downcast=True)
    self.predict_model = theano.function(
        inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask],
        outputs=[cost_eval, lasagne.layers.get_output(l_out, deterministic=True)])
    # aux
    self.train_model_debug = theano.function(
        inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask],
        outputs=[cost_train]+lasagne.layers.get_output([l_out, l_fbmerge], deterministic=True)+[total_norm],
        updates=updates)
    self.compute_cost = theano.function([l_in.input_var, target_output, l_mask.input_var, out_mask], cost_eval)
    self.compute_cost_train = theano.function([l_in.input_var, target_output, l_mask.input_var, out_mask], cost_train)
    # self.info_model = theano.function([],recout_hid2hid)
    logging.info("Compiling done.")
fp_multiplier = 1 loss = T.mean( T.sqr(T.maximum(0., 1. - target * train_output)) * np.asarray([fp_multiplier, 1])) err = T.mean(T.neq(T.argmax(train_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) train_1_when_0 = T.sum(T.gt(T.argmax(train_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) # face = 0, bg = 1 : fn train_0_when_1 = T.sum(T.lt(T.argmax(train_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) # fp # the T.invert function seems to react differently depending on theano versions... train_0_when_0 = T.sum(T.invert( T.or_(T.argmax(train_output, axis=1), T.argmax(target, axis=1))), dtype=theano.config.floatX) # if this does not work, try # train_0_when_0 = batch_size - T.sum(T.or_(T.argmax(train_output,axis=1),T.argmax(target,axis=1))),dtype=theano.config.floatX) train_precision = train_0_when_0 / (train_0_when_0 + train_0_when_1 ) # TP/(TP+FP) train_recall = train_0_when_0 / (train_0_when_0 + train_1_when_0 ) # TP/(TP+FN) if binary: # W updates W = lasagne.layers.get_all_params(cnn, binary=True) W_grads = binary_net.compute_grads(loss, cnn) updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W,
def __init__(self, model, state, data):
    """
    :type model: groundhog model class
    :param model: class depicting the model to be optimized

    :type state: dictionary or jobman DD object
    :param state: dictionary containing various hyper-parameters. The
        class will write into this dictionary updates like the current
        training error and so on

    :type data: groundhog dataset object
    :param data: data iterator over which training is done
    """
    #####################################
    # Step 0. Constructs shared variables
    #####################################
    bs = state['bs']
    self.model = model
    self.rng = numpy.random.RandomState(state['seed'])
    srng = RandomStreams(self.rng.randint(213))
    # Shared gradient buffers, one per model parameter.
    self.gs = [
        theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                  dtype=theano.config.floatX),
                      name=p.name) for p in model.params
    ]
    self.step = 0
    self.bs = bs
    self.state = state
    self.data = data
    self.step_timer = time.time()
    # Shared input buffers (minimal placeholder shapes; refilled per batch).
    self.gdata = [
        theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                      name=x.name) for x in model.inputs
    ]

    if 'profile' not in self.state:
        self.state['profile'] = 0

    ###################################
    # Step 1. Compile training function
    ###################################
    print 'Constructing grad function'
    loc_data = self.gdata
    lr = TT.scalar('lr')
    self.prop_exprs = [x[1] for x in model.properties]
    self.prop_names = [x[0] for x in model.properties]
    self.update_rules = [x[1] for x in model.updates]
    # Clone the graph so symbolic inputs read from the shared buffers.
    rval = theano.clone(model.param_grads + self.update_rules + \
                        self.prop_exprs + [model.train_cost],
                        replace=zip(model.inputs, loc_data))
    nparams = len(model.params)
    nouts = len(self.prop_exprs)
    nrules = len(self.update_rules)
    # Split the cloned outputs back into gradients / rules / properties.
    gs = rval[:nparams]
    rules = rval[nparams:nparams + nrules]
    outs = rval[nparams + nrules:]

    # Squared global gradient norm over the clippable parameters.
    norm_gs = sum(
        TT.sum(x**2) for x, p in zip(gs, self.model.params)
        if p not in self.model.exclude_params_for_norm)
    if 'cutoff' in state and state['cutoff'] > 0:
        c = numpy.float32(state['cutoff'])
        if state['cutoff_rescale_length']:
            # Scale the cutoff with the sequence length of the first input.
            c = c * TT.cast(loc_data[0].shape[0], 'float32')

        notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
        _gs = []
        for g, p in zip(gs, self.model.params):
            if p not in self.model.exclude_params_for_norm:
                # NOTE(review): clipping compares the SQUARED norm against
                # `c` and rescales by c / norm_gs — confirm this matches the
                # intended cutoff semantics used elsewhere in groundhog.
                tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                _gs.append(
                    TT.switch(notfinite, numpy.float32(.1) * p, tmpg))
            else:
                _gs.append(g)
        gs = _gs

    # train_fn stores gradients into self.gs and applies the model's own
    # update rules; parameters are updated separately by update_fn.
    store_gs = [(s, g) for s, g in zip(self.gs, gs)]
    updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]
    print 'Compiling grad function'
    st = time.time()
    self.train_fn = theano.function([],
                                    outs,
                                    name='train_function',
                                    updates=updates,
                                    givens=zip(model.inputs, loc_data),
                                    profile=self.state['profile'])
    print 'took', time.time() - st

    self.lr = numpy.float32(state['lr'])
    # SGD step using the stored gradients, per-parameter grad scaling.
    new_params = [
        p - s * lr * g
        for s, p, g in zip(model.params_grad_scale, model.params, self.gs)
    ]
    self.update_fn = theano.function([lr], [],
                                     name='update_function',
                                     allow_input_downcast=True,
                                     updates=zip(model.params, new_params),
                                     profile=self.state['profile'])

    self.old_cost = 1e20
    self.schedules = model.get_schedules()
    self.return_names = self.prop_names + \
        ['cost', 'time_step', 'whole_time', 'lr']