def marginalize_over_v_z(self, h):
    """Build the symbolic negative energy of ``h`` with the other units summed out.

    The result is ``-(bias_term + softplus_term)`` where ``bias_term`` sums
    ``h_i * b_i`` minus the ``self.penalty`` term over the hidden units and
    ``softplus_term`` sums ``log(1 + exp(h . W + c))``. NaN entries produced
    by NaN values in ``h`` are replaced by zero before summing.

    :param h: symbolic hidden-state matrix (presumably one row per sample --
        TODO confirm against the caller)
    :returns: symbolic column of marginalised energies
    :raises NameError: if ``self.penalty`` is neither ``"softplus_bi"`` nor
        ``"softplus0"``
    """
    # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})
    # In theory we should simply use
    #     energy = (h * self.b).T
    # However, when there is broadcasting, the Theano element-wise
    # multiplication between np.NaN and 0 is 0 instead of np.NaN, so
    # T.tensordot and T.diagonal are used as a workaround.
    # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
    energy = T.tensordot(h, self.b, axes=0)
    energy = T.diagonal(energy, axis1=1, axis2=2).T

    if self.penalty == "softplus_bi":
        energy = energy - self.beta * T.log(1 + T.exp(self.b))[:, None]
    elif self.penalty == "softplus0":
        # log(1 + e^0) is a 0-d tensor: it cannot be indexed with [:, None]
        # (that raised at graph-construction time), so let it broadcast.
        energy = energy - self.beta * T.log(1 + T.exp(0))
    else:
        raise NameError("Invalid penalty term")

    energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
    energy = T.sum(energy, axis=0, keepdims=True).T

    # Same NaN-preserving outer-product/diagonal trick for the h . W term.
    ener = T.tensordot(h, self.W, axes=0)
    ener = T.diagonal(ener, axis1=1, axis2=2)
    ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
    ener = T.sum(ener, axis=2) + self.c[None, :]
    ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

    return -(energy + ener)
def updates(self, cost):
    """Build the symbolic update list for one adaptive-rate step on ``self.param``.

    Moving averages of the gradient, the squared gradient and the absolute
    Hessian diagonal are refreshed with memory ``self.tau``. The element-wise
    rate is ``g_avg**2 / (v_avg * h_avg)``; a NaN/inf rate falls back to
    ``self.learning_rate`` and a NaN/inf memory size to the current
    ``self.tau``.

    :param cost: symbolic cost differentiated with respect to ``self.param``
    :returns: list of ``(variable, new_expression)`` pairs
    """
    gradient = T.grad(cost, self.param)
    curvature = hessian_diagonal(cost, self.param, grad=gradient)

    # Memory constants: weight given to the new sample and to the history.
    fresh_weight = 1.0 / self.tau
    history_weight = 1.0 - fresh_weight

    # Refreshed moving averages (gradient, squared gradient, |hessian diag|).
    g_avg_new = history_weight * self.g_avg + fresh_weight * gradient
    v_avg_new = history_weight * self.v_avg + fresh_weight * gradient**2
    h_avg_new = history_weight * self.h_avg + fresh_weight * T.abs_(curvature)

    # Element-wise learning rate, guarded against division blow-ups.
    rate_unsafe = (g_avg_new ** 2) / (v_avg_new * h_avg_new)
    rate_is_bad = T.isinf(rate_unsafe) | T.isnan(rate_unsafe)
    rate = T.switch(rate_is_bad, self.learning_rate, rate_unsafe)

    # Memory size for the next step, guarded the same way.
    tau_unsafe = (1 - (g_avg_new ** 2) / v_avg_new) * self.tau + 1
    tau_is_bad = T.isnan(tau_unsafe) | T.isinf(tau_unsafe)
    tau_new = T.switch(tau_is_bad, self.tau, tau_unsafe)

    return [
        (self.g_avg, g_avg_new),
        (self.v_avg, v_avg_new),
        (self.h_avg, h_avg_new),
        (self.tau, tau_new),
        (self.last_grad, gradient),
        (self.last_grad2, curvature),
        (self.last_rate, rate),
        (self.param, self.param - rate * gradient),
    ]
def from_partial(self, X, dX):
    """Project partial derivatives ``dX`` relative to the factor triple ``X``.

    NaN/inf entries of ``dU`` and ``dV`` are zeroed, and the reciprocal of
    ``S`` is zeroed where ``|S| <= 1e-10`` or where the matching column of
    ``dU`` / row of ``dV`` contained a non-finite value.

    :param X: triple ``(U, S, V)`` (assumed to be SVD-like factors -- TODO confirm)
    :param dX: triple ``(dU, dS, dV)`` of partial derivatives
    :returns: tuple ``(dU.S^+ - U.dS, dS, S^+.dV - dS.V)``
    """
    eps = 1e-10
    U, S, V = X
    dU, dS, dV = dX

    def _non_finite(values):
        # 1 where the entry is NaN or inf, 0 elsewhere.
        return 1 - (1 - tensor.isnan(values)) * (1 - tensor.isinf(values))

    u_bad = _non_finite(dU)
    v_bad = _non_finite(dV)
    # A singular value is flagged when any entry of its dU column or of its
    # dV row is non-finite (U is reduced over columns, V over rows).
    s_bad = 1 - tensor.prod(1 - u_bad, axis=0) * tensor.prod(1 - v_bad, axis=1)

    sigma = tensor.diag(S)
    dU = tensor.set_subtensor(dU[u_bad.nonzero()], 0.0)

    # Guarded reciprocal of the singular values.
    sigma_inv = tensor.switch(tensor.gt(abs(sigma), eps), 1.0 / sigma, 0.0)
    sigma_inv = tensor.set_subtensor(sigma_inv[s_bad.nonzero()], 0.0)
    sigma_inv = tensor.diag(sigma_inv)

    dV = tensor.set_subtensor(dV[v_bad.nonzero()], 0.0)

    left = dU.dot(sigma_inv)
    middle = dS
    right = sigma_inv.dot(dV)
    return (left - U.dot(middle), middle, right - (middle.dot(V)))
def get_f_scores(self):
    """Build symbolic F-score, precision and recall, reduced over axis 0.

    Predictions come from ``self.get_predictions(0.5)`` and are compared with
    ``self.x_as_int``. Recall is NaN where there are no actual positives;
    precision is NaN where there are neither actual nor predicted positives;
    the F-score is NaN wherever either of them is NaN.

    :returns: tuple ``(f_scores, precisions, recalls)``
    """
    nan = float('nan')
    predicted = self.get_predictions(0.5)  # 0.5 is an arbitrary threshold
    actual = self.x_as_int

    true_pos = T.sum(predicted & actual, axis=0)
    pos = T.sum(actual, axis=0)
    predicted_pos = T.sum(predicted, axis=0)

    no_actual = T.eq(pos, 0)
    no_predicted = T.eq(predicted_pos, 0)

    # Recall is undefined without actual positives: the numerator becomes NaN
    # and the denominator 1, so no division by zero occurs.
    recalls = T.switch(no_actual, nan, true_pos) / T.switch(no_actual, 1., pos)

    # Precision is only undefined when nothing was predicted AND nothing was
    # there to find; otherwise a zero denominator is replaced by 1.
    safe_predicted_pos = T.switch(no_predicted, 1., predicted_pos)
    precisions = T.switch(no_predicted & no_actual, nan, true_pos / safe_predicted_pos)

    pr_sum = precisions + recalls
    harmonic = 2. * precisions * recalls / T.switch(pr_sum > 0, pr_sum, 1.)
    f_scores = T.switch(T.isnan(precisions) | T.isnan(recalls), nan, harmonic)

    return f_scores, precisions, recalls
def updates(self, cost):
    """Return the symbolic updates of one adaptive learning-rate step.

    The running means of the gradient (``g_avg``), its square (``v_avg``) and
    the absolute Hessian diagonal (``h_avg``) are blended with the current
    values using weight ``1 / self.tau``. The step size is
    ``g_avg**2 / (v_avg * h_avg)``, replaced by ``self.learning_rate`` when it
    is NaN or infinite; the new ``tau`` keeps its old value in that case.

    :param cost: symbolic cost to differentiate with respect to ``self.param``
    :returns: list of ``(variable, new_expression)`` pairs
    """
    def _blend(old_average, new_sample):
        # Moving average with memory size ``self.tau``.
        tau_rec = 1.0 / self.tau
        return (1.0 - tau_rec) * old_average + tau_rec * new_sample

    def _non_finite(value):
        return T.isinf(value) | T.isnan(value)

    grad = T.grad(cost, self.param)
    grad2 = hessian_diagonal(cost, self.param, grad=grad)

    g_avg_new = _blend(self.g_avg, grad)
    v_avg_new = _blend(self.v_avg, grad**2)
    h_avg_new = _blend(self.h_avg, T.abs_(grad2))

    rate_unsafe = (g_avg_new**2) / (v_avg_new * h_avg_new)
    rate = T.switch(_non_finite(rate_unsafe), self.learning_rate, rate_unsafe)

    tau_unsafe = (1 - (g_avg_new**2) / v_avg_new) * self.tau + 1
    tau_new = T.switch(_non_finite(tau_unsafe), self.tau, tau_unsafe)

    state_updates = [
        (self.g_avg, g_avg_new),
        (self.v_avg, v_avg_new),
        (self.h_avg, h_avg_new),
        (self.tau, tau_new),
    ]
    diagnostics = [
        (self.last_grad, grad),
        (self.last_grad2, grad2),
        (self.last_rate, rate),
    ]
    return state_updates + diagnostics + [(self.param, self.param - rate * grad)]
def scaled_cost(x, t):
    """Average the mean squared errors of the above- and below-threshold targets.

    Entries are split by ``t > THRESHOLD`` versus ``t <= THRESHOLD``; each
    group's mean squared error counts as 0 when it is NaN (e.g. the group is
    empty), and the two means are averaged with equal weight.

    :param x: symbolic predictions
    :param t: symbolic targets
    :returns: symbolic scalar cost
    """
    sq_error = (x - t) ** 2

    def _group_mean(mask):
        # Mean squared error over the selected entries, 0 if undefined.
        group_mean = sq_error[mask.nonzero()].mean()
        return ifelse(T.isnan(group_mean), 0.0, group_mean)

    above_thresh_mean = _group_mean(t > THRESHOLD)
    below_thresh_mean = _group_mean(t <= THRESHOLD)
    return (above_thresh_mean + below_thresh_mean) / 2.0
def __init__(self, labels, g=0.1, m=0.01, feature_dimension=128, n_codewords=16, n_feature_samples=100, eta=0.01):
    """
    Set up the soft-BoW model, its entropy objective and the compiled
    loss/training functions.

    The objects must be in the same order as ``labels`` when the fit function
    is called.

    :param labels: labels of the objects used for the optimization
    :param g: BoW quantization parameter
    :param m: entropy softness parameter
    :param feature_dimension: dimension of the extracted feature vectors
    :param n_codewords: number of codewords in the dictionary
    :param n_feature_samples: number of feature vectors to use in each iteration
    :param eta: learning rate
    """
    SoftBoW.__init__(self, g=g, feature_dimension=feature_dimension, n_codewords=n_codewords)

    self.entropy = SoftEntropy(m=m, labels=labels)
    self.entropy_loss = None
    self.learning_rate = eta
    self.n_feature_samples = n_feature_samples

    def _zero_nans(expression):
        # NaN entries must not reach the optimiser.
        return T.switch(T.isnan(expression), 0, expression)

    # Symbolic histograms and the entropy loss defined on them.
    self.S = self._sym_histograms(self.X)
    self.entropy_loss = self.entropy._sym_entropy(self.S)

    # Compiled loss evaluation.
    self.calculate_loss_theano = theano.function([self.X], self.entropy_loss)

    # Gradient w.r.t. the dictionary V, obtained through the NaN-cleaned
    # gradient w.r.t. the histograms.
    histogram_grad = _zero_nans(T.grad(self.entropy_loss, self.S))
    dictionary_grad = _zero_nans(
        T.grad(self.entropy._sym_entropy(self.S), self.V,
               known_grads={self.S: histogram_grad}))

    # Compiled training step.
    self.updates = adam([dictionary_grad], [self.V], learning_rate=self.learning_rate)
    self.train_theano = theano.function(inputs=[self.X], outputs=[self.entropy_loss],
                                        updates=self.updates)
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Build SGD updates with Nesterov momentum.

    For each parameter the new velocity is ``mu * v - lr * g`` and the new
    value is ``p - mu * v + (1 + mu) * new_v``. If either result contains a
    NaN or inf, both the parameter and its velocity keep their old value.

    :returns: list of ``(variable, new_expression)`` pairs, parameter first,
        then its velocity
    """
    def _contains_non_finite(value):
        return T.any(T.isnan(value) + T.isinf(value))

    updates = []
    for param, grad, velocity in zip(param_list, gradients, velocities):
        next_velocity = mu * velocity - lr * grad
        next_param = param - mu * velocity + (1 + mu) * next_velocity
        skip_step = _contains_non_finite(next_param) + _contains_non_finite(next_velocity)
        updates.extend([
            (param, ifelse(skip_step, param, next_param)),
            (velocity, ifelse(skip_step, velocity, next_velocity)),
        ])
    return updates
def to_weight(d, m, p, prior):
    """Build per-entry weights from masked mean products of ``dwe`` rows.

    ``dwe`` is taken from the enclosing scope and ``p`` is not used. Shape
    annotations (mw / ms) are carried over from the original author and are
    not verifiable from this block.

    :param d: index array into ``dwe``
    :param m: mask, summed over axis 1 to obtain the normalising counts
    :param p: unused
    :param prior: factor multiplied into the result
    :returns: symbolic weights with NaN/inf entries replaced by 0
    """
    scores = T.tensordot(dwe[d, :], dwe.T, axes=1)[:, :, d]      # mw x ms x mw x ms
    counts = T.sum(m, axis=1).dimshuffle('x', 'x', 0)            # 1 x 1 x mw
    masked = scores * m.dimshuffle('x', 'x', 0, 1)
    mean_scores = T.sum(masked, axis=3) / counts                 # mw x ms x mw
    # Empty rows divide by zero; treat their mean as 0 before exponentiating.
    mean_scores = T.switch(T.isnan(mean_scores), 0, mean_scores)
    logit = T.prod(T.exp(10 * mean_scores), axis=2) * prior      # mw x ms
    # Normaliser of the currently disabled ``(logit * m) / sm`` step.
    sm = T.sum(logit * m, axis=1, keepdims=True)                 # mw x 1
    is_bad = T.or_(T.isnan(logit), T.isinf(logit))
    return T.switch(is_bad, 0, logit)
def predict_logK(self, x, z, params):
    """Return the log kernel of ``x`` with itself and of ``x`` with ``z``.

    Inputs are flattened, cast to int32 and used as row indices into
    ``self.unit(params)``. When ``self.conditional`` is set, NaN inputs are
    first mapped to the index ``self.n_idxs - 1``.

    :returns: tuple ``(log K(x, x), log K(x, z))``
    """
    if self.conditional:
        s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x)
        s_z = TT.switch(TT.isnan(z), self.n_idxs - 1, z)
    else:
        s_x, s_z = x, z

    P_unit = self.unit(params)
    rows_x = P_unit[s_x.flatten().astype('int32')]
    rows_z = P_unit[s_z.flatten().astype('int32')]

    K = TT.dot(rows_x, rows_x.T)
    K_new = TT.dot(rows_x, rows_z.T)
    return TT.log(K), TT.log(K_new)
def get_clip_sgd_updates(self, params, cost, learning_rate, momentum, rescale=5.):
    """Build momentum-SGD updates with global gradient-norm clipping.

    Gradients are rescaled so that their global L2 norm does not exceed
    ``rescale``. If that norm is NaN or infinite, each gradient is replaced
    by ``0.1 * param``.

    :param params: list of variables to update
    :param cost: symbolic cost to differentiate
    :param learning_rate: step size
    :param momentum: momentum coefficient
    :param rescale: maximum allowed global gradient norm
    :returns: ``OrderedDict`` mapping each parameter to its new expression
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    if not hasattr(self, "momentum_velocity_"):
        self.momentum_velocity_ = [0.] * len(gparams)

    # Global L2 norm of all gradients. It is already a square root: the
    # previous code took a second square root, so clipping compared
    # ``rescale`` against the fourth root of the sum of squares.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    for n, (param, gparam) in enumerate(zip(params, gparams)):
        # Clip the gradient directly, not the momentum term.
        gparam = T.switch(not_finite, 0.1 * param,
                          gparam * (scaling_num / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = momentum * velocity - learning_rate * gparam
        # NOTE(review): this stores a symbolic expression rather than a
        # shared variable, so the velocity is not carried between calls of
        # the compiled function -- confirm whether that is intended.
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4, alpha=0.9, epsilon=1e-4, chi=0.95):
    r"""
    Alex Graves' RMSProp [1]_.

    .. math ::
        n_{i} &= \chi * n_{i-1} + (1 - \chi) * grad^{2}\\
        g_{i} &= \chi * g_{i-1} + (1 - \chi) * grad\\
        \Delta_{i} &= \alpha * \Delta_{i-1} - learning\_rate * grad /
                      sqrt(n_{i} - g_{i}^{2} + \epsilon)\\
        w_{i} &= w_{i-1} + \Delta_{i}

    If the global gradient norm is NaN or infinite, every gradient is
    replaced by ``0.1 * param`` before the averages are updated.

    Parameters
    ----------
    params : list of variables to update
    grads : list of symbolic gradients, aligned with ``params``
    learning_rate, alpha, epsilon, chi : scalars used as in the formulas above

    Returns
    -------
    list of ``(variable, new_expression)`` pairs covering
    ``self.running_square_``, ``self.running_avg_``, ``self.memory_`` and
    the parameters.

    References
    ----------
    .. [1] Graves, Alex.
           "Generating Sequences With Recurrent Neural Networks", p.23
           arXiv:1308.0850
    """
    updates = []
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))

    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param, grad)

        old_square = self.running_square_[n]
        old_avg = self.running_avg_[n]
        old_memory = self.memory_[n]

        new_square = chi * old_square + (1. - chi) * grad ** 2
        new_avg = chi * old_avg + (1. - chi) * grad
        # The denominator is the running estimate of the gradient's standard
        # deviation (second moment minus squared mean), plus epsilon.
        new_memory = alpha * old_memory - learning_rate * grad / T.sqrt(
            new_square - new_avg ** 2 + epsilon)

        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((old_memory, new_memory))
        updates.append((param, param + new_memory))
    return updates
def compute_step(self, param, previous_step):
    """Swap a step whose L2 norm is NaN/inf for ``self.scaler * param``.

    :param param: the parameter the step applies to
    :param previous_step: the incoming symbolic step
    :returns: tuple ``(step, [])``; the empty list means no extra updates
    """
    step_norm = l2_norm([previous_step])
    norm_is_nan = tensor.isnan(step_norm)
    norm_is_inf = tensor.isinf(step_norm)
    fallback = self.scaler * param
    guarded_step = tensor.switch(tensor.or_(norm_is_nan, norm_is_inf),
                                 fallback, previous_step)
    return guarded_step, []
def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999, epsilon=1e-8, grads=None):
    """Build Adam updates with gradient-norm clipping and a numerical guard.

    Gradients are multiplied by ``theanotools.clipping_multiplier(norm,
    max_norm)`` and then replaced by zero altogether when the (pre-clipping)
    norm is NaN, infinite, negative or larger than ``1e10``.

    :param loss: symbolic loss, differentiated w.r.t. ``self.trainables``
        when ``grads`` is not supplied
    :param lr: base learning rate
    :param grads: optional precomputed gradients
    :returns: tuple ``(norm, grads, updates)`` with the unclipped gradient
        norm, the clipped/guarded gradients and the update list
    """
    if grads is None:
        grads = tensor.grad(loss, self.trainables)

    # Clipping by the global L2 norm.
    norm = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
    multiplier = theanotools.clipping_multiplier(norm, max_norm)
    grads = [multiplier * g for g in grads]

    # Safeguard against numerical instability: drop the whole step.
    norm_not_finite = tensor.or_(tensor.isnan(norm), tensor.isinf(norm))
    norm_out_of_range = tensor.or_(norm < 0, norm > 1e10)
    unstable = tensor.or_(norm_not_finite, norm_out_of_range)
    grads = [tensor.switch(unstable, np.float32(0), g) for g in grads]

    # Bias-corrected step size and new moment estimates.
    t = self.time + 1
    lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
    means_t = [beta1 * mean + (1. - beta1) * g
               for g, mean in zip(grads, self.means)]
    vars_t = [beta2 * var + (1. - beta2) * tensor.sqr(g)
              for g, var in zip(grads, self.vars)]
    steps = [lr_t * mean_t / (tensor.sqrt(var_t) + epsilon)
             for mean_t, var_t in zip(means_t, vars_t)]

    # Parameters first, then the optimiser state, then the time step.
    updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
    updates += list(zip(self.means, means_t))
    updates += list(zip(self.vars, vars_t))
    updates += [(self.time, t)]

    return norm, grads, updates
def __init__(self, n_comp=10, verbose=False):
    """Create the shared weights/bias and compile the Theano helper functions.

    ``self.w_up_fun(x_white, lrate, block)`` applies one weight/bias update
    and returns the maximum weight and a NaN flag, both computed on the
    weights *before* the update. ``self.cov_fun(matrix, block)`` returns
    ``matrix . matrix^T / block``.

    :param n_comp: number of components; weights start as the identity of
        that size and the bias as a column of ones
    :param verbose: stored on the instance
    """
    # Shared state.
    self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
    self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

    # Symbolic inputs of the update function.
    x_white = T.fmatrix()
    lrate = T.fscalar()
    block = T.fscalar()

    unmixed = T.dot(self.T_weights, x_white) + T.addbroadcast(self.T_bias, 1)
    logit = 1 - 2 / (1 + T.exp(-unmixed))

    weight_step = T.dot(
        block * T.identity_like(self.T_weights) + T.dot(logit, T.transpose(unmixed)),
        self.T_weights)
    new_weights = self.T_weights + lrate * weight_step
    new_bias = self.T_bias + lrate * T.reshape(logit.sum(axis=1), (-1, 1))

    # Monitoring outputs.
    max_weight = T.max(self.T_weights)
    has_nan = T.any(T.isnan(self.T_weights))

    self.w_up_fun = theano.function(
        [x_white, lrate, block],
        [max_weight, has_nan],
        updates=[(self.T_weights, new_weights), (self.T_bias, new_bias)],
        allow_input_downcast=True)

    # Covariance helper.
    matrix = T.fmatrix()
    covariance = T.dot(matrix, T.transpose(matrix)) / block
    self.cov_fun = theano.function([matrix, block], covariance,
                                   allow_input_downcast=True)

    self.loading = None
    self.sources = None
    self.weights = None
    self.n_comp = n_comp
    self.verbose = verbose
def rmsprop(params, cost=None, gradients=None, learningrate=0.0005, rho=0.9, epsilon=1e-6):
    """Build RMSProp updates for the trainable entries of ``params``.

    Either ``cost`` or ``gradients`` must be given. Gradients derived from
    ``cost`` are replaced by zeros when the cost itself is NaN. Parameters
    whose ``'trainable'`` baggage is false are skipped.

    :returns: list of ``(variable, new_expression)`` pairs: for every
        trainable parameter its accumulator followed by the parameter itself
    """
    # Validate input
    assert not (cost is None and gradients is None), (
        "Update function rmsprop requires either a cost scalar or a "
        "list of gradients.")

    if gradients is None and cost is not None:
        raw_grads = T.grad(cost, wrt=params)
        cost_is_nan = T.isnan(cost)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(cost_is_nan, T.zeros_like(raw), raw) for raw in raw_grads]
    else:
        dC = gradients

    updates = []
    for param, dparam in zip(params, dC):
        trainable = netutils.getbaggage(param, 'trainable', True)
        if trainable:
            # Running average of the squared gradient, one per parameter.
            acc = th.shared(np.zeros(param.get_value().shape, dtype=th.config.floatX))
            newacc = rho * acc + (1 - rho) * dparam ** 2
            scaled_grad = dparam / T.sqrt(newacc + epsilon)

            updates.append((acc, newacc))
            updates.append((param, param - learningrate * scaled_grad))

    return updates
def nan_shield(parameters, deltas, other_updates):
    """Guard a set of updates against NaN/inf deltas.

    If the sum of all deltas is NaN or infinite, every parameter is shrunk
    to ``0.9 * p`` and every entry of ``other_updates`` keeps its current
    value; otherwise ``p - d`` and the supplied updates are used.

    :returns: tuple ``(parameter_updates, other_updates)`` of update lists
    """
    total = sum(T.sum(delta) for delta in deltas)
    not_finite = T.isnan(total) | T.isinf(total)

    parameter_updates = []
    for param, delta in izip(parameters, deltas):
        parameter_updates.append((param, T.switch(not_finite, 0.9 * param, param - delta)))

    shielded_others = []
    for variable, new_value in other_updates:
        shielded_others.append((variable, T.switch(not_finite, variable, new_value)))

    return parameter_updates, shielded_others
def lda_logp(rt, gaze, values, error_lls, s_condition_index, s_subject_index, v_condition_index, v_subject_index, tau_condition_index, tau_subject_index, gamma_condition_index, gamma_subject_index, t0_condition_index, t0_subject_index, zerotol):
    """Return the summed log-likelihood of the error-mixture race model.

    The parameters ``v``, ``tau``, ``gamma``, ``s``, ``t0``, ``b``,
    ``p_error`` and ``subject_idx`` come from the enclosing scope. Each
    indexed parameter is looked up by (subject, condition) index and turned
    into a column. NaN/inf mixture likelihoods are set to 0 and ``zerotol``
    is added before the logarithm.
    """
    def _lookup(parameter, subject_index, condition_index):
        # Per-trial parameter value as a column.
        rows = tt.cast(subject_index, dtype='int32')
        cols = tt.cast(condition_index, dtype='int32')
        return parameter[rows, cols][:, None]

    # compute drifts
    drift = glam.components.expdrift(
        _lookup(v, v_subject_index, v_condition_index),
        _lookup(tau, tau_subject_index, tau_condition_index),
        _lookup(gamma, gamma_subject_index, gamma_condition_index),
        values, gaze, zerotol)

    glam_ll = glam.components.tt_wienerrace_pdf(
        rt[:, None],
        drift,
        _lookup(s, s_subject_index, s_condition_index),
        b,
        _lookup(t0, t0_subject_index, t0_condition_index),
        zerotol)

    # mix likelihoods
    mixed_ll = ((1 - p_error) * glam_ll + p_error * error_lls[subject_idx])
    for is_bad in (tt.isnan, tt.isinf):
        mixed_ll = tt.where(is_bad(mixed_ll), 0., mixed_ll)

    return tt.sum(tt.log(mixed_ll + zerotol))
def adadelta(cost, parameters, param_clip=0, l_r=1.0, decay=0.95, consider_constant=None):
    """
    Build Adadelta updates.

    Each element of ``parameters`` is a 4-tuple
    ``(param, update, hist_grad, hist_update)``. A gradient whose sum is NaN
    is replaced by zeros. When ``param_clip > 0`` the step itself is clipped
    to ``[-param_clip, param_clip]``.

    :returns: ``OrderedDict`` mapping every state variable and parameter to
        its new expression
    """
    eps = 1e-6
    updates_for_func = OrderedDict()

    for param, update, hist_grad, hist_update in parameters:
        grad = T.grad(cost, param, consider_constant=consider_constant)
        grad = ifelse(T.isnan(T.sum(grad)), T.zeros_like(grad), grad)

        # Running average of squared gradients, then the scaled step.
        next_hist_grad = decay * hist_grad + (1 - decay) * (grad**2)
        step = -l_r * T.sqrt(hist_update + eps) / T.sqrt(next_hist_grad + eps) * grad
        if param_clip > 0:
            step = T.clip(step, -param_clip, param_clip)

        # Running average of squared steps.
        next_hist_update = decay * hist_update + (1 - decay) * (step**2)

        # Note that the insertion order is important.
        updates_for_func[hist_grad] = next_hist_grad
        updates_for_func[update] = step
        updates_for_func[param] = param + step
        updates_for_func[hist_update] = next_hist_update

    return updates_for_func
def exe(self, mainloop):
    """
    Normalise and clip ``mainloop.grads`` in place.

    Every gradient is divided by ``self.batch_size`` and then rescaled so the
    global L2 norm does not exceed ``self.scaler``. If the squared norm is
    NaN or infinite, each gradient is replaced by ``0.1 * p``.
    """
    grads = mainloop.grads

    # Batch-average the gradients and accumulate the squared global norm.
    sq_norm = 0.
    for p, g in grads.items():
        g /= self.batch_size
        grads[p] = g
        sq_norm += (g**2).sum()

    not_finite = T.or_(T.isnan(sq_norm), T.isinf(sq_norm))
    g_norm = T.sqrt(sq_norm)
    scaler = self.scaler / T.maximum(self.scaler, g_norm)

    for p, g in grads.items():
        grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)

    mainloop.grads = grads
def exe(self, mainloop):
    """
    Normalise and clip ``mainloop.grads`` in place.

    Every gradient is divided by ``self.batch_size`` (cast to ``floatX``)
    and rescaled so the global L2 norm does not exceed ``self.scaler``.
    With ``self.check_nan`` set, a NaN/inf squared norm additionally replaces
    each gradient by ``0.1 * p``.
    """
    grads = mainloop.grads
    guard = self.check_nan

    # Batch-average the gradients and accumulate the squared global norm.
    sq_norm = 0.
    for p, g in grads.items():
        g /= T.cast(self.batch_size, dtype=theano.config.floatX)
        grads[p] = g
        sq_norm += (g**2).sum()

    if guard:
        not_finite = T.or_(T.isnan(sq_norm), T.isinf(sq_norm))

    g_norm = T.sqrt(sq_norm)
    scaler = self.scaler / T.maximum(self.scaler, g_norm)

    for p, g in grads.items():
        clipped = g * scaler
        grads[p] = T.switch(not_finite, 0.1 * p, clipped) if guard else clipped

    mainloop.grads = grads
def unet_crossentropy_loss_sampled(y_true, y_pred):
    """Class-balanced binary cross-entropy on a random subsample.

    Positions labelled 1 and 0 are shuffled independently with
    ``UNET.srng`` and truncated to the size of the smaller class, so both
    classes contribute equally. Predictions are clipped to
    ``[1e-4, 1 - 1e-4]`` before the logarithm and the resulting loss is
    clipped to the same interval.

    :param y_true: symbolic labels (values 1 and 0 are used; others ignored)
    :param y_pred: symbolic predictions, same number of elements as ``y_true``
    :returns: symbolic scalar loss; if it is NaN the mean of the sampled
        positive predictions is returned instead
    """
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)

    # Indices of each class in the flattened arrays.
    classPos = 1
    classNeg = 0
    indPos = T.eq(y_true, classPos).nonzero()[0]
    indNeg = T.eq(y_true, classNeg).nonzero()[0]

    # Shuffle each class independently.
    n = indPos.shape[0]
    indPos = indPos[UNET.srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[UNET.srng.permutation(n=n)]

    # Take an equal number of samples, limited by the smaller class.
    n_samples = T.cast(T.min([indPos.shape[0], indNeg.shape[0]]), dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]

    # Binary cross-entropy: -log(p) for positives and -log(1 - p) for
    # negatives. The negative term previously used log(p), which rewards
    # predicting 1 on negative samples.
    loss_vector = (-T.mean(T.log(y_pred_clipped[indPos]))
                   - T.mean(T.log(1 - y_pred_clipped[indNeg])))
    # NOTE(review): clipping the loss itself to [epsilon, 1 - epsilon] is kept
    # from the original; it zeroes the gradient once the loss exceeds the
    # upper bound -- confirm that this is intended.
    loss_vector = T.clip(loss_vector, epsilon, 1.0 - epsilon)
    average_loss = T.mean(loss_vector)

    # The NaN test has to be part of the graph: a Python ``if`` on a symbolic
    # variable is evaluated once at graph-construction time and is truthy, so
    # the fallback used to be returned unconditionally.
    fallback = T.mean(y_pred_clipped[indPos])
    return T.switch(T.isnan(average_loss), fallback, average_loss)
def lda_logp(rt, gaze, values, error_ll, v_index, tau_index, gamma_index, s_index, t0_index, is_multiplicative, zerotol):
    """Return the element-wise log-likelihood of the error-mixture race model.

    The parameters ``v``, ``tau``, ``gamma``, ``s``, ``t0``, ``b`` and
    ``p_error`` come from the enclosing scope; each indexed parameter is read
    from row 0 at the given index and turned into a column.
    ``is_multiplicative`` selects the multiplicative or additive drift at run
    time. NaN/inf mixture likelihoods are set to 0 and ``zerotol`` is added
    before the logarithm.
    """
    def _column(parameter, index):
        return parameter[0, tt.cast(index, dtype='int32')][:, None]

    # Both drift variants are built from the same arguments; ``ifelse``
    # selects the one to evaluate.
    drift_args = (
        _column(v, v_index),
        _column(tau, tau_index),
        _column(gamma, gamma_index),
        values,
        gaze,
        zerotol,
    )
    drift = ifelse(
        is_multiplicative,
        glam.components.tt_drift_multiplicative(*drift_args),
        glam.components.tt_drift_additive(*drift_args))

    glam_ll = glam.components.tt_wienerrace_pdf(
        rt[:, None],
        drift,
        _column(s, s_index),
        b,
        _column(t0, t0_index),
        zerotol)

    # mix likelihoods
    mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll)
    for is_bad in (tt.isnan, tt.isinf):
        mixed_ll = tt.where(is_bad(mixed_ll), 0., mixed_ll)

    return tt.log(mixed_ll + zerotol)
def theano_digitize(x, bins):
    """
    Symbolic counterpart of ``numpy.digitize``.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bin edges with which x should be digitized

    Returns
    -------
    A Theano tensor
        For each entry of x, the index ``i`` with ``bins[i-1] <= x < bins[i]``
        (``0`` below the first edge). Entries at or above the last edge, and
        NaN entries, get ``len(bins)``.
    """
    n_edges = len(bins)
    # Start from the overflow index and overwrite it bin by bin.
    binned = T.zeros_like(x) + n_edges
    for i, upper in enumerate(bins):
        in_bin = T.lt(x, upper)
        if i > 0:
            in_bin = T.and_(T.ge(x, bins[i - 1]), T.lt(x, upper))
        binned = T.switch(in_bin, i, binned)
    return T.switch(T.isnan(x), n_edges, binned)
def minimize(self, loss, momentum, rescale):
    """Build RMSProp updates with Nesterov-style momentum and norm clipping.

    The parent ``minimize`` is called first; gradients are then read from
    ``self.gradparams``. They are rescaled so that their global L2 norm does
    not exceed ``rescale``; a NaN/inf norm replaces each gradient by
    ``0.1 * param``.

    :param loss: symbolic loss forwarded to the parent ``minimize``
    :param momentum: momentum coefficient
    :param rescale: maximum allowed global gradient norm
    :returns: list of ``(variable, new_expression)`` pairs
    """
    super(RMSPropOptimizer, self).minimize(loss)
    grads = self.gradparams

    # Global L2 norm of all gradients. It is already a square root: the
    # previous code took a second square root, so clipping compared
    # ``rescale`` against the fourth root of the sum of squares.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4

    updates = []
    params = self.params
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))

        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad

        # Running standard deviation of the gradient, floored at minimum_grad.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)

        memory = self.memory_[n]
        update = momentum * memory - self.lr * grad / rms_grad
        # Look-ahead step actually applied to the parameter.
        update2 = momentum * momentum * memory - (
            1 + momentum) * self.lr * grad / rms_grad

        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize, built as a Theano graph.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
        Values not below any edge, and NaN values, map to ``len(bins)``.
    """
    overflow = len(bins)
    binned = T.zeros_like(x) + overflow

    lower = None
    for index, upper in enumerate(bins):
        if lower is None:
            # First bin: everything strictly below the first edge.
            selected = T.lt(x, upper)
        else:
            selected = T.and_(T.ge(x, lower), T.lt(x, upper))
        binned = T.switch(selected, index, binned)
        lower = upper

    # NaN compares false everywhere; force it into the overflow bin.
    binned = T.switch(T.isnan(x), overflow, binned)
    return binned
def compute_step(self, parameter, previous_step):
    """Swap a step whose sum is NaN/inf for ``(1 - self.scaler) * parameter``.

    :param parameter: the parameter the step applies to
    :param previous_step: the incoming symbolic step
    :returns: tuple ``(step, [])``; the empty list means no extra updates
    """
    total = tensor.sum(previous_step)
    # Sum of the two indicator flags; positive when the total is NaN or inf.
    bad_flags = tensor.isnan(total) + tensor.isinf(total)
    fallback = (1 - self.scaler) * parameter
    guarded_step = tensor.switch(bad_flags > 0, fallback, previous_step)
    return guarded_step, []
def updates(self, params, grads, learning_rate, momentum, rescale=5.):
    """Build RMSProp updates with Nesterov-style momentum and norm clipping.

    Gradients are rescaled so that their global L2 norm does not exceed
    ``rescale``; a NaN/inf norm replaces each gradient by ``0.1 * param``.

    :param params: list of variables to update
    :param grads: list of symbolic gradients, aligned with ``params``
    :param learning_rate: step size
    :param momentum: momentum coefficient
    :param rescale: maximum allowed global gradient norm
    :returns: list of ``(variable, new_expression)`` pairs
    """
    # Global L2 norm of all gradients. It is already a square root: the
    # previous code took a second square root, so clipping compared
    # ``rescale`` against the fourth root of the sum of squares.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4

    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))

        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (1. - combination_coeff) * grad

        # Running standard deviation of the gradient, floored at minimum_grad.
        rms_grad = T.sqrt(new_square - new_avg**2)
        rms_grad = T.maximum(rms_grad, minimum_grad)

        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        # Look-ahead step actually applied to the parameter.
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad

        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Build Adam updates with gradient clipping and a NaN/inf guard.

    ``b1`` and ``b2`` weight the *new* gradient information (the old moments
    are weighted by ``1 - b1`` and ``1 - b2``). Gradients are scaled down
    when the square root of ``norm_gs(params, grads)`` reaches
    ``max_magnitude``; if that value is NaN or infinite, each gradient is
    replaced by ``infDecay * p``.

    :returns: tuple ``(updates, norm)`` with the update list and the value
        returned by ``norm_gs``
    """
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)

    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    clip_factor = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    # Step counter and the bias-corrected learning rate.
    step = shared(floatX(0.0))
    next_step = step + 1.0
    correction1 = 1.0 - (1.0 - b1) ** next_step
    correction2 = 1.0 - (1.0 - b2) ** next_step
    lr_t = lr * (T.sqrt(correction2) / correction1)

    updates = []
    for param, grad in zip(params, grads):
        grad = T.switch(not_finite, infDecay * param, grad * clip_factor)

        # Per-parameter first and second moment accumulators.
        first_moment = shared(param.get_value() * 0.0)
        second_moment = shared(param.get_value() * 0.0)
        first_new = (b1 * grad) + ((1.0 - b1) * first_moment)
        second_new = (b2 * T.sqr(grad)) + ((1.0 - b2) * second_moment)

        direction = first_new / (T.sqrt(second_new) + e)
        updates.extend([
            (first_moment, first_new),
            (second_moment, second_new),
            (param, param - (lr_t * direction)),
        ])

    updates.append((step, next_step))
    return updates, norm
def compute_step(self, param, previous_step):
    """Swap a step containing any NaN/inf entry for ``self.scaler * param``.

    :param param: the parameter the step applies to
    :param previous_step: the incoming symbolic step
    :returns: tuple ``(step, [])``; the empty list means no extra updates
    """
    entry_is_bad = tensor.or_(tensor.isnan(previous_step),
                              tensor.isinf(previous_step))
    any_bad = tensor.any(entry_is_bad)
    fallback = self.scaler * param
    return tensor.switch(any_bad, fallback, previous_step), []
def get_gradients(self, model, data, ** kwargs):
    """Return the cost gradients for all model parameters, optionally clipped.

    With ``self.gradient_clipping`` set, gradients are scaled down when their
    global L2 norm reaches ``self.max_magnitude``; a NaN/inf squared norm
    replaces each gradient by ``.1 * param``.

    :returns: tuple ``(gradients, updates)`` where ``gradients`` is an
        ``OrderedDict`` from parameter to gradient and ``updates`` is an
        empty ``OrderedDict``
    """
    cost = self.expr(model=model, data=data, **kwargs)

    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))

    if self.gradient_clipping:
        sq_norm = sum(((grad ** 2).sum() for grad in gradients.values()), 0.)
        not_finite = T.or_(T.isnan(sq_norm), T.isinf(sq_norm))
        grad_norm = T.sqrt(sq_norm)
        # Multiplier that brings the norm back to max_magnitude when exceeded.
        clip_factor = T.switch(T.ge(grad_norm, self.max_magnitude),
                               self.max_magnitude / grad_norm, 1.)

        for param, grad in gradients.items():
            gradients[param] = T.switch(not_finite, .1 * param, grad * clip_factor)

    return gradients, OrderedDict()
def sym_entropy(self, S, mapping):
    """
    Defines the symbolic calculation of the soft entropy.

    Memberships ``Q`` are the softmax of ``-distance(S, self.C) / self.m``,
    using the euclidean distance when ``self.distance == 'euclidean'`` and
    the cosine distance otherwise. Per-column entropies (NaN terms counted
    as 0) are weighted by the relative column mass of ``Q``.

    :param S: symbolic histograms
    :param mapping: matrix whose transpose is multiplied with ``Q``
    :returns: symbolic entropy, squeezed
    """
    distance_fn = euclidean_distance if self.distance == 'euclidean' else cosine_distance
    distances = distance_fn(S, self.C)

    # Fuzzy membership vector for each histogram in S.
    Q = T.nnet.softmax(-distances / self.m)

    column_mass = T.sum(Q, axis=0)
    P = T.dot(mapping.T, Q) / column_mass

    # p * log2(p), with the NaN produced by p == 0 treated as 0.
    plogp = P * T.log2(P)
    plogp = T.switch(T.isnan(plogp), 0, plogp)
    entropy_per_cluster = plogp.sum(axis=0)

    relative_mass = column_mass / column_mass.sum()
    E = -(entropy_per_cluster * relative_mass).sum()
    return T.squeeze(E)
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Build Adam updates with gradient clipping and a NaN/inf guard.

    ``b1`` and ``b2`` weight the *new* gradient information (the old moments
    are weighted by ``1 - b1`` and ``1 - b2``). Gradients are scaled down
    when the square root of ``norm_gs(params, grads)`` reaches
    ``max_magnitude``; if that value is NaN or infinite, each gradient is
    replaced by ``infDecay * p``.

    :returns: tuple ``(updates, norm)`` with the update list and the value
        returned by ``norm_gs``
    """
    def _blend(weight, new_value, old_value):
        # Exponential average giving ``weight`` to the new value.
        return (weight * new_value) + ((1. - weight) * old_value)

    updates = []
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    # Shared step counter and bias-corrected learning rate.
    counter = shared(floatX(0.))
    counter_next = counter + 1.
    fix1 = 1. - (1. - b1)**counter_next
    fix2 = 1. - (1. - b2)**counter_next
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = _blend(b1, g, m)
        v_t = _blend(b2, T.sqr(g), v)
        p_t = p - (lr_t * (m_t / (T.sqrt(v_t) + e)))
        updates += [(m, m_t), (v, v_t), (p, p_t)]

    updates += [(counter, counter_next)]
    return updates, norm
def sgd(cost, parameters, mom, l_r, gradient_clip, param_clip=0, consider_constant=None):
    """
    Build momentum-SGD updates.

    Each element of ``parameters`` is a pair ``(param, update)`` where
    ``update`` holds the previous step. A gradient whose sum is NaN is
    replaced by zeros. When ``gradient_clip > 0`` the step is rescaled to
    length ``gradient_clip`` if it is not shorter than that; when
    ``param_clip > 0`` the new parameter value is clipped to
    ``[-param_clip, param_clip]``.

    :returns: ``OrderedDict`` mapping every ``update`` and ``param`` variable
        to its new expression
    """
    updates_for_func = OrderedDict()

    for param, update in parameters:
        grad = T.grad(cost, param, consider_constant=consider_constant)
        grad = ifelse(T.isnan(T.sum(grad)), T.zeros_like(grad), grad)

        step = mom * update - l_r * grad
        if gradient_clip > 0:
            # The small constant avoids a zero divisor.
            step_len = T.sqrt(T.sum(step**2)) + 0.0000001
            step = ifelse(T.lt(step_len, gradient_clip),
                          step,
                          step / step_len * gradient_clip)
        updates_for_func[update] = step

        next_value = param + step
        if param_clip > 0:
            next_value = T.clip(next_value, -param_clip, param_clip)
        updates_for_func[param] = next_value

    return updates_for_func
def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
    """Build RMSProp updates with Nesterov-style momentum and norm clipping.

    Gradients of ``cost`` are rescaled so that their global L2 norm does not
    exceed ``rescale``; a NaN/inf norm replaces each gradient by
    ``0.1 * param``.

    :param cost: symbolic cost to differentiate
    :param params: list of variables to update
    :param learning_rate: step size
    :param momentum: momentum coefficient
    :param rescale: maximum allowed global gradient norm
    :returns: list of ``(variable, new_expression)`` pairs
    """
    grads = T.grad(cost, params)

    # Global L2 norm of all gradients. It is already a square root: the
    # previous code took a second square root, so clipping compared
    # ``rescale`` against the fourth root of the sum of squares.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4

    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))

        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad

        # Running standard deviation of the gradient, floored at minimum_grad.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)

        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        # Look-ahead step actually applied to the parameter.
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad

        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def compute_updates(self, training_cost, params):
    """Clip the gradients and hand them to the optimiser named by ``self.updater``.

    Gradients are scaled down when their global L2 norm reaches
    ``self.cutoff``; a NaN/inf norm replaces each gradient by ``0.1 * p``.

    :returns: the updates produced by ``Adagrad``, ``Adadelta``, ``RMSProp``
        or ``Adam``
    :raises Exception: for ``'sgd'`` (not implemented) and for unknown
        updater names
    """
    raw_grads = OrderedDict(zip(params, T.grad(training_cost, params)))

    # Clip by global norm, with a fallback for non-finite norms.
    cutoff = numpy.float32(self.cutoff)
    norm_gs = T.sqrt(sum(T.sum(g**2) for g in raw_grads.values()))
    normalization = T.switch(T.ge(norm_gs, cutoff), cutoff / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    grads = OrderedDict(
        (p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization))
        for p, g in raw_grads.items())

    chosen = self.updater
    if chosen == 'adagrad':
        return Adagrad(grads, self.lr)
    if chosen == 'sgd':
        raise Exception("Sgd not implemented!")
    if chosen == 'adadelta':
        return Adadelta(grads)
    if chosen == 'rmsprop':
        return RMSProp(grads, self.lr)
    if chosen == 'adam':
        return Adam(grads)
    raise Exception("Updater not understood!")
def get_probs(self, z_p, z_h):
    """Return the deterministic network output for ``(z_p, z_h)``, NaNs zeroed.

    ``z_p`` is fed to ``self.nn_in[0]`` and ``z_h`` to ``self.nn_in[1]``.
    """
    feed = {self.nn_in[0]: z_p, self.nn_in[1]: z_h}
    raw_output = get_output(self.nn_out, feed, deterministic=True)
    return T.switch(T.isnan(raw_output), 0, raw_output)
def posdef(self, x, diag):
    """
    Check to determine positive definiteness of the Kronecker-structured
    covariance matrix. This operation is slow, and is thus not
    recommended to be called repeatedly as a check during optimization.
    Rather, the user should use this function as a guide to ensuring
    positive definiteness of the model for varying values of the
    kernel parameters.

    Args:
        tensor x: The input coordinates.
        tensor diag: The white noise variances. This should be an NxM
            array where N is the length of x and M is the size of
            alpha.

    Returns:
        isposdef: A boolean that is True if the covariance matrix
            is positive definite and False otherwise. The user will
            need to call ``isposdef.eval()`` to compute the returned
            value from the theano tensor variable.

    Raises:
        ValueError: If the instance defines neither ``alpha`` nor ``R``.
    """
    diag = tt.as_tensor_variable(diag)
    # Flatten the transposed noise array into one vector for the diagonal.
    diag = tt.reshape(diag.T, (1, diag.size))[0]
    x = tt.as_tensor_variable(x)

    T = self.term.value(x[:, None] - x[None, :])
    if 'alpha' in vars(self):
        R = self.alpha[:, None] * self.alpha[None, :]
        K = tt.slinalg.kron(T, R)
    elif 'R' in vars(self):
        # ``tt.slinalg`` is a module; the Kronecker product is its ``kron``.
        K = tt.slinalg.kron(T, self.R)
    else:
        raise ValueError("posdef requires the instance to define "
                         "either 'alpha' or 'R'")

    # With on_error='nan' a failed factorisation yields NaNs instead of
    # raising, which is what the check below looks for.
    chol = tt.slinalg.Cholesky(on_error='nan')
    L = chol(K + tt.diag(diag))
    return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
def compute_updates(self, training_cost, params):
    """Build optimiser updates from norm-clipped gradients.

    The gradients of ``training_cost`` are multiplied by
    ``self.cutoff / norm`` when their global L2 norm is at least
    ``self.cutoff``. If the norm is NaN or infinite, each gradient becomes
    ``0.1 * p``. The optimiser is chosen by ``self.updater``.

    :returns: the updates produced by the selected optimiser
    :raises Exception: for ``'sgd'`` (not implemented) and for unknown
        updater names
    """
    gradient_list = T.grad(training_cost, params)
    unclipped = OrderedDict(zip(params, gradient_list))

    # Clip stuff
    c = numpy.float32(self.cutoff)
    squared_sums = [T.sum(g ** 2) for g in unclipped.values()]
    norm_gs = T.sqrt(sum(squared_sums))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    grads = OrderedDict()
    for p, g in unclipped.items():
        grads[p] = T.switch(notfinite, numpy.float32(.1) * p, g * normalization)

    updater = self.updater
    if updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif updater == 'adadelta':
        updates = Adadelta(grads)
    elif updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates
def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100, lr=0.001, l2_norm=None, l1_norm=None):
    """Build the model graph and compile its training/generation functions.

    All arguments are forwarded to ``build_rnnrbm``. For every parameter it
    returns, an RMSProp-style update is added to the training updates:
    a running average of the squared gradient (decay 0.9) scales the step.
    Gradient entries that are NaN or infinite are replaced by
    ``0.1 * param`` before the update is formed.

    Sets ``self.params``, ``self.train_function`` (input ``v``, output
    ``monitor``) and ``self.generate_function`` (input ``n_steps``, output
    ``v_t``).
    """
    graph = build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr,
                         l2_norm=l2_norm, l1_norm=l1_norm)
    (v, v_sample, cost, monitor, params,
     updates_train, v_t, updates_generate, n_steps) = graph

    for param in params:
        # v_sample is treated as a constant when differentiating the cost.
        raw_grad = T.grad(cost, param, consider_constant=[v_sample])

        # Swap non-finite gradient entries for a fraction of the parameter.
        is_bad = T.or_(T.isnan(raw_grad), T.isinf(raw_grad))
        safe_grad = T.switch(is_bad, 0.1 * param, raw_grad)

        # Running average of squared gradients, one accumulator per param.
        sq_avg = shared_zeros('accu_' + str(param.name),
                              param.get_value(borrow=True).shape)
        sq_avg_next = 0.9 * sq_avg + 0.1 * safe_grad ** 2
        updates_train[sq_avg] = sq_avg_next
        updates_train[param] = param - (lr * safe_grad / T.sqrt(sq_avg_next + 1e-6))

    self.params = params
    self.train_function = theano.function([v], monitor, updates=updates_train)
    self.generate_function = theano.function([n_steps], v_t, updates=updates_generate)
def clip(grads, threshold, square=True, params=None):
    '''
    Build the computational graph that clips the gradients if their
    global norm exceeds the threshold.

    :type grads: sequence of theano variables
    :param grads: the gradients to be clipped (iterated more than once)

    :type threshold: float
    :param threshold: the threshold of the norm of the gradient

    :type square: bool
    :param square: if True, the square root of the summed squares is taken
        (L2 norm); if False, the summed squares are used directly

    :type params: sequence of theano variables or None
    :param params: parameters matching ``grads``. When the norm is NaN each
        gradient is replaced by ``0.01 * param``. If None, a zero gradient
        is used as the replacement instead.

    :returns: tuple ``(clipped gradients, norm)``, both symbolic.
    '''
    grads_norm2 = sum(tensor.sum(g**2) for g in grads)
    if square:
        grads_norm2 = tensor.sqrt(grads_norm2)

    too_large = tensor.ge(grads_norm2, threshold)
    grads_clip = [
        tensor.switch(too_large, g / grads_norm2 * threshold, g)
        for g in grads
    ]

    # Deal with NaN. ``params`` defaults to None, which previously crashed
    # in zip(); fall back to zero gradients in that case.
    if params is None:
        replacements = [tensor.zeros_like(g) for g in grads_clip]
    else:
        replacements = [0.01 * p for p in params]

    norm_is_nan = tensor.isnan(grads_norm2)
    grads_clip = [
        tensor.switch(norm_is_nan, r, g)
        for r, g in zip(replacements, grads_clip)
    ]
    return grads_clip, grads_norm2
def clip(clip_size, parameters, gradients):
    """Rescale gradients so their global L2 norm is at most ``clip_size``.

    Args:
        clip_size: Maximum allowed global gradient norm.
        parameters: Parameters, aligned one-to-one with ``gradients``.
        gradients: Symbolic gradients to clip.

    Returns:
        A list with one clipped gradient per (parameter, gradient) pair.
        If the global gradient norm is NaN or infinite, each entry is
        ``0.1 * parameter`` instead of a scaled gradient.
    """
    # Materialize once: the gradients are traversed twice below.
    gradients = list(gradients)

    # The magnitude must be measured on the gradients; measuring it on the
    # parameters would make the clipping depend on weight size instead.
    grad_mag = T.sqrt(sum(T.sum(T.sqr(g)) for g in gradients))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)

    # scale == 1 while grad_mag <= clip_size, otherwise clip_size / grad_mag.
    scale = clip_size / T.maximum(clip_size, grad_mag)
    return [
        T.switch(exploded, 0.1 * p, scale * g)
        for p, g in zip(parameters, gradients)
    ]
def log_add(lna, lnb):
    """
    Compute ln(a + b) from ln(a) and ln(b) without leaving log space.

    :param lna: ln(a)
    :param lnb: ln(b)
    :return: ln(a+b); where the computation yields NaN, the larger of the
        two inputs is returned instead
    """
    larger = tensor.maximum(lna, lnb)
    # lna + lnb - 2*larger is (smaller - larger), i.e. never positive.
    exponent = lna + lnb - 2 * larger
    # log1p(x) = log(1 + x)
    summed = larger + tensor.log1p(tensor.exp(exponent))
    return tensor.switch(tensor.isnan(summed), larger, summed)
def __init__(self, paramMap, loss, learning_rate):
    """Build momentum-based updates for every parameter in ``paramMap``.

    Args:
        paramMap: Mapping whose values are the shared parameters to train.
        loss: Symbolic scalar loss to differentiate.
        learning_rate: Step size applied to the gradient.

    The resulting update dictionary is stored in ``self.updates``. It
    contains, per parameter, an update for its momentum buffer, an update
    for its accumulated absolute gradient (clipped to [0, 10000]) and an
    update for the parameter itself. Two-dimensional parameters are passed
    through ``normalize`` after the momentum step.
    """
    # Materialize the values: a Python 3 dict view cannot be indexed and
    # the list is traversed several times below.
    paramObjLst = list(paramMap.values())

    g_mom = {}
    sqr_gradients = {}
    for param in paramObjLst:
        shape = param.get_value(borrow=True).shape
        g_mom[param] = theano.shared(np.zeros(shape, dtype=theano.config.floatX))
        sqr_gradients[param] = theano.shared(np.zeros(shape, dtype=theano.config.floatX))

    # One T.grad call for all parameters, then map each back to its gradient.
    obj2Grad = dict(zip(paramObjLst, T.grad(loss, paramObjLst)))

    mom = 0.7
    # Gradient-norm scaling is currently disabled (factor fixed at 1.0).
    scaling_factor = 1.0
    learning_rate_use = learning_rate

    updates = {}
    for param in paramObjLst:
        gparam = g_mom[param]
        sqr_grad = sqr_gradients[param]

        new_gradient = obj2Grad[param] / scaling_factor
        # NaN gradient entries contribute nothing to the step.
        new_gradient = T.switch(T.isnan(new_gradient), 0.0, new_gradient)

        updates[gparam] = T.cast(
            mom * gparam - (1.0 - mom) * learning_rate_use * new_gradient,
            theano.config.floatX)
        updates[sqr_grad] = T.cast(
            T.clip(sqr_grad + T.abs_(new_gradient), 0.0, 10000.0),
            theano.config.floatX)

    for param in paramObjLst:
        # Apply the freshly computed momentum step to the parameter.
        updated_value = param + updates[g_mom[param]]
        if param.ndim == 2:
            updated_value = normalize(updated_value)
        updates[param] = T.cast(updated_value, theano.config.floatX)

    self.updates = updates
def replace_nans(tensor):
    """
    Replace non-finite entries with the extreme finite float values.

    NaN and +inf become ``sys.float_info.max``; -inf becomes
    ``-sys.float_info.max``.

    Note that ``sys.float_info.min`` is the smallest *positive* normalized
    float, not the most negative one, so it must not be used for -inf.
    """
    float_max = sys.float_info.max
    # NaNs first: after this step only +/-inf remain as non-finite values.
    tensor = T.switch(T.isnan(tensor), float_max, tensor)
    signed_limit = T.switch(T.lt(tensor, 0), -float_max, float_max)
    return T.switch(T.isinf(tensor), signed_limit, tensor)
def grads(self, cost):
    """Return a dict mapping each parameter to its cleaned gradient.

    Gradients of ``cost`` are taken with respect to ``self.params``. NaN
    entries are replaced by 0.0, and parameters that have an entry in
    ``self.param_masks`` get their gradient multiplied by that mask.
    """
    cleaned = [
        T.switch(T.isnan(g), 0.0, g)
        for g in T.grad(cost, self.params)
    ]

    by_param = {}
    for param, g in zip(self.params, cleaned):
        if param in self.param_masks:
            by_param[param] = g * self.param_masks[param]
        else:
            by_param[param] = g
    return by_param
def marginalize_over_v_z(self, h):
    """Return the negated sum of the bias/penalty term and the softplus term.

    NaN entries arising from ``h`` are zeroed out before summation, so they
    contribute nothing to either term.

    Raises:
        NameError: If ``self.penalty`` is neither ``"softplus_bi"`` nor
            ``"softplus0"``.
    """
    # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})
    if self.penalty == "softplus_bi":
        penalty = self.beta * T.log(1 + T.exp(self.b))[:, None]
    elif self.penalty == "softplus0":
        # T.exp(0) is a scalar, so it broadcasts on its own; indexing it
        # with [:, None] is not valid for a 0-d value.
        penalty = self.beta * T.log(1 + T.exp(0))
    else:
        raise NameError("Invalid penalty term")

    # In theory this is just (h * self.b).T. However, when broadcasting is
    # involved, Theano's element-wise product of NaN and 0 yields 0 instead
    # of NaN (Theano issue #3848), which would hide the NaN markers that
    # are removed below. The outer product followed by its diagonal gives
    # the same values without that problem, at the price of a larger
    # intermediate tensor.
    energy = T.tensordot(h, self.b, axes=0)
    energy = T.diagonal(energy, axis1=1, axis2=2).T
    energy = energy - penalty
    energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
    energy = T.sum(energy, axis=0, keepdims=True).T

    ener = T.tensordot(h, self.W, axes=0)
    ener = T.diagonal(ener, axis1=1, axis2=2)
    ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)  # Remove NaN
    ener = T.sum(ener, axis=2) + self.c[None, :]
    ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

    return -(energy + ener)
def custom_loss(y_true_mach, y_pred):
    """Mean negative log-probability of the target class, skipping -1 targets.

    Targets equal to -1 are treated as "no target" and excluded. If the
    resulting loss is NaN (for instance when no position in the batch has a
    target), 0.0001 is returned instead.

    Note: ``y_pred`` is not clipped here.
    """
    # Work on a flat vector of targets.
    # NOTE(review): presumably the targets arrive with an extra dimension;
    # the reason flattening is required was not recorded - confirm.
    targets = y_true_mach.flatten()

    # Positions that actually carry a target.
    valid_rows = T.neq(targets, -1).nonzero()[0]
    valid_classes = T.cast(targets[valid_rows], "uint16")

    # Probability assigned to the correct class at each valid position.
    picked = y_pred[valid_rows, valid_classes]
    loss = -T.log(picked).mean()

    # Replacing a NaN loss with a value near 0 may slow down training.
    return ifelse(T.isnan(loss), 0.0001, loss)
def get_cost_grads_updates(self, x):
    """Return ``(cost, gradients, updates)`` for one training step on ``x``.

    The cost is the squared reconstruction error (mean over axis 0, summed)
    plus ``lamb`` times the summed absolute difference between the running
    hidden-activation average ``q`` and the target ``rho``. ``updates``
    stores the new running average back into ``self.q``.
    """
    hypers = self.train_hypers
    _, h, _, y = self.network.propVHV(x, noise_std=hypers['noise_std'])

    # Running average of the hidden activations; where self.q is still NaN
    # it is seeded directly with the current batch mean.
    h_mean = h.mean(axis=0)
    blended = 0.9*self.q + 0.1*h_mean
    q = T.switch(T.isnan(self.q), h_mean, blended)

    lamb = T.cast(hypers['lamb'], self.dtype)
    rho = T.cast(hypers['rho'], self.dtype)

    reconstruction = ((x - y)**2).mean(axis=0).sum()
    sparsity = lamb*(T.abs_(q - rho)).sum()
    cost = reconstruction + sparsity

    updates = {self.q: q}
    return cost, self.grads(cost), updates
def safe_logaddexp(a, b):
    """Symbolic log(exp(a) + exp(b)) that tolerates an undefined difference.

    When ``b - a`` is NaN (which happens if both inputs are +inf or both
    are -inf), the difference is treated as 0.

    Returns:
        symbolic log(exp(a) + exp(b))
    """
    delta = b - a
    delta = tt.switch(tt.isnan(delta), 0, delta)

    # Always exponentiate a non-positive number: factor out the larger input.
    anchored_on_b = b + tt.log1p(tt.exp(-delta))
    anchored_on_a = a + tt.log1p(tt.exp(delta))
    return tt.switch(delta >= 0, anchored_on_b, anchored_on_a)
def step_clipping(params, gparams, scale=1.0):
    """Rescale gradients so their global L2 norm does not exceed ``scale``.

    Args:
        params: Parameters, aligned one-to-one with ``gparams``.
        gparams: Symbolic gradients to clip.
        scale: Maximum allowed global gradient norm.

    Returns:
        A list with one entry per (param, gparam) pair: the gradient
        multiplied by ``scale / norm`` when the norm is at least ``scale``
        (unchanged otherwise), or ``param * 0.1`` when the norm is NaN or
        infinite.
    """
    # Materialize once: the gradients are traversed twice below.
    gparams = list(gparams)

    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)

    # Builtin zip works on both Python 2 and 3, unlike itertools.izip.
    return [
        T.switch(notfinite, param * 0.1, gparam * multiplier)
        for param, gparam in zip(params, gparams)
    ]
def shadow(self, points, lights):
    """
    Compute a per-point shadow value with respect to the first light.

    Only ``lights[0]`` is considered. The result is -1 wherever the
    discriminant below is NaN or not positive; elsewhere it is
    ``-x - sqrt(discriminant)``.

    NOTE(review): the ``+ 1`` term suggests a unit-radius sphere centred at
    the origin of ``points`` - confirm against the caller.

    See: http://en.wikipedia.org/wiki/Line-sphere_intersection
    """
    offsets = points  # vector from points to our center
    reversed_light = -1*lights[0].normed_dir()
    along_light = T.tensordot(offsets, reversed_light, 1)

    # Discriminant of the line-sphere quadratic.
    discriminant = T.sqr(along_light) - T.sum(T.mul(offsets, offsets), 2) + 1

    no_hit = T.or_(T.isnan(discriminant), discriminant <= 0)
    return T.switch(no_hit, -1, -along_light - T.sqrt(discriminant))