def get_cost_updates(self, x, W, W_prime, b, b_prime, corruption_level,
                     learning_rate, l2reg=0., l1reg=0.):
    """ This function computes the cost and the updates for one training
    step of the dA """
    self.x = x
    self.W = W
    self.W_prime = W_prime
    self.b = b
    self.b_prime = b_prime
    self.params = [self.W, self.W_prime, self.b, self.b_prime]

    if corruption_level is None:
        tilde_x = self.x
    else:
        tilde_x = self.get_corrupted_input(self.x, corruption_level)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    # note : we sum over the size of a datapoint; if we are using minibatches,
    # L will be a vector, with one entry per example in minibatch
    XE = self.x * T.log(z) + (1 - self.x) * T.log(1 - z)
    cost = -T.mean(T.sum(XE, axis=1), axis=0)
    if l2reg != 0.:
        cost += l2reg * (T.mean(T.sum(self.W * self.W, 1), 0) +
                         T.mean(T.sum(self.W_prime * self.W_prime, 1), 0))
    if l1reg != 0.:
        cost += l1reg * (T.mean(T.sum(T.abs_(y), 1), 0) +
                         T.mean(T.sum(T.abs_(y), 1), 0))
    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # # generate the list of updates
    # updates = {}
    # for param, gparam in zip(self.params, gparams):
    #     updates[param] = param - learning_rate*gparam
    # note: only the parameter increments (-learning_rate * grad) are
    # returned here, not (param, new_value) pairs
    updates = [-learning_rate * gparam for gparam in gparams]

    return (cost, updates)
def sample(self, Y):
    """ Given samples from the upper layer Y, sample values from X
        and return them together with their log probability.

        Parameters
        ----------
        Y: T.tensor
            samples from the upper layer

        Returns
        -------
        X: T.tensor
            samples from the lower layer
        log_p: T.tensor
            log-posterior for the samples returned in X
    """
    n_X, = self.get_hyper_params(['n_X'])
    W, b = self.get_model_params(['W', 'b'])

    n_samples = Y.shape[0]

    # sample X given Y
    prob_X = self.sigmoid(T.dot(Y, W) + b)
    U = theano_rng.uniform((n_samples, n_X), nstreams=512)
    X = T.cast(U <= prob_X, dtype=floatX)

    log_prob = X * T.log(prob_X) + (1 - X) * T.log(1 - prob_X)
    log_prob = log_prob.sum(axis=1)

    return X, log_prob
def get_cost_updates(self, corruption_level, learning_rate):
    """ This function computes the cost and the updates for one training
    step of the dA """

    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    # note : L is now a vector, where each element is the
    #        cross-entropy cost of the reconstruction of the
    #        corresponding example of the minibatch. We need to
    #        compute the average of all these to get the cost of
    #        the minibatch
    cost = T.mean(L)

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(self.params, gparams)
    ]

    return (cost, updates)
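# Hedged, self-contained sketch (not from the original file) of how a
# (cost, updates) pair like the one returned above is normally consumed:
# the gradient-descent update pairs are compiled into a single
# theano.function training step.  The quadratic cost below is only a
# stand-in for the dA cross-entropy.
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name='w')
x = T.vector('x')
cost = T.sum((T.dot(w, x) - 1.0) ** 2)                  # toy stand-in cost
gparams = T.grad(cost, [w])
updates = [(p, p - 0.1 * g) for p, g in zip([w], gparams)]
train_step = theano.function([x], cost, updates=updates)
train_step(numpy.ones(3, dtype=theano.config.floatX))   # one gradient step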
def cross_entropy(self, y):
    # return (-(y * T.log(self.y) + (1.0 - y) * T.log(1.0 - self.y))).mean()
    # return T.nnet.binary_crossentropy(self.y, y).mean()
    # clip predictions away from 0 and 1 so the logs stay finite
    y_used = T.clip(self.y, 0.0000001, 0.999999999)
    return T.mean(-y * T.log(y_used) - (1 - y) * T.log(1 - y_used))
def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
    """Approximation to the reconstruction error

    Note that this function requires the pre-sigmoid activation as
    input. To understand why this is so you need to understand a
    bit about how Theano works. Whenever you compile a Theano
    function, the computational graph that you pass as input
    gets optimized for speed and stability. This is done by changing
    several parts of the subgraphs with others. One such optimization
    expresses terms of the form log(sigmoid(x)) in terms of softplus.
    We need this optimization for the cross-entropy since sigmoid of
    numbers larger than 30. (or even less than that) turns to 1. and
    numbers smaller than -30. turn to 0 which in turn will force theano
    to compute log(0) and therefore we will get either -inf or NaN
    as cost. If the value is expressed in terms of softplus we do not
    get this undesirable behaviour. This optimization usually works
    fine, but here we have a special case. The sigmoid is applied
    inside the scan op, while the log is outside. Therefore Theano
    will only see log(scan(..)) instead of log(sigmoid(..)) and
    will not apply the wanted optimization. We cannot go and
    replace the sigmoid in scan with something else either, because
    this only needs to be done on the last step. Therefore the
    easiest and most efficient way is to get also the pre-sigmoid
    activation as an output of scan, and apply both the log and
    sigmoid outside scan such that Theano can catch and optimize the
    expression.
    """

    cross_entropy = T.mean(
        T.sum(
            self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
            (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
            axis=1
        )
    )

    return cross_entropy
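# Hedged illustration (NumPy, not part of the class above) of the rewrite the
# docstring describes: in exact arithmetic log(sigmoid(a)) == -softplus(-a) and
# log(1 - sigmoid(a)) == -softplus(a), and the softplus forms stay finite even
# where the sigmoid saturates to 0 or 1.
import numpy as np

def softplus(a):
    return np.logaddexp(0.0, a)        # log(1 + exp(a)), computed stably

a = np.array([-40.0, -1.0, 0.0, 1.0, 40.0])
stable_log_sig = -softplus(-a)         # equals log(sigmoid(a))
stable_log_1m_sig = -softplus(a)       # equals log(1 - sigmoid(a)); finite at a = 40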
def _build_marginal_likelihood_logp(self, y, X, Xu, sigma):
    sigma2 = tt.square(sigma)
    Kuu = self.cov_func(Xu)
    Kuf = self.cov_func(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = self.cov_func(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
        trace = 0.0
    elif self.approx == "VFE":
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = ((1.0 / (2.0 * sigma2)) *
                 (tt.sum(self.cov_func(X, diag=True)) -
                  tt.sum(tt.sum(A * A, 0))))
    else:  # DTC
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = 0.0
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - self.mean_func(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    constant = 0.5 * X.shape[0] * tt.log(2.0 * np.pi)
    logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B)))
    quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c))
    return -1.0 * (constant + logdet + quadratic + trace)
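# Hedged NumPy check (illustration only, with made-up kernel matrices): the
# `logdet` and `quadratic` terms above are one half of the log-determinant and
# of the Mahalanobis term of N(y; mean, Qff + diag(Lamd)), computed via the
# inducing-point (Woodbury / matrix determinant lemma) identities rather than
# densely.  The sketch below verifies those identities.
import numpy as np

rng = np.random.RandomState(0)
n, m, sigma2 = 6, 3, 0.1
Z = rng.randn(n, m)
Kuu = Z[:m].dot(Z[:m].T) + np.eye(m)       # stand-in inducing covariance
Kuf = Z[:m].dot(Z.T)                       # stand-in cross covariance
Luu = np.linalg.cholesky(Kuu)
A = np.linalg.solve(Luu, Kuf)              # A = Luu^{-1} Kuf
Qff = A.T.dot(A)
Lamd = np.full(n, sigma2)                  # DTC/VFE-style diagonal
y = rng.randn(n)

# dense reference
S = Qff + np.diag(Lamd)
ref_logdet = np.linalg.slogdet(S)[1]
ref_quad = y.dot(np.linalg.solve(S, y))

# low-rank computation mirroring the Theano code above
A_l = A / Lamd
L_B = np.linalg.cholesky(np.eye(m) + A_l.dot(A.T))
c = np.linalg.solve(L_B, A.dot(y / Lamd))
logdet = np.sum(np.log(Lamd)) + 2 * np.sum(np.log(np.diag(L_B)))
quad = y.dot(y / Lamd) - c.dot(c)
assert np.allclose(ref_logdet, logdet) and np.allclose(ref_quad, quad)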
def compileFunctions(self, x_image_global, examples, ib, B, K, corrupt):
    if x_image_global is None:
        x_image_global = self.x
    if corrupt == 0.0:
        self.x_c = self.x
    else:
        self.x_c = self.theano_rng.binomial(
            size=self.x.shape, n=1, p=1 - corrupt,
            dtype=theano.config.floatX) * self.x

    self.h = self.g(T.dot(self.x_c, self.W_hl) + self.b_hl)
    self.x_r = self.o(T.dot(self.h, self.W_ol) + self.b_ol)

    self.params = [self.W_hl, self.b_hl, self.b_ol]
    self.cost = \
        (- T.sum(
            self.x * T.log(self.x_r) + (1 - self.x) * T.log(1 - self.x_r),
            axis=(0, 1)))

    gparams = T.grad(self.cost, self.params)
    updates = [
        (param, param - K * gparam)
        for param, gparam in zip(self.params, gparams)
    ]

    fun_train = theano.function(
        inputs=[ib],
        outputs=(self.cost, self.x_r, self.x_c),
        updates=updates,
        givens={
            x_image_global: examples[ib * B: (ib + 1) * B]
        }
    )
    return fun_train
def binomial_lpdf(node, x, kw):
    random_state, size, n, p = node.inputs
    # for n > 1 the "choose" operation is required
    # TODO assert n == 1
    return tensor.switch(tensor.eq(x, 1.0),
                         tensor.log(p),
                         tensor.log(1.0 - p))
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2)
    l_int = l.astype('int64')
    u = uw[:l_int]
    w = uw[l_int:]

    # Callable tensor
    def logp_(input):
        return theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * tt.exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + tt.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * tt.exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + tt.log(2.0 * np.pi))

    return elbo
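# Hedged side check (NumPy/SciPy, not part of the function above): the term
# `tt.sum(w) + 0.5 * l * (1 + tt.log(2.0 * np.pi))` added to the expected
# log-joint is the entropy of the fully factorized Gaussian q with mean u and
# log-standard-deviations w.
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
l = 4
w = rng.randn(l)                                        # log standard deviations
entropy = multivariate_normal(cov=np.diag(np.exp(2 * w))).entropy()
assert np.allclose(entropy, np.sum(w) + 0.5 * l * (1 + np.log(2 * np.pi)))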
def logp(self, value):
    mu = self.mu
    tau = self.tau
    return bound(-0.5 * tau * (T.log(value) - mu) ** 2
                 + 0.5 * T.log(tau / (2. * np.pi))
                 - T.log(value),
                 tau > 0)
def logp(self, value):
    alpha = self.alpha
    beta = self.beta
    return bound(T.log(alpha) - T.log(beta)
                 + (alpha - 1) * T.log(value / beta)
                 - (value / beta) ** alpha,
                 value >= 0, alpha > 0, beta > 0)
def grad_init(self):
    mask_ = self.mask.flatten()
    rewards_ = self.rewards.flatten()
    actions_ = self.actions.reshape(
        [self.actions.shape[0] * self.actions.shape[1], -1])
    # self.mov_std = theano.shared(numpy.float32(1.), 'std')

    pp = self.params.values()

    mean_rewards = (mask_ * rewards_).sum(-1, keepdims=True) / \
        tensor.maximum(1., mask_.sum(-1, keepdims=True))
    centered_rewards = rewards_ - self.vapprox.v[:, 0] - mean_rewards
    mean2_rewards = (mask_ * (rewards_ ** 2)).sum(-1, keepdims=True) / \
        tensor.maximum(1., mask_.sum(-1, keepdims=True))
    var_rewards = mean2_rewards - (mean_rewards ** 2)
    scaled_rewards = centered_rewards / \
        tensor.maximum(1., tensor.sqrt(tensor.maximum(0., var_rewards)))
    # scaled_rewards = centered_rewards

    logprob = 0.
    reg = 0.
    for oi in xrange(self.n_out):
        labs = actions_[:, oi].flatten()
        labs_idx = tensor.arange(labs.shape[0]) * self.out_dim + labs
        logprob = logprob + (mask_ *
                             tensor.log(self.pi[oi].flatten() + 1e-6)[labs_idx])
        reg = reg - (self.pi[oi] *
                     tensor.log(self.pi[oi] + 1e-6)).sum(-1).sum(0)

    self.cost = -tensor.mean(scaled_rewards * logprob + self.reg_c * reg)
    self.grads = tensor.grad(self.cost, wrt=pp)
def get_cost(self, p=0, sigma=1):
    # the last layer
    z = self.sigmoid_layers[-1].output
    L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)

    p_idx = len(self.sigmoid_layers) / 2 - 1  # penalty layer, the middle layer

    if p == 0:
        cost = T.mean(L)
        # cost = T.mean(T.sqrt(T.mean(self.errors, axis=1)))  # Log Spectral Distance (LSD)
    elif (p != 0) and (sigma == 0):  # for square penalty
        square_cost = self.get_square_cost(self.sigmoid_layers[p_idx].output, p)
        cost = T.mean(L) + T.mean(square_cost)
    elif (p != 0) and (sigma != 0):  # for Gaussian penalty
        gaussian_cost = self.get_gaussian_cost(self.sigmoid_layers[p_idx].output, p, sigma)
        cost = T.mean(L) + T.mean(gaussian_cost)
    # elif (p == -1) and (sigma == 0):  # binary
    #     code_val = self.sigmoid_layers[p_idx].output
    #     binary_val = code_val >= 0.5
    #     self.sigmoid_layers[p_idx + 1].input = binary_val
    #     z = self.sigmoid_layers[-1].output
    #     L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    #     cost = T.mean(L)
    # elif (p == -1) and (sigma != 0):  # add gaussian noise
    #     gaussian_data = self.theano_rng.normal(size=self.sigmoid_layers[p_idx - 1].output.shape,
    #                                            std=sigma, dtype=theano.config.floatX)
    #     self.sigmoid_layers[p_idx].input = self.sigmoid_layers[p_idx - 1].output + gaussian_data
    #     z = self.sigmoid_layers[-1].output
    #     L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    #     cost = T.mean(L)
    else:
        cost = T.mean(L)

    return cost
def unet_crossentropy_loss_sampled(y_true, y_pred):
    print 'unet_crossentropy_loss_sampled'
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)
    # this seems to work
    # it is super ugly though and I am sure there is a better way to do it
    # but I am struggling with theano to cooperate
    # filter the right indices
    indPos = T.nonzero(y_true)[0]  # no idea why this is a tuple
    indNeg = T.nonzero(1 - y_true)[0]
    # shuffle
    n = indPos.shape[0]
    indPos = indPos[srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[srng.permutation(n=n)]
    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([T.sum(y_true), T.sum(1 - y_true)]), dtype='int64')

    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1 - y_pred_clipped[indNeg]))
    average_loss = T.mean(loss_vector)
    print 'average_loss:', average_loss
    return average_loss
def get_model(Ws, bs, dropout=False):
    v = T.matrix('input')
    m = T.matrix('missing')
    q = T.matrix('target')
    k = T.vector('normalization factor')

    # Set all missing/target values to 0.5
    keep_mask = (1 - m) * (1 - q)
    h = keep_mask * (v * 2 - 1)  # Convert to +1, -1

    # Normalize layer 0
    h *= k.dimshuffle(0, 'x')

    for l in xrange(len(Ws)):
        h = T.dot(h, Ws[l]) + bs[l]

        if l < len(Ws) - 1:
            h = h * (h > 0)  # relu
            if dropout:
                mask = srng.binomial(n=1, p=0.5, size=h.shape)
                h = h * mask * 2

    output = sigmoid(h)
    LL = v * T.log(output) + (1 - v) * T.log(1 - output)
    # loss = -(q * LL).sum() / q.sum()
    loss = -((1 - m) * LL).sum() / (1 - m).sum()
    return v, m, q, k, output, loss
def expr(self, model, data):
    v = data
    mid = model.get_enc(v)
    rou_mid = mid.mean(axis=0)
    cs294_sparse = (self.rou * T.log(self.rou / rou_mid) +
                    (1 - self.rou) * T.log((1 - self.rou) / (1 - rou_mid))).sum()
    return cs294_sparse
def GMM(y, mu, sig, coeff):
    """
    Gaussian mixture model negative log-likelihood

    Parameters
    ----------
    y     : TensorVariable
    mu    : FullyConnected (Linear)
    sig   : FullyConnected (Softplus)
    coeff : FullyConnected (Softmax)
    """
    n_dim = y.ndim
    shape_y = y.shape
    y = y.reshape((-1, shape_y[-1]))
    y = y.dimshuffle(0, 1, "x")

    mu = mu.reshape((-1, mu.shape[-1] / coeff.shape[-1], coeff.shape[-1]))
    sig = sig.reshape((-1, sig.shape[-1] / coeff.shape[-1], coeff.shape[-1]))
    coeff = coeff.reshape((-1, coeff.shape[-1]))

    inner = -0.5 * T.sum(T.sqr(y - mu) / sig ** 2 + 2 * T.log(sig) +
                         T.log(2 * np.pi), axis=-2)
    nll = -logsumexp(T.log(coeff) + inner, axis=-1)

    # Adjust dimension
    new_dim = T.set_subtensor(shape_y[-1], 1)
    nll = nll.reshape(new_dim, ndim=n_dim)
    nll = nll.flatten(n_dim - 1)

    return nll
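# The `logsumexp` helper used above is not defined in this snippet; a minimal
# Theano-style sketch of the usual max-shifted formulation is given below (its
# exact signature in the original codebase is assumed).
import theano.tensor as T

def logsumexp(x, axis=None):
    x_max = T.max(x, axis=axis, keepdims=True)
    z = T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=True)) + x_max
    return z.sum(axis=axis)   # collapse the kept singleton axis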
def lp_norm(self, n, k, r, c, z):
    '''
    Lp = ( 1/n * sum(|x_i|^p, 1..n))^(1/p) where p = 1 + ln(1+e^P)
    :param n:
    :param k:
    :param r:
    :param c:
    :param z:
    :return:
    '''
    ds0, ds1 = self.pool_size
    st0, st1 = self.stride
    pad_h = self.pad[0]
    pad_w = self.pad[1]

    row_st = r * st0
    row_end = T.minimum(row_st + ds0, self.img_rows)
    row_st = T.maximum(row_st, self.pad[0])
    row_end = T.minimum(row_end, self.x_m2d + pad_h)

    col_st = c * st1
    col_end = T.minimum(col_st + ds1, self.img_cols)
    col_st = T.maximum(col_st, self.pad[1])
    col_end = T.minimum(col_end, self.x_m1d + pad_w)

    Lp = T.pow(
        T.mean(T.pow(
            T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1)),
            1 + T.log(1 + T.exp(self.P))
        )),
        1 / (1 + T.log(1 + T.exp(self.P)))
    )

    return T.set_subtensor(z[n, k, r, c], Lp)
def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol,
                            log_scale=True):
    """
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.

    This function computes the CTC log likelihood for a sequence that has
    been augmented with blank labels.
    """
    y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype="int32")
    y_mask_len = tensor.sum(y_mask, axis=0, dtype="int32")

    if log_scale:
        log_probabs = _log_path_probabs(y, tensor.log(y_hat),
                                        y_mask, y_hat_mask,
                                        blank_symbol)
        batch_size = log_probabs.shape[1]
        # Add the probabilities of the final time steps to get the total
        # sequence likelihood.
        log_labels_probab = _log_add(
            log_probabs[y_hat_mask_len - 1,
                        tensor.arange(batch_size),
                        y_mask_len - 1],
            log_probabs[y_hat_mask_len - 1,
                        tensor.arange(batch_size),
                        y_mask_len - 2])
    else:
        probabilities = _path_probabs(y, y_hat,
                                      y_mask, y_hat_mask,
                                      blank_symbol)
        batch_size = probabilities.shape[1]
        labels_probab = (probabilities[y_hat_mask_len - 1,
                                       tensor.arange(batch_size),
                                       y_mask_len - 1] +
                         probabilities[y_hat_mask_len - 1,
                                       tensor.arange(batch_size),
                                       y_mask_len - 2])
        log_labels_probab = tensor.log(labels_probab)
    return log_labels_probab
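# Hedged sketch (NumPy, signature assumed) of a log-space addition helper such
# as the `_log_add` used above: log(exp(a) + exp(b)) computed stably by
# factoring out the maximum.
import numpy as np

def log_add(a, b):
    m = np.maximum(a, b)
    return m + np.log1p(np.exp(-np.abs(a - b)))

assert np.allclose(log_add(np.log(2.0), np.log(3.0)), np.log(5.0))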
def __init__(self, n_in, n_out, n_h, learning_rate=0.12):
    self.x = T.matrix(dtype=theano.config.floatX)  # @UndefinedVariable
    self.target = T.matrix(dtype=theano.config.floatX)  # @UndefinedVariable

    bound_x = numpy.sqrt(6. / (n_in + n_h))
    bound_h = numpy.sqrt(6. / (n_h + n_h))
    self.params = []
    self.w_x = theano.shared(np.array(np.random.uniform(low=-bound_x, high=bound_x,
                                                        size=(n_in, n_h)),
                                      dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.w_x)
    self.w_h = theano.shared(np.array(np.random.uniform(low=-bound_h, high=bound_h,
                                                        size=(n_h, n_h)),
                                      dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.w_h)
    self.b_h = theano.shared(np.array(np.random.uniform(low=-bound_h, high=bound_h,
                                                        size=(n_h,)),
                                      dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.b_h)
    self.w = theano.shared(np.array(np.random.uniform(low=-bound_h, high=bound_h,
                                                      size=(n_h, n_out)),
                                    dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.w)
    self.b = theano.shared(np.array(np.random.uniform(low=-bound_h, high=bound_h,
                                                      size=(n_out,)),
                                    dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.b)
    self.h0 = theano.shared(np.array(np.random.uniform(low=-bound_x, high=bound_x,
                                                       size=(n_h,)),
                                     dtype=theano.config.floatX))  # @UndefinedVariable
    self.params.append(self.h0)

    def one_step(x, h1):
        h = T.nnet.sigmoid(T.dot(x, self.w_x) + T.dot(h1, self.w_h) + self.b_h)
        y = T.nnet.sigmoid(T.dot(h, self.w) + self.b)
        return h, y

    [hs, ys], _ = theano.scan(fn=one_step,
                              sequences=self.x,
                              outputs_info=[self.h0, None])

    cost = -T.mean(self.target * T.log(ys) + (1 - self.target) * T.log(1 - ys))
    grads = T.grad(cost, self.params)
    updates = [(param, param - learning_rate * grad)
               for param, grad in zip(self.params, grads)]

    self.train = theano.function([self.x, self.target], cost, updates=updates)
    self.predict = theano.function([self.x], ys)
def get_cost_updates(self, contraction_level, learning_rate):
    """ This function computes the cost and the updates for one training
    step of the cA """

    y = self.get_hidden_values(self.x)
    z = self.get_reconstructed_input(y)
    J = self.get_jacobian(y, self.W)
    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    self.L_rec = - T.sum(self.x * T.log(z) +
                         (1 - self.x) * T.log(1 - z),
                         axis=1)

    # Compute the jacobian and average over the number of samples/minibatch
    self.L_jacob = T.sum(J ** 2) // self.n_batchsize

    # note : L is now a vector, where each element is the
    #        cross-entropy cost of the reconstruction of the
    #        corresponding example of the minibatch. We need to
    #        compute the average of all these to get the cost of
    #        the minibatch
    cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)

    # compute the gradients of the cost of the `cA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates
    updates = []
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    return (cost, updates)
def test_log1msigm_to_softplus(self):
    x = T.matrix()

    out = T.log(1 - sigmoid(x))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    assert isinstance(topo[0].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[1].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a flatten
    out = T.log(1 - T.flatten(sigmoid(x)))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, T.Flatten)
    assert isinstance(topo[1].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[2].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a reshape
    out = T.log(1 - sigmoid(x).reshape([x.size]))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    # assert len(topo) == 3
    assert any(isinstance(node.op, T.Reshape) for node in topo)
    assert any(isinstance(getattr(node.op, 'scalar_op', None),
                          theano.tensor.nnet.sigm.ScalarSoftplus)
               for node in topo)
    f(numpy.random.rand(54, 11).astype(config.floatX))
def negative_log_likelihood(self, y):
    """ Return the mean of the negative log-likelihood of the prediction
    of this model under a given target distribution.

    .. math::

        \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
        \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
            \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
        \ell (\theta=\{W,b\}, \mathcal{D})

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label

    Note: we use the mean instead of the sum so that
          the learning rate is less dependent on the batch size
    """
    # y.shape[0] is (symbolically) the number of rows in y, i.e.,
    # number of examples (call it n) in the minibatch
    # T.arange(y.shape[0]) is a symbolic vector which will contain
    # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
    # Log-Probabilities (call it LP) with one row per example and
    # one column per class LP[T.arange(y.shape[0]),y] is a vector
    # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
    # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
    # the mean (across minibatch examples) of the elements in v,
    # i.e., the mean log-likelihood across the minibatch.
    if self.is_binary:
        return -T.mean(T.log(self.p_y_given_x))
    return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
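# Small standalone NumPy illustration (not part of the class above) of the
# indexing trick described in the comments: LP[arange(n), y] picks, for each
# row i, the log-probability of that example's correct class y[i].
import numpy as np

LP = np.log(np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1]]))    # 2 examples, 3 classes
y = np.array([0, 1])                        # correct labels
picked = LP[np.arange(y.shape[0]), y]       # [log 0.7, log 0.8]
nll = -picked.mean()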
def simple_RNN(nh):
    Wx = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, (1, nh)).astype(theano.config.floatX))
    Wh = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh)).astype(theano.config.floatX))
    Wy = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, (nh, 1)).astype(theano.config.floatX))
    bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
    by = theano.shared(numpy.zeros(1, dtype=theano.config.floatX))
    h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
    p = [Wx, Wh, Wy, bh, by, h0]

    x = T.matrix()

    def recurrence(x_t, h_tm1):
        h_t = T.tanh(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh)
        s_t = T.dot(h_t, Wy) + by
        return [h_t, s_t]

    ([h, activations], updates) = theano.scan(fn=recurrence,
                                              sequences=x,
                                              outputs_info=[h0, dict()])

    t = x[0, 0]
    s = activations[-1, 0]
    y = T.nnet.sigmoid(s)
    loss = -t * T.log(y + 1e-14) - (1 - t) * T.log((1 - y) + 1e-14)
    acc = T.neq(T.round(y), t)

    return p, [x], s, [loss, acc], h
def get_cost_updates(self, contraction_level, learning_rate, cost_measure="cross_entropy"):
    """ This function computes the cost and the updates for one training
    step of the cA """

    y = self.get_hidden_values(self.x)
    z = self.get_reconstructed_input(y)
    J = self.get_jacobian(y, self.W)

    if cost_measure == "cross_entropy":
        # self.L_rec = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        self.L_rec = T.mean(- T.sum(self.x * T.log(z) +
                                    (1 - self.x) * T.log(1 - z),
                                    axis=1))
    elif cost_measure == "euclidean":
        self.L_rec = T.mean(T.sum((self.x - z) ** 2, axis=1))

    # Compute the jacobian and average over the number of samples/minibatch
    self.L_jacob = T.mean(T.sum(J ** 2) / self.n_batchsize)

    cost = self.L_rec + contraction_level * self.L_jacob

    # compute the gradients of the cost of the `cA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates
    updates = []
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    return (cost, updates)
def forward_jacobian_log_det(self, x):
    dy_dx, _ = th.scan(lambda x_i: th.grad(self.forward_func(x_i), x_i),
                       sequences=[x.flatten()])
    if self.fudge != 0.:
        return tt.log(dy_dx + self.fudge).sum()
    else:
        return tt.log(dy_dx).sum()
def forward_jacobian_log_det(self, x):
    y_sum = self.forward_map(x).sum()
    dy_dx = th.grad(y_sum, x)
    if self.fudge != 0.:
        return tt.log(dy_dx + self.fudge).sum()
    else:
        return tt.log(dy_dx).sum()
def get_sparsity_cost(self):
    # update mean activation using exponential moving average
    hack_h = self.h_given_v(self.sp_pos_v)

    # define loss based on value of sp_type
    if self.sp_type == 'kl':
        eps = npy_floatX(1. / self.batch_size)
        loss = lambda targ, val: - npy_floatX(targ) * T.log(eps + val) \
                                 - npy_floatX(1 - targ) * T.log(1 - val + eps)
    else:
        raise NotImplementedError('Sparsity type %s is not implemented' % self.sp_type)

    cost = T.zeros((), dtype=floatX)
    params = []
    if self.sp_weight['h']:
        cost += self.sp_weight['h'] * T.sum(loss(self.sp_targ['h'],
                                                 hack_h.mean(axis=0)))
        params += [self.hbias]

    if self.sp_type in ['kl'] and self.sp_weight['h']:
        params += [self.Wv, self.alpha, self.mu]
        if self.flags['split_norm']:
            params += [self.scalar_norms]

    return costmod.Cost(cost, params)
def forward_jacobian_log_det(self, x):
    if x.ndim == 1:
        return tt.log(tt.abs_(self.diag_weights)).sum()
    elif x.ndim == 2:
        return x.shape[0] * tt.log(tt.abs_(self.diag_weights)).sum()
    else:
        raise ValueError('x must be one or two dimensional.')
def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
    cross_entropy = T.mean(
        T.sum(self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
              (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
              axis=1))

    return cross_entropy
def hinge_loss(self, y):
    return -T.mean(T.log(self.p_y_given_x)[:, y])  # TODO
def forward(self, x):
    return tt.switch(x < 1, tt.log(x), x - 1.0)
def log(x):
    return T.log(x)
def logp(self, value):
    w = self.w

    return bound(logsumexp(tt.log(w) + self._comp_logp(value), axis=-1).sum(),
                 w >= 0, w <= 1, tt.allclose(w.sum(axis=-1), 1))
def __init__(self, dim, n_entities, batch_size=None, validation_samples=2):
    self.__dict__.update(locals())
    del self.self

    theano_rng = RandomStreams(numpy.random.randint(2 ** 30))

    # Start by defining the graph

    ## Parameter setup
    self.emb = theano.shared((numpy.random.uniform(
        -1.0, 1.0, (self.n_entities, self.dim))).astype(theano.config.floatX))
    self.emb.tag.test_value = (numpy.random.uniform(
        -1.0, 1.0, (self.n_entities, self.dim))).astype(theano.config.floatX)

    self.a = theano.shared(numpy.asarray(1.0).astype(theano.config.floatX))
    self.b = theano.shared(numpy.asarray(0.0).astype(theano.config.floatX))

    self.params = [self.emb, self.a, self.b]

    ### Input setup!
    self.x1_idxs = T.ivector()
    self.x2_idxs = T.ivector()
    self.x1_idxs.tag.test_value = numpy.asarray([0, 1], dtype=numpy.int32)
    self.x2_idxs.tag.test_value = numpy.asarray([1, 2], dtype=numpy.int32)

    # generate negative samples
    choice = theano_rng.binomial(size=self.x1_idxs.shape)
    alternative = theano_rng.random_integers(size=self.x1_idxs.shape,
                                             low=0, high=n_entities - 1)
    self.x1_idxs_negative = T.switch(choice, self.x1_idxs, alternative)
    self.x2_idxs_negative = T.switch(choice, alternative, self.x2_idxs)

    ### Define graph from input to predictive loss
    def get_embed(index_tensor):
        return sigmoid(self.emb[index_tensor].reshape(
            (index_tensor.shape[0], self.dim)))

    x1_emb = get_embed(self.x1_idxs)
    x2_emb = get_embed(self.x2_idxs)
    x1neg_emb = get_embed(self.x1_idxs_negative)
    x2neg_emb = get_embed(self.x2_idxs_negative)

    def get_prob1(embed_tensor1, embed_tensor2):
        return sigmoid(
            self.a * T.mean(embed_tensor1 * embed_tensor2 +
                            (1 - embed_tensor1) * (1 - embed_tensor2),
                            axis=1) + self.b)  # probability of a link, 0 to 1

    self.loss = T.mean(-T.log(get_prob1(x1_emb, x2_emb)) -
                       T.log(1 - get_prob1(x1neg_emb, x2neg_emb)))

    ### Define graph from input to sampled/validated loss
    randomizationA = theano_rng.uniform(size=(self.validation_samples, self.dim))
    randomizationB = theano_rng.uniform(size=(self.validation_samples, self.dim))
def softplus_f(v):
    return T.log(1 + T.exp(v))
def hinge_loss_sum(self, y):
    return -T.sum(T.log(self.p_y_given_x)[:, y])
def tlogit(x):
    return T.log(x / (np.float32(1) - x))
b2_init = np.zeros(output_size)

thX = T.matrix("X")
thT = T.matrix("T")
W1 = theano.shared(W1_init, "W1")
W2 = theano.shared(W2_init, "W2")
b1 = theano.shared(b1_init, "b1")
b2 = theano.shared(b2_init, "b2")

thZ = T.nnet.relu(thX.dot(W1) + b1)
thY = T.nnet.softmax(thZ.dot(W2) + b2)
prediction = T.argmax(thY, axis=1)

cost = -(thT * T.log(thY)).sum() + reg * ((W1 * W1).sum() + (b1 * b1).sum() +
                                          (W2 * W2).sum() + (b2 * b2).sum())

update_W1 = W1 - lr * T.grad(cost, W1)
update_b1 = b1 - lr * T.grad(cost, b1)
update_W2 = W2 - lr * T.grad(cost, W2)
update_b2 = b2 - lr * T.grad(cost, b2)

train = theano.function([thX, thT],
                        updates=[(W1, update_W1), (W2, update_W2),
                                 (b1, update_b1), (b2, update_b2)])
get_prediction = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

costs = []
def negative_log_likelihood_sum(self, y):
    return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
def kld(self, mu, var):
    return 0.5 * T.sum(1 + T.log(var) - mu ** 2 - var, axis=1)
def LME(x, axis=None, dtype=None, keepdims=False, acc_dtype=None):
    return T.log(T.mean(T.exp(x), axis, dtype, keepdims, acc_dtype))
def f_softplus(x):
    return T.log(T.exp(x) + 1)  # - np.log(2)
def cost_nll(self, pred, label):
    cost = -T.log(pred) * label
    cost = T.mean(T.sum(cost, axis=1))
    return cost
def __init__(self, We_initial, params):
    if params.maxval:
        self.nout = params.maxval - params.minval + 1

    if params.traintype == "reg" or params.traintype == "rep":
        p = cPickle.load(file(params.regfile, 'rb'))
        print p  # contains We
        if params.traintype == "reg":
            print "regularizing to parameters"
        if params.traintype == "rep":
            print "not updating embeddings"

    # params
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))

    if params.traintype == "reg":
        initial_We = theano.shared(
            np.asarray(p[0].get_value(), dtype=config.floatX))
        We = theano.shared(
            np.asarray(p[0].get_value(), dtype=config.floatX))

    if params.traintype == "rep":
        We = theano.shared(
            np.asarray(p[0].get_value(), dtype=config.floatX))

    g1batchindices = T.imatrix()
    g2batchindices = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    scores = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None, 1))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=We.get_value().shape[0],
        output_size=We.get_value().shape[1],
        W=We)
    l_out = lasagne_average_layer([l_emb, l_mask])

    embg1 = lasagne.layers.get_output(l_out, {
        l_in: g1batchindices,
        l_mask: g1mask
    })
    embg2 = lasagne.layers.get_output(l_out, {
        l_in: g2batchindices,
        l_mask: g2mask
    })

    g1_dot_g2 = embg1 * embg2
    g1_abs_g2 = abs(embg1 - embg2)

    lin_dot = lasagne.layers.InputLayer((None, We.get_value().shape[1]))
    lin_abs = lasagne.layers.InputLayer((None, We.get_value().shape[1]))
    l_sum = lasagne.layers.ConcatLayer([lin_dot, lin_abs])
    l_sigmoid = lasagne.layers.DenseLayer(
        l_sum, params.memsize,
        nonlinearity=lasagne.nonlinearities.sigmoid)

    if params.task == "sim":
        l_softmax = lasagne.layers.DenseLayer(l_sigmoid, self.nout,
                                              nonlinearity=T.nnet.softmax)
        X = lasagne.layers.get_output(l_softmax, {
            lin_dot: g1_dot_g2,
            lin_abs: g1_abs_g2
        })
        Y = T.log(X)

        cost = scores * (T.log(scores) - Y)
        cost = cost.sum(axis=1) / (float(self.nout))

        prediction = 0.
        i = params.minval
        while i <= params.maxval:
            prediction = prediction + i * X[:, i - 1]
            i += 1
    elif params.task == "ent":
        l_softmax = lasagne.layers.DenseLayer(l_sigmoid, 3,
                                              nonlinearity=T.nnet.softmax)
        X = lasagne.layers.get_output(l_softmax, {
            lin_dot: g1_dot_g2,
            lin_abs: g1_abs_g2
        })

        cost = theano.tensor.nnet.categorical_crossentropy(X, scores)
        prediction = T.argmax(X, axis=1)
    else:
        raise ValueError('Params.task not set correctly.')

    # if params.l_out == '':
    #     lasagne.layers.set_all_param_values(l_out, s)
    #
    # if params.l_so

    self.network_params = lasagne.layers.get_all_params(
        l_out, trainable=True) + lasagne.layers.get_all_params(
        l_softmax, trainable=True)
    self.network_params.pop(0)
    self.all_params = lasagne.layers.get_all_params(
        l_out, trainable=True) + lasagne.layers.get_all_params(
        l_softmax, trainable=True)

    reg = self.getRegTerm(params, We, initial_We)
    self.trainable = self.getTrainableParams(params)
    cost = T.mean(cost) + reg

    self.feedforward_function = theano.function([g1batchindices, g1mask], embg1)
    self.scoring_function = theano.function(
        [g1batchindices, g2batchindices, g1mask, g2mask], prediction)
    self.cost_function = theano.function(
        [scores, g1batchindices, g2batchindices, g1mask, g2mask], cost)

    grads = theano.gradient.grad(cost, self.trainable)
    if params.clip:
        grads = [
            lasagne.updates.norm_constraint(grad, params.clip, range(grad.ndim))
            for grad in grads
        ]
    updates = params.learner(grads, self.trainable, params.eta)
    self.train_function = theano.function(
        [scores, g1batchindices, g2batchindices, g1mask, g2mask],
        cost,
        updates=updates)
def multivariate_bernoulli(self, y_pred, y_true):
    return T.sum(y_true * T.log(y_pred) + (1 - y_true) * T.log(1 - y_pred),
                 axis=1)
def _compute_losses(self, model_output):
    # model_output.shape : (batch_size, seq_len, K, M, target_size)
    # self.dataset.symb_targets.shape = (batch_size, seq_len+K-1, target_dims)
    # mask.shape : (batch_size, seq_len) or None
    mask = self.dataset.symb_mask

    # mu.shape = (batch_size, seq_len, K, M, target_dims)
    mu = model_output[:, :, :, :, 0:3]

    # sigma.shape = (batch_size, seq_len, K, M, target_dims)
    sigma = model_output[:, :, :, :, 3:6]

    # Stack K targets for each input (sliding window style)
    # targets.shape = (batch_size, seq_len, K, target_dims)
    targets = T.stack(
        [self.dataset.symb_targets[:, i:(-self.model.k + i + 1) or None]
         for i in range(self.model.k)], axis=2)

    # Add new axis for sum over M
    # targets.shape = (batch_size, seq_len, K, 1, target_dims)
    targets = targets[:, :, :, None, :]

    # For monitoring the L2 error of using $mu$ as the predicted direction
    # (should be comparable to MICCAI's work).
    normalized_mu = mu[:, :, 0, 0] / l2distance(mu[:, :, 0, 0],
                                                keepdims=True, eps=1e-8)
    normalized_targets = targets[:, :, 0, 0] / l2distance(targets[:, :, 0, 0],
                                                          keepdims=True, eps=1e-8)
    self.L2_error_per_item = T.sqrt(
        T.sum(((normalized_mu - normalized_targets) ** 2), axis=2))
    if mask is not None:
        self.mean_sqr_error = T.sum(self.L2_error_per_item * mask,
                                    axis=1) / T.sum(mask, axis=1)
    else:
        self.mean_sqr_error = T.mean(self.L2_error_per_item, axis=1)

    # Likelihood of multivariate gaussian (n dimensions) is :
    # ((2 \pi)^D |\Sigma|)^{-1/2} exp(-1/2 (x - \mu)^T \Sigma^-1 (x - \mu))
    # We suppose a diagonal covariance matrix, so we have :
    #   => |\Sigma| = \prod_n \sigma_n^2
    #   => (x - \mu)^T \Sigma^-1 (x - \mu) = \sum_n ((x_n - \mu_n) / \sigma_n)^2
    m_log_likelihoods = (-np.float32((self.target_dims / 2.) * np.log(2 * np.pi))
                         + T.sum(-T.log(sigma)
                                 - 0.5 * T.sqr((targets - mu) / sigma),
                                 axis=4))

    # k_losses_per_timestep.shape : (batch_size, seq_len, K)
    self.k_losses_per_timestep = T.log(self.m) - logsumexp(
        m_log_likelihoods, axis=3, keepdims=False)

    # loss_per_timestep.shape : (batch_size, seq_len)
    self.loss_per_time_step = T.mean(self.k_losses_per_timestep, axis=2)

    # Average over sequence steps.
    # k_nlls_per_seq.shape : (batch_size, K)
    if mask is not None:
        self.k_losses_per_seq = T.sum(
            self.k_losses_per_timestep * mask[:, :, None],
            axis=1) / T.sum(mask, axis=1, keepdims=True)
    else:
        self.k_losses_per_seq = T.mean(self.k_losses_per_timestep, axis=1)

    # Average over K
    # loss_per_seq.shape : (batch_size,)
    self.loss_per_seq = T.mean(self.k_losses_per_seq, axis=1)
    return self.loss_per_seq
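# Hedged NumPy/SciPy check (illustration only, not from the original class):
# the diagonal-covariance Gaussian log-likelihood written in the comments,
#   -D/2 * log(2*pi) + sum_n(-log(sigma_n) - 0.5 * ((x_n - mu_n) / sigma_n)**2),
# matches a dense multivariate normal with covariance diag(sigma**2).
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
D = 3
x, mu = rng.randn(D), rng.randn(D)
sigma = rng.rand(D) + 0.5

manual = (-(D / 2.) * np.log(2 * np.pi)
          + np.sum(-np.log(sigma) - 0.5 * ((x - mu) / sigma) ** 2))
reference = multivariate_normal.logpdf(x, mean=mu, cov=np.diag(sigma ** 2))
assert np.allclose(manual, reference)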
def factors(self, x, z, A):
    v = self.v
    w = self.w
    '''
    z is unused
    x['x'] is the data

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w.

    z is computed with g(.) from eps and variational parameters
    let logpx be the generative model density: log p(x|z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    def f_softplus(x):
        return T.log(T.exp(x) + 1)  # - np.log(2)

    def f_rectlin(x):
        return x * (x > 0)

    def f_rectlin2(x):
        return x * (x > 0) + 0.01 * x

    nonlinear = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'softplus': f_softplus,
        'rectlin': f_rectlin,
        'rectlin2': f_rectlin2
    }
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    # rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Compute q(z|x,y)
    #
    # it seems that z = f(v['w0x'] * x + v['w0y'] * y + b)
    #
    hidden_q = [
        nonlinear_q(
            T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) +
            T.dot(v['b0'], A))
    ]
    for i in range(1, len(self.n_hidden_q)):
        hidden_q.append(
            nonlinear_q(
                T.dot(v['w' + str(i)], hidden_q[-1]) +
                T.dot(v['b' + str(i)], A)))

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
            v['logvar_b'], A)
    else:
        raise Exception()

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior'], x['y']] + [A],
                                   [q_mean, q_logvar])

    # Compute virtual sample
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    #
    # log p(x | z, y)
    # It seems that x = f((w0y * y + w0z * z) + b0)
    #
    hidden_p = [
        nonlinear_p(
            T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) +
            T.dot(w['b0'], A))
    ]
    for i in range(1, len(self.n_hidden_p)):
        hidden_p.append(
            nonlinear_p(
                T.dot(w['w' + str(i)], hidden_p[-1]) +
                T.dot(w['b' + str(i)], A)))

    if self.dropout:
        hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape,
                                          dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(
            T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
            w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A],
                                       [x_mean, x_logvar])
    elif self.type_px == 'laplace':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
            w['out_logvar_b'], A)
        _logpx = ap.logpdfs.laplace(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A],
                                       [x_mean, x_logvar])
    else:
        raise Exception("")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(y) (prior of y)
    # _logpy = w['logpy']
    # if self.uniform_y: _logpy *= 0
    # py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
    # logpy = (- T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1,-1))
    # logpx += logpy
    # self.dist_px['y'] = theanofunc([A], py_model)

    # log p(z) (prior of z)
    #
    # E_q[log(p(z))]
    #
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (
            (q_mean - x['mean_prior']) ** 2 + T.exp(q_logvar))).sum(
                axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(
                ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A),
                                   T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(
            float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                              A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    #
    # E_q[-log(q)]
    #
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
            axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                             keepdims=True)
    else:
        raise Exception()

    # Note: logpv and logpw are scalars
    def f_prior(_w, prior_sd=self.prior_sd):
        return ap.logpdfs.normal(_w, 0, prior_sd).sum()

    logpv = 0
    logpv += f_prior(v['w0x'])
    logpv += f_prior(v['w0y'])
    for i in range(1, len(self.n_hidden_q)):
        logpv += f_prior(v['w' + str(i)])
    logpv += f_prior(v['mean_w'])
    if self.type_qz in ['gaussian', 'gaussianmarg']:
        logpv += f_prior(v['logvar_w'])

    logpw = 0
    logpw += f_prior(w['w0y'])
    logpw += f_prior(w['w0z'])
    for i in range(1, len(self.n_hidden_p)):
        logpw += f_prior(w['w' + str(i)])
    logpw += f_prior(w['out_w'])
    if self.type_px in ['sigmoidgaussian', 'gaussian', 'laplace']:
        logpw += f_prior(w['out_logvar_w'])
    if self.type_pz == 'studentt':
        logpw += f_prior(w['logv'])

    # return logpv, logpw, logpx, logpz, logqz
    return logpx, logpz, logqz
def KL(self, y):
    return T.mean(y * T.log(y / self.y_pred) +
                  (1 - y) * T.log((1 - y) / (1 - self.y_pred)))
x = T.dmatrix("x") y = T.dvector("y") learning_rate = T.dscalar("lr") # declare the weight w and b w = theano.shared(value=numpy.random.rand(feat), name="w") b = theano.shared(value=0., name="b") print("initialized weights \n") print(w.get_value()) print(b.get_value()) # build the graph output = 1/(1+T.exp(-T.dot(x, w)-b)) prediction = output > 0.5 cross_entropy = -y * T.log(output) - (1-y)*T.log(1-output) loss = cross_entropy.mean() + 0.01*(w**2).sum() gradW, gradb = T.grad(loss, [w, b]) # train function train = theano.function(inputs=[x,y,learning_rate], outputs=[prediction, cross_entropy,loss, learning_rate], \ updates=((w,w-learning_rate*gradW), (b,b-learning_rate*gradb))) # predict function predict = theano.function(inputs=[x], outputs=prediction) for i in range(training_step): if (i < 1000): learning_rate = 0.1 else: learning_rate =0.01 pred, cro, l,lr = train(D[0], D[1], learning_rate)
def NLL(probs, labels):
    # labels are not one-hot code
    return -T.mean(
        T.log(probs)[T.arange(labels.shape[0]), T.cast(labels, 'int32')])
def loss(self, delta):
    # return T.log(1 + T.exp(euclid(self.output1, self.output2)))
    # return T.log(1 + T.exp(T.sqrt(T.sum(T.sqr(self.output1 - self.output2)))))
    # return T.log(1 + T.exp(T.sqrt(T.sum(T.sqr(self.output1)))))
    return T.log(1 + T.exp(delta * (T.sum(T.sqr(self.output1 - self.output2)))))
def trainMB(self, V_egMin, noOfEpoch, noOfMiniBatchEx): """ trains the current RBM object, returns nothing with parameter updates being internal args: V_egMin (theano.shared 2D array): call eval() to supply as argument. rows of this are input examples. V_egMin[N:M] extracts M-N examples, each of size noOfVisible units noOfEpoch (int): total number of Epoch to simulate, each Epoch goes through V_egMin noOfMiniBatchEx (int): number of examples to be grouped into minibatches """ self.miniBatchSize = noOfMiniBatchEx print("size of input example is: " + str(V_egMin.shape)) V_egM = T.matrix(name="T_egM", dtype=theano.config.floatX) [V_CDmAcc, H_CDmAcc, H_CDmean, V_CDmean] , scan_updates = theano.scan(self.vtovMBall, outputs_info=[V_egM, None, None, None] , n_steps=self.CD_n) V_CDm = V_CDmAcc[-1] #these are matrixes H_CDm = H_CDmAcc[-1] #these are matrixes H_egM = self.vtohMB(V_egM) energyVector_eg = self.energyFnMB(V_egM, H_egM) energyVector_cd = self.energyFnMB(V_CDm, H_CDm) costFn = T.mean(energyVector_eg, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) - T.mean(energyVector_cd, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) Ta_grad, Tb_grad, Tz_grad, Tomg_grad = T.grad(cost=costFn, wrt=[self.T_a, self.T_b, self.T_z, self.T_omega], consider_constant=[V_egM, H_egM, V_CDm, H_CDm]) #regular gradient gradFromMB = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], allow_input_downcast=True, updates = scan_updates + [(self.T_a, self.T_a + self.aRate*Ta_grad), (self.T_b, self.T_b + self.bRate*Tb_grad), (self.T_z, self.T_z + self.sigmaRate*Tz_grad), (self.T_omega, self.T_omega + self.omegaRate*Tomg_grad)], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #rprop: Code not used Ta_rpropMag = T.mul(T.abs_(self.Ta_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))-np.float32(2.0)))) Ta_rprop = T.mul(T.sgn(Ta_grad),Ta_rpropMag.clip(np.float32(self.epsilon),50)) Tb_rpropMag = T.mul(T.abs_(self.Tb_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))-np.float32(2.0)))) Tb_rprop = T.mul(T.sgn(Tb_grad),Tb_rpropMag.clip(np.float32(self.epsilon),50)) Tz_rpropMag = T.mul(T.abs_(self.Tz_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))-np.float32(2.0))) ) Tz_rprop = T.mul(T.sgn(Tz_grad),Tz_rpropMag.clip(np.float32(self.epsilon),50)) Tomg_rpropMag = T.mul(T.abs_(self.Tomg_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))-np.float32(2.0)))) Tomg_rprop = T.mul(T.sgn(Tomg_grad),Tomg_rpropMag.clip(np.float32(self.epsilon),50)) gradFromMBrprop = theano.function(inputs=[V_egM], outputs=[Ta_rprop, Tb_rprop, Tz_rprop, Tomg_rprop], allow_input_downcast=True, updates = scan_updates + [(self.T_a, self.T_a + Ta_rprop), (self.T_b, self.T_b + Tb_rprop), (self.T_z, self.T_z + Tz_rprop), (self.T_omega, self.T_omega + Tomg_rprop), (self.Ta_grad_prev, Ta_rprop), (self.Tb_grad_prev, Tb_rprop), (self.Tz_grad_prev, Tz_rprop), (self.Tomg_grad_prev, Tomg_rprop)], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #RMSprop only: [a_grad, b_grad, z_grad, 
omg_grad] = gradFromMB(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction if (not(self.parameterLoaded) and not(self.parameterSaved)): self.Ta_rms.set_value(np.float32(np.abs(a_grad))) # = theano.shared(value = np.float32(np.abs(a_grad)), name = 'Ta_rms', borrow=True, allow_downcast=True) Tb_rms = theano.shared(value = np.float32(np.abs(b_grad)), name = 'Tb_rms', borrow=True, allow_downcast=True) Tz_rms = theano.shared(value = np.float32(np.abs(z_grad)), name = 'Tz_rms', borrow=True, allow_downcast=True) Tomg_rms = theano.shared(value = np.float32(np.abs(omg_grad)), name = 'Tomg_rms', borrow=True, allow_downcast=True) gradFromMBRMSprop = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], allow_input_downcast=True, updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))), (Tb_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tb_rms,Tb_rms))+T.mul(np.float32(0.1),T.mul(Tb_grad,Tb_grad)))), (Tz_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tz_rms,Tz_rms))+T.mul(np.float32(0.1),T.mul(Tz_grad,Tz_grad)))), (Tomg_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tomg_rms,Tomg_rms))+T.mul(np.float32(0.1),T.mul(Tomg_grad,Tomg_grad)))), (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)), (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad,T.maximum(np.float32(self.epsilon),Tb_rms)**-1)), (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad,T.maximum(np.float32(self.epsilon),Tz_rms)**-1)), (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad,T.maximum(np.float32(self.epsilon),Tomg_rms)**-1))], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #sparse hidden units optimization + RMSprop: #first calculate probability of hidden units firing given visible examples: aVomg = T.dot(T.mul(T.fill(V_egM, T.exp(-self.T_z)), V_egM), self.T_omega) aT_Hp = T.nnet.sigmoid(T.fill(aVomg, self.T_b) + aVomg)#T.nnet.ultra_fast_sigmoid() did not work for us aT_HpMean = T.mean(aT_Hp) # mean activation over minibatch and all Hk #cross entropy between mean hidden unit activation and target mean activation probability "self.sparseTargetp" sparseHcost = T.mul(np.float32(-self.sparseTargetp), T.log(aT_HpMean)) - T.mul((np.float32(1.0)-self.sparseTargetp), T.log(np.float32(1.0)-aT_HpMean)) Tb_gradH, Tz_gradH, Tomg_gradH = T.grad(cost=sparseHcost, wrt=[self.T_b, self.T_z, self.T_omega], consider_constant=[V_egM]) sparseGradFn = theano.function(inputs = [V_egM], outputs =[Tb_gradH, Tz_gradH, Tomg_gradH], allow_input_downcast=True, mode = 'FAST_RUN') [b_gradH, z_gradH, omg_gradH] = sparseGradFn(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction if (not(self.parameterLoaded) and not(self.parameterSaved)): self.Tb_rmsH.set_value(np.float32(np.abs(b_grad - b_gradH))) self.Tz_rmsH.set_value(np.float32(np.abs(z_grad - z_gradH))) self.Tomg_rmsH.set_value(np.float32(np.abs(omg_grad - omg_gradH))) gradSparseH = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad, Tb_gradH, Tz_gradH, Tomg_gradH], allow_input_downcast=True, updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))), (self.Tb_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tb_rmsH,self.Tb_rmsH))+T.mul(np.float32(0.1),T.mul(Tb_grad-Tb_gradH,Tb_grad-Tb_gradH)))), (self.Tz_rmsH, 
T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tz_rmsH,self.Tz_rmsH))+T.mul(np.float32(0.1),T.mul(Tz_grad-Tz_gradH,Tz_grad-Tz_gradH)))), (self.Tomg_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tomg_rmsH,self.Tomg_rmsH))+T.mul(np.float32(0.1),T.mul(Tomg_grad-Tomg_gradH,Tomg_grad-Tomg_gradH)))), (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)), (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad-Tb_gradH,T.maximum(np.float32(self.epsilon),self.Tb_rmsH)**-1)), (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad-Tz_gradH,T.maximum(np.float32(self.epsilon),self.Tz_rmsH)**-1)), (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad-Tomg_gradH,T.maximum(np.float32(self.epsilon),self.Tomg_rmsH)**-1))], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #reconstruction errors: [V_egM_recon, H_egM_reconStub, H_meanStubC, V_meanStubC] = self.vtovMBall(V_egM) V_error = V_egM - V_egM_recon V_errorSqr = T.mul(V_error, V_error) reconError = theano.function(inputs = [V_egM], outputs = [T.mean(T.sum(V_errorSqr,axis=1, acc_dtype=theano.config.floatX), acc_dtype=theano.config.floatX)], allow_input_downcast=True, mode='FAST_RUN') print("***************************************************************************************************") print("training network with " + str(self.nv) + " real visible units and " + str(self.nh) + " binary hidden units") print("reconstruction error before training = " + str(np.array(reconError(V_egMin))[0])) noOfMiniBatches = np.int(len(V_egMin)/noOfMiniBatchEx) print("number of mini-batches = " + str(noOfMiniBatches) + ", with " + str(noOfMiniBatchEx) + " examples per mini-batch") print("number of Epochs = " + str(noOfEpoch)) print("***************************************************************************************************") #input images already randomised with consecutive images belonging to different class, use directly as minibatch. for j in xrange(noOfEpoch): pretime=time.time() for i in xrange(noOfMiniBatches): [a_upDate, b_upDate, z_upDate, omg_upDate, b_upDateH, z_upDateH, omg_upDateH] = gradSparseH(V_egMin[i*noOfMiniBatchEx:(i+1)*noOfMiniBatchEx]) myErr = reconError(V_egMin) self.likelihood4plot = self.likelihood4plot + [np.float32(myErr)] print("epoch " + str(j) + ": reconstruction error = " + str(myErr[0]) + ", time taken = " + str(time.time() - pretime)) print("\n***************************************************************************************************") print("reconstruction error after training for " + str(noOfEpoch) + " epochs = " + str(np.array(reconError(V_egMin))[0])) self.checkNaN() print("***************************************************************************************************") plt.figure plt.plot(np.arange(0.0, len(self.likelihood4plot), 1), self.likelihood4plot) plt.show()
def CrossEntropy(self, y):
    return -T.mean(y * T.log(self.y_pred))
training_epochs = 25
learning_rate = 0.1
batch_size = 128

W1 = init_weights(28 * 28, 900)
b1 = init_bias(900)
b1_prime = init_bias(28 * 28)
W1_prime = W1.transpose()
W2 = init_weights(900, 10)
b2 = init_bias(10)

tilde_x = theano_rng.binomial(size=x.shape, n=1, p=1 - corruption_level,
                              dtype=theano.config.floatX) * x
y1 = T.nnet.sigmoid(T.dot(tilde_x, W1) + b1)
z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
cost1 = -T.mean(T.sum(x * T.log(z1) + (1 - x) * T.log(1 - z1), axis=1))

params1 = [W1, b1, b1_prime]
grads1 = T.grad(cost1, params1)
updates1 = [(param1, param1 - learning_rate * grad1)
            for param1, grad1 in zip(params1, grads1)]
train_da1 = theano.function(inputs=[x], outputs=cost1, updates=updates1,
                            allow_input_downcast=True)

p_y2 = T.nnet.softmax(T.dot(y1, W2) + b2)
y2 = T.argmax(p_y2, axis=1)
cost2 = T.mean(T.nnet.categorical_crossentropy(p_y2, d))
params2 = [W1, b1, W2, b2]
def negative_log_likelihood(self, y):
    return -T.mean(T.log(self.y_t)[:, y])
def get_tester(self, y):
    return self.inp, T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]
def build_model(alpha, beta, tparams, options):
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x_zheng = tensor.matrix('x_zheng', dtype='int32')
    x_zheng_mask = tensor.matrix('x_zheng_mask', dtype=config.floatX)
    x_ni = tensor.matrix('x_ni', dtype='int32')
    x_ni_mask = tensor.matrix('x_ni_mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int32')

    n_timesteps = x_zheng.shape[0]
    n_samples = x_zheng.shape[1]

    emb_zheng = tparams['Wemb'][x_zheng.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_proj']])
    proj1 = get_layer(options['encoder'])[1](tparams, emb_zheng, options,
                                             prefix='lstm_zheng',
                                             mask=x_zheng_mask)
    if options['encoder'] == 'lstm':
        proj_zheng = (proj1 * x_zheng_mask[:, :, None]).sum(axis=0)
        proj_zheng = proj_zheng / x_zheng_mask.sum(axis=0)[:, None]

    emb_ni = tparams['Wemb'][x_ni.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_proj']])
    proj2 = get_layer(options['encoder'])[1](tparams, emb_ni, options,
                                             prefix='lstm_ni',
                                             mask=x_ni_mask)
    if options['encoder'] == 'lstm':
        proj_ni = (proj2 * x_ni_mask[:, :, None]).sum(axis=0)
        proj_ni = proj_ni / x_ni_mask.sum(axis=0)[:, None]

    proj = tensor.concatenate((proj_zheng, proj_ni), axis=1)
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
    pred_zheng = tensor.nnet.softmax(
        tensor.dot(proj_zheng, tparams['U_zheng'] + tparams['b']))
    pred_ni = tensor.nnet.softmax(
        tensor.dot(proj_ni, tparams['U_ni'] + tparams['b']))

    f_pred_prob = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                                  pred, name='f_pred_prob')
    f_pred = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                             pred.argmax(axis=1), name='f_pred')
    f_proj = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                             proj, name='f_proj')

    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost1 = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()
    cost2 = -tensor.log(pred_zheng[tensor.arange(n_samples), y] + off).mean()
    cost3 = -tensor.log(pred_ni[tensor.arange(n_samples), y] + off).mean()
    cost4 = tensor.sum(tensor.square(proj_zheng - proj_ni), axis=1).mean()
    cost = alpha * (cost1 + cost2 + cost3) + beta * cost4

    return use_noise, x_zheng, x_zheng_mask, x_ni, x_ni_mask, y, f_pred_prob, \
        f_pred, cost1, cost2, cost3, cost4, cost, f_proj
def logp(self, value):
    w = self.w

    return bound(logsumexp(tt.log(w) + self._comp_logp(value), axis=-1),
                 w >= 0, w <= 1, tt.allclose(w.sum(axis=-1), 1),
                 broadcast_conditions=False)
def cost(self, net):
    "Return the log-likelihood cost."
    return -T.mean(
        T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])