def rbm_K(self, X, effective_batch_size):
    D, DH = self.n_visible, self.n_hidden
    W, bh, bv = self.W, self.bh, self.bv
    # one-bit-flipped connected states
    Y = X.reshape((effective_batch_size, 1, D), 3) * T.ones((1, D, 1))  # tile out data vectors (repeat each one D times)
    Y1 = (Y + T.eye(D).reshape((1, D, D), 3)) % 2  # flip each bit once
    # minimal-activation connected states
    onehots = T.eye(D)
    blanks = T.zeros(D)
    eX = energy(X, W, bh, bv)
    eY = energy(Y1, W, bh, bv)
    eO = energy(onehots, W, bh, bv)
    eB = energy(blanks, W, bh, bv)
    edif = eX.dimshuffle(0, 'x') - eY  # - eB  # - eO.dimshuffle('x', 0) - eB
    Z = T.exp(0.5 * edif)
    K = T.sum(Z) / effective_batch_size
    K.name = 'K'
    K = T.cast(K, 'float32')
    return K

def recurrence_relation(y, y_mask):
    # with a blank symbol of -1 this falls back to the recurrence that fails
    # with repeating symbols!
    blank_symbol = 2
    n_y = y.shape[0]
    blanks = tensor.zeros((2, y.shape[1])) + blank_symbol
    ybb = tensor.concatenate((y, blanks), axis=0).T
    # ybb = B x (L'+2), where L' = 2*label_noblank_length + 1
    # ybb[:, :-2] == y.T and ybb[:, 2:] == y.T shifted along by 2 labels
    # see Alex Graves' CTC paper:
    # tensor.neq(ybb[:, :-2], ybb[:, 2:]) -> 0 where l'_u = blank or l'_u == l'_(u-2)
    # tensor.eq(ybb[:, 1:-1], blank_symbol) -> [0, ?, 0, ?, ..., 0, 1, 0]; the ?
    #     depends on whether l'_u == l'_(u-2); result[0, i] == 1 means ybb[0, i] != ybb[0, i+2]
    # sec_diag = B x L'
    sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) *
                tensor.eq(ybb[:, 1:-1], blank_symbol) *
                y_mask.T)
    # r2: L' x L'
    # r3: L' x L' x B
    r2 = tensor.eye(n_y, k=1)
    # tensor.eye(n_y, k=2).dimshuffle(0, 1, 'x') -> L' x L' x 1
    # sec_diag.dimshuffle(1, 'x', 0)             -> L' x 1 x B
    r3 = (tensor.eye(n_y, k=2).dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))
    return r2, r3

def each_loss(outpt, inpt):
    # inpt is the answer sequence after padding with blanks
    blank = 26
    y_nblank = T.neq(inpt, blank)
    n = T.dot(y_nblank, y_nblank)  # true (unpadded) label length
    N = 2 * n + 1  # length after blank-padding, with the trailing padding removed
    labels = inpt[:N]
    labels2 = T.concatenate((labels, [blank, blank]))
    sec_diag = T.neq(labels2[:-2], labels2[2:]) * T.eq(labels2[1:-1], blank)
    recurrence_relation = \
        T.eye(N) + \
        T.eye(N, k=1) + \
        T.eye(N, k=2) * sec_diag.dimshuffle((0, 'x'))
    pred_y = outpt[:, labels]
    fwd_pbblts, _ = theano.scan(
        lambda curr, accum: T.switch(
            T.eq(curr * T.dot(accum, recurrence_relation), 0.0),
            T.dot(accum, recurrence_relation),
            curr * T.dot(accum, recurrence_relation)),
        sequences=[pred_y],
        outputs_info=[T.eye(N)[0]]
    )
    # return fwd_pbblts
    # liklihood = fwd_pbblts[0, 0]
    liklihood = fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]
    # liklihood = T.switch(T.lt(liklihood, 1e-35), 1e-35, liklihood)
    # loss = -T.log(T.cast(liklihood, "float32"))
    # loss = 10 * (liklihood - 1) * (liklihood - 100)
    loss = (T.le(liklihood, 1.0) * (10 * (liklihood - 1) * (liklihood - 100))) + \
           (T.gt(liklihood, 1.0) * (-T.log(T.cast(liklihood, "float32"))))
    return loss

def vanilla_ctc(self, ):
    my_labels = TT.concatenate((self.labels, [self.blank, self.blank]))
    pre_V = TT.neq(my_labels[:-2], my_labels[2:]) * \
        TT.eq(my_labels[1:-1], self.blank)
    capLambda = \
        TT.eye(self.n) + \
        TT.eye(self.n, k=1) + \
        TT.eye(self.n, k=2) * pre_V.dimshuffle((0, 'x'))
    softmax_outputs = self.inpt[:, self.labels]
    alphas, _ = theano.scan(
        lambda outPuts, old_alpha: outPuts * TT.dot(old_alpha, capLambda),
        sequences=[softmax_outputs],
        outputs_info=[TT.eye(self.n)[0]]
    )
    # TODO: This is what we really should use for the initialization.
    # Need to debug and make sure there are no errors.
    # initial_alphas = TT.zeros(n)
    # initial_alphas[0] = inpt[0][-1]
    # initial_alphas[1] = inpt[0][labels[1]]
    # alphas, _ = theano.scan(
    #     lambda outPuts, old_alpha: outPuts * TT.dot(old_alpha, capLambda),
    #     sequences=[softmax_outputs],
    #     outputs_info=[initial_alphas]
    # )
    transcript_prob = TT.sum(alphas[-1, -2:])
    self.cost = -TT.log(transcript_prob)
    self.debug = alphas.T

def __init__(self, inpt, labels):
    '''
    Recurrence relation: a matrix that specifies the allowed transitions
    between path states. At any time, one can
    0) stay at the same label (the diagonal is identity),
    1) move to the next label (the first upper diagonal is identity), or
    2) skip to the next-to-next label, provided that
       a) the next label is blank, and
       b) the next-to-next label differs from the current one
       (the second upper diagonal is the product of conditions a & b).
    '''
    n_labels = labels.shape[0]
    big_I = T.cast(T.eye(n_labels + 2), 'float64')
    recurrence_relation1 = T.cast(T.eye(n_labels), 'float64') + \
        big_I[2:, 1:-1] + \
        big_I[2:, :-2] * T.cast((T.arange(n_labels) % 2), 'float64')
    recurrence_relation = T.cast(recurrence_relation1, 'float64')

    '''
    Forward path probabilities
    '''
    pred_y = inpt[:, labels]
    probabilities, _ = theano.scan(
        lambda curr, prev: curr * T.dot(prev, recurrence_relation),
        sequences=[pred_y],
        outputs_info=[T.cast(T.eye(n_labels)[0], 'float64')]
    )

    # Final cost
    labels_probab = T.sum(probabilities[-1, -2:])
    self.cost = -T.log(labels_probab)
    self.params = []

def _plain_ctc(self, ):
    labels2 = tt.concatenate((self.labels, [self.blank, self.blank]))
    sec_diag = tt.neq(labels2[:-2], labels2[2:]) * tt.eq(labels2[1:-1], self.blank)
    # The last two entries of sec_diag do not matter as they multiply zero rows below.
    recurrence_relation = \
        tt.eye(self.n) + \
        tt.eye(self.n, k=1) + \
        tt.eye(self.n, k=2) * sec_diag.dimshuffle((0, 'x'))
    pred_y = self.inpt[:, self.labels]
    fwd_pbblts, _ = th.scan(
        lambda curr, accum: curr * tt.dot(accum, recurrence_relation),
        sequences=[pred_y],
        outputs_info=[tt.eye(self.n)[0]]
    )
    # TODO: Add probabilities[-1, -2] only if the last label is blank.
    # liklihood = ifelse(tt.eq(self.n, 1), fwd_pbblts[-1, -1],
    #                    ifelse(tt.neq(self.labels[-1], self.blank), fwd_pbblts[-1, -1],
    #                           fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]))
    liklihood = fwd_pbblts[-1, -1]
    self.cost = -tt.log(liklihood)
    self.debug = fwd_pbblts.T

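# A minimal, self-contained NumPy sketch of the CTC forward pass that the
# snippet above builds symbolically: blanked labels, the banded recurrence
# matrix, and the alpha recursion. The blank index, toy transcript, and random
# frame posteriors are illustrative assumptions, not part of the original code.
import numpy as np

def ctc_forward_demo():
    blank = 0
    labels = np.array([blank, 1, blank, 2, blank])  # blanked transcript [1, 2]
    n = len(labels)
    rng = np.random.RandomState(0)
    probs = rng.rand(6, 3)
    probs /= probs.sum(axis=1, keepdims=True)       # per-frame class posteriors
    # banded recurrence matrix: stay (k=0), advance (k=1), legal skips (k=2)
    labels2 = np.concatenate([labels, [blank, blank]])
    sec_diag = (labels2[:-2] != labels2[2:]) * (labels2[1:-1] == blank)
    R = np.eye(n) + np.eye(n, k=1) + np.eye(n, k=2) * sec_diag[:, None]
    alpha = np.eye(n)[0]                            # mass starts on the first blank
    for p in probs:
        alpha = p[labels] * alpha.dot(R)
    return alpha[-1] + alpha[-2]                    # end on last blank or last label

print(ctc_forward_demo())
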
def likelihood(f, l, R, mu, eps, sigma2, lambda_1=1e-4):
    # The similarity matrix W is a linear combination of the slices in R
    W = T.tensordot(R, mu, axes=1)

    # The following indices correspond to labeled and unlabeled examples
    labeled = T.eq(l, 1).nonzero()

    # Calculating the graph Laplacian of W
    D = T.diag(W.sum(axis=0))
    L = D - W

    # The covariance (or kernel) matrix is the inverse of the (regularized) Laplacian
    epsI = eps * T.eye(L.shape[0])
    rL = L + epsI
    Sigma = nlinalg.matrix_inverse(rL)

    # The marginal density of labeled examples uses Sigma_LL as covariance (sub-)matrix
    Sigma_LL = Sigma[labeled][:, labeled][:, 0, :]

    # We also consider additive Gaussian noise with variance sigma2
    K_L = Sigma_LL + (sigma2 * T.eye(Sigma_LL.shape[0]))

    # Calculating the inverse and the determinant of K_L
    iK_L = nlinalg.matrix_inverse(K_L)
    dK_L = nlinalg.det(K_L)

    f_L = f[labeled]

    # The (L1-regularized) log-likelihood is the sum of the following four terms
    # (use 0.5 rather than the integer expression 1/2, which truncates to 0
    # under Python 2 division)
    term_A = - 0.5 * f_L.dot(iK_L.dot(f_L))
    term_B = - 0.5 * T.log(dK_L)
    term_C = - 0.5 * T.log(2 * np.pi)
    term_D = - lambda_1 * T.sum(abs(mu))

    return term_A + term_B + term_C + term_D

def recurrence(blanked_label, blank_symbol):
    '''
    A(s) = alpha(t, s) + alpha(t, s-1)                  if l_s = blank or l_s = l_{s-2}
         = alpha(t, s) + alpha(t, s-1) + alpha(t, s-2)  if l_s != l_{s-2}
    We can define an L' x L' matrix R to help do this (L' = 2L + 1):
        A = alpha(t, :) * R
    '''
    length = blanked_label.shape[1]
    blanks = T.zeros((1, 2)) + blank_symbol
    ybb = T.concatenate((blanked_label, blanks), axis=1)
    '''
    ybb: 1 x (L'+2), L' = 2L + 1
    ybb[0, :-2] == blanked_label and ybb[0, 2:] == blanked_label shifted along by 2 labels
    T.neq(ybb[:, :-2], ybb[:, 2:]) -> 0 where l'_u = blank or l'_u == l'_(u-2)
    T.eq(ybb[:, 1:-1], blank_symbol) -> [0, ?, 0, ?, ..., 0, 1, 0]; the ? depends on
        whether l_s = l_{s-2}; result[0, i] == 1 means ybb[0, i] != ybb[0, i+2]
    '''
    setDiagMatrix = T.neq(ybb[:, :-2], ybb[:, 2:]) * T.eq(ybb[:, 1:-1], blank_symbol)
    '''
    r2: L' x L'
    r3: L' x L'
    '''
    r2 = T.eye(length, k=1)
    r3 = T.eye(length, k=2) * (setDiagMatrix.T)
    return r2, r3

def recurrence_relation(size):
    """
    Based on code from Shawn Tan
    """
    eye2 = T.eye(size + 2)
    return T.eye(size) + eye2[2:, 1:-1] + eye2[2:, :-2] * (T.arange(size) % 2)

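# A NumPy illustration of the eye-slicing trick above, under the assumption
# that `size` is the length of a blanked label sequence (2L + 1): positions
# alternate blank/non-blank, so (arange(size) % 2) enables skip transitions
# only into the odd (non-blank) states. Note this variant does not check
# l_u != l_{u-2}, which is why other snippets here warn about repeated labels.
import numpy as np

size = 5  # 2L + 1 with L = 2
eye2 = np.eye(size + 2)
R = np.eye(size) + eye2[2:, 1:-1] + eye2[2:, :-2] * (np.arange(size) % 2)
print(R)
# [[1. 1. 0. 0. 0.]
#  [0. 1. 1. 1. 0.]
#  [0. 0. 1. 1. 0.]
#  [0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 1.]]
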
def recurrence_relation(self, y):
    def sec_diag_i(yt, ytp1, ytp2):
        return T.neq(yt, ytp2) * T.eq(ytp1, self.n_out)

    y_extend = T.concatenate((y, [self.n_out, self.n_out]))
    sec_diag, _ = theano.scan(sec_diag_i,
                              sequences={'input': y_extend, 'taps': [0, 1, 2]})
    y_sz = y.shape[0]
    return T.eye(y_sz) + \
        T.eye(y_sz, k=1) + \
        T.eye(y_sz, k=2) * sec_diag.dimshuffle((0, 'x'))

def __init__(self, v=None, **kwargs):
    super(HouseholderFlow, self).__init__(**kwargs)
    v = self.add_param(v, 'v')
    self.shared_params = dict(v=v)
    if self.batched:
        vv = v.dimshuffle(0, 1, 'x') * v.dimshuffle(0, 'x', 1)
        I = tt.eye(self.dim).dimshuffle('x', 0, 1)
        vvn = (1e-10 + (v**2).sum(-1)).dimshuffle(0, 'x', 'x')
    else:
        vv = tt.outer(v, v)
        I = tt.eye(self.dim)
        vvn = ((v**2).sum(-1) + 1e-10)
    self.H = I - 2. * vv / vvn

def _mb_normal_ctc(self, network_output, labels, mask):
    n_y = labels.shape[1] / 2
    y = labels[:, :n_y]
    y = y.dimshuffle(1, 0)
    y_mask = labels[:, n_y:].astype(theano.config.floatX)
    # y_row = labels.dimshuffle(1, 0)
    # n_y = y_row.shape[0] / 2
    # y = y_row[:n_y, :]
    # y_mask = y_row[n_y:, :].astype(theano.config.floatX)
    y_hat = network_output.dimshuffle(0, 2, 1)
    pred_y = y_hat[:, y.astype('int32'), T.arange(self.tpo["batch_size"])]
    ybb = T.concatenate((y, self.blanks), axis=0).T
    sec_diag = (T.neq(ybb[:, :-2], ybb[:, 2:]) *
                T.eq(ybb[:, 1:-1], self.tpo["CTC_blank"]) *
                y_mask)
    # r1: LxL
    # r2: LxL
    # r3: LxLxB
    r2 = T.eye(n_y, k=1)
    r3 = (T.eye(n_y, k=2).dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))

    def step(p_curr, p_prev):
        # instead of a dot product, we multiply first
        # and then sum over one dimension.
        # objective: T.dot((p_prev) BxL, LxLxB)
        # solution: Lx1xB * LxLxB --> LxLxB --> (sum over L) x B
        dotproduct = (p_prev + T.dot(p_prev, r2) +
                      (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T)
        return p_curr.T * dotproduct * y_mask  # B x L

    probabilities, _ = theano.scan(
        step,
        sequences=[pred_y],
        outputs_info=[T.eye(n_y)[0] * T.ones([self.tpo["batch_size"], n_y])])
    labels_probab = T.sum(probabilities[-1, :, -2:])
    return T.mean(-T.log(labels_probab))

def grad(self, inputs, cost_grad):
    """
    Notes:
    1. The gradient is computed under the assumption that perturbations
       of the input array respect triangularity, i.e. partial derivatives
       wrt the zeroed-out (non-triangular) region are zero.
    2. In contrast with the usual mathematical presentation, in order to
       apply theano's 'reshape' function which implements row-order
       (i.e. C order), the differential expressions below have been derived
       based on the row-vectorizations of inputs 'a' and 'b'.

    See The Matrix Reference Manual,
    Copyright 1998-2011 Mike Brookes, Imperial College, London, UK
    """
    a, b = inputs
    ingrad = cost_grad
    ingrad = tensor.as_tensor_variable(ingrad)
    shp_a = (tensor.shape(inputs[0])[1], tensor.shape(inputs[0])[1])
    I_M = tensor.eye(*shp_a)
    if self.lower:
        inv_a = solve_triangular(a, I_M, lower=True)
        tri_M = tril(tensor.ones(shp_a))
    else:
        inv_a = solve_triangular(a, I_M, lower=False)
        tri_M = triu(tensor.ones(shp_a))
    if b.ndim == 1:
        prod_a_b = tensor.tensordot(-b.T, inv_a.T, axes=1)
        prod_a_b = tensor.shape_padleft(prod_a_b)
        jac_veca = kron(inv_a, prod_a_b)
        jac_b = inv_a
        outgrad_veca = tensor.tensordot(ingrad, jac_veca, axes=1)
        outgrad_a = tensor.reshape(outgrad_veca,
                                   (inputs[0].shape[0], inputs[0].shape[0])) * tri_M
        outgrad_b = tensor.tensordot(ingrad, jac_b, axes=1).flatten(ndim=1)
    else:
        ingrad_vec = ingrad.flatten(ndim=1)
        prod_a_b = tensor.tensordot(-b.T, inv_a.T, axes=1)
        jac_veca = kron(inv_a, prod_a_b)
        I_N = tensor.eye(tensor.shape(inputs[1])[1],
                         tensor.shape(inputs[1])[1])
        jac_vecb = kron(inv_a, I_N)
        outgrad_veca = tensor.tensordot(ingrad_vec, jac_veca, axes=1)
        outgrad_a = tensor.reshape(outgrad_veca,
                                   (inputs[0].shape[0], inputs[0].shape[0])) * tri_M
        outgrad_vecb = tensor.tensordot(ingrad_vec, jac_vecb, axes=1)
        outgrad_b = tensor.reshape(outgrad_vecb,
                                   (inputs[1].shape[0], inputs[1].shape[1]))
    return [outgrad_a, outgrad_b]

def compute_log_averaged_ei(self, x, X, randomness, incumbent):
    # We compute the old predictive mean at x
    Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    KzzInv = T.nlinalg.MatrixInversePSD()(Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    covCavityInv = KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / casting(self.n_points)
    covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv)
    meanCavity = T.dot(covCavity,
                       casting(self.n_points - self.set_for_training) /
                       casting(self.n_points) * self.mParamPost)
    KzzInvmeanCavity = T.dot(KzzInv, meanCavity)
    Kxz = compute_kernel(self.lls, self.lsf, x, self.z)
    m_old_x = T.dot(Kxz, KzzInvmeanCavity)

    # We compute the old predictive mean at X
    KXz = compute_kernel(self.lls, self.lsf, X, self.z)
    m_old_X = T.dot(KXz, KzzInvmeanCavity)

    # We compute the required cross covariance matrices
    KXX = compute_kernel(self.lls, self.lsf, X, X) - \
        T.dot(T.dot(KXz, KzzInv), KXz.T) + \
        T.eye(X.shape[0]) * self.jitter * T.exp(self.lsf)
    KXXInv = T.nlinalg.MatrixInversePSD()(KXX)
    KxX = compute_kernel(self.lls, self.lsf, x, X)
    xX = T.concatenate([x, X], 0)
    KxXz = compute_kernel(self.lls, self.lsf, xX, self.z)
    KxX = KxX - T.dot(T.dot(KxXz[0:x.shape[0], :], KzzInv),
                      KxXz[x.shape[0]:xX.shape[0], :].T)

    # We compute the new posterior mean
    samples_internal = T.dot(MatrixChol()(KXX), randomness)
    new_predictive_mean = T.tile(m_old_x, [1, randomness.shape[1]]) + \
        T.dot(KxX, T.dot(KXXInv, samples_internal))

    # We compute the new posterior variance
    z_expanded = T.concatenate([self.z, X], 0)
    Kxz_expanded = compute_kernel(self.lls, self.lsf, x, z_expanded)
    Kzz_expanded = compute_kernel(self.lls, self.lsf, z_expanded, z_expanded) + \
        T.eye(z_expanded.shape[0]) * self.jitter * T.exp(self.lsf)
    Kzz_expandedInv = T.nlinalg.MatrixInversePSD()(Kzz_expanded)
    v_out = T.exp(self.lsf) - \
        T.dot(Kxz_expanded * T.dot(Kxz_expanded, Kzz_expandedInv),
              T.ones_like(z_expanded[:, 0:1]))
    new_predictive_var = T.tile(v_out, [1, randomness.shape[1]])

    s = (incumbent - new_predictive_mean) / T.sqrt(new_predictive_var)
    log_ei = T.log((incumbent - new_predictive_mean) * ratio(s) +
                   T.sqrt(new_predictive_var)) + log_n_pdf(s)
    return T.mean(LogSumExp(log_ei, 1), 1)

def recurrence_relation_(y_, blank_symbol):
    y = y_.dimshuffle(0, 'x')
    n_y = y.shape[0]
    blanks = T.zeros((2, y.shape[1])) + blank_symbol
    ybb = T.concatenate((y, blanks), axis=0).T
    sec_diag = (T.neq(ybb[:, :-2], ybb[:, 2:]) *
                T.eq(ybb[:, 1:-1], blank_symbol))
    # r1: LxL
    # r2: LxL
    # r3: LxL
    r2 = T.eye(n_y, k=1)
    r3 = (T.eye(n_y, k=2) * sec_diag)
    return r2, r3

def recurrence_relation(y, y_mask):
    # with a blank symbol of -1 this falls back to the recurrence that fails
    # with repeating symbols!
    blank_symbol = -1
    n_y = y.shape[0]
    blanks = tensor.zeros((2, y.shape[1])) + blank_symbol
    ybb = tensor.concatenate((y, blanks), axis=0).T
    sec_diag = tensor.neq(ybb[:, :-2], ybb[:, 2:]) * \
        tensor.eq(ybb[:, 1:-1], blank_symbol) * y_mask.T
    # r1: LxL
    # r2: LxL
    # r3: LxLxB
    r2 = tensor.eye(n_y, k=1)
    r3 = tensor.eye(n_y, k=2).dimshuffle(0, 1, "x") * sec_diag.dimshuffle(1, "x", 0)
    return r2, r3

def __init__(self, atomc, dist, atom_mask, num_hidden=60, num_passes=2,
             include_diagonal=False,
             nonlinearity=lasagne.nonlinearities.tanh,
             Wcf=lasagne.init.GlorotNormal(1.0),
             Wfc=lasagne.init.GlorotNormal(1.0),
             Wdf=lasagne.init.GlorotNormal(1.0),
             bcf=lasagne.init.Constant(0.0),
             bdf=lasagne.init.Constant(0.0),
             **kwargs):
    super(RecurrentLayer, self).__init__([atomc, dist, atom_mask], **kwargs)
    num_atoms = self.input_shapes[0][1]
    c_len = self.input_shapes[0][2]
    d_len = self.input_shapes[1][3]
    self.Wcf = self.add_param(Wcf, (c_len, num_hidden), name="W_atom_c")
    self.bcf = self.add_param(bcf, (num_hidden,), name="b_atom_c")
    self.Wdf = self.add_param(Wdf, (d_len, num_hidden), name="W_dist")
    self.bdf = self.add_param(bdf, (num_hidden,), name="b_dist")
    self.Wfc = self.add_param(Wfc, (num_hidden, c_len), name="W_hidden_to_c")
    self.num_passes = num_passes
    self.nonlin = nonlinearity
    if include_diagonal:
        self.inv_eye_mask = None
    else:
        self.inv_eye_mask = (T.eye(num_atoms, num_atoms) < 1).dimshuffle(
            "x", 0, 1, "x")

def scale(x):
    """Returns a transform to represent a scaling"""
    x = T.as_tensor_variable(x)
    m = T.eye(4, 4)
    m = T.set_subtensor(m[0, 0], x[0])
    m = T.set_subtensor(m[1, 1], x[1])
    m = T.set_subtensor(m[2, 2], x[2])
    mInv = T.eye(4, 4)
    mInv = T.set_subtensor(mInv[0, 0], 1. / x[0])
    mInv = T.set_subtensor(mInv[1, 1], 1. / x[1])
    mInv = T.set_subtensor(mInv[2, 2], 1. / x[2])
    return Transform(m, mInv)

def __init__(self, z0=None, dim=None, v=None, jitter=.1):
    super(HouseholderFlow, self).__init__(dim=dim, z0=z0, jitter=jitter)
    if v is None:
        v = self.add_param(dim, 'v')
    self.shared_params = dict(v=v)
    v = v.dimshuffle(0, 'x')
    self.H = tt.eye(dim) - 2. * v.dot(v.T) / ((v**2).sum() + 1e-10)

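# A small numerical sanity check of the Householder construction above, using
# a plain NumPy vector as a stand-in for the flow parameter v:
# H = I - 2 v v^T / (v^T v) is orthogonal and reflects v onto -v.
import numpy as np

v = np.array([1.0, 2.0, -0.5])
H = np.eye(3) - 2.0 * np.outer(v, v) / (np.dot(v, v) + 1e-10)
print(np.allclose(np.dot(H, H.T), np.eye(3)))  # True: H is orthogonal
print(np.allclose(np.dot(H, v), -v))           # True: v is reflected
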
def _build_marginal_likelihood_logp(self, y, X, Xu, sigma):
    sigma2 = tt.square(sigma)
    Kuu = self.cov_func(Xu)
    Kuf = self.cov_func(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = self.cov_func(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
        trace = 0.0
    elif self.approx == "VFE":
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = ((1.0 / (2.0 * sigma2)) *
                 (tt.sum(self.cov_func(X, diag=True)) -
                  tt.sum(tt.sum(A * A, 0))))
    else:  # DTC
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = 0.0
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - self.mean_func(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    constant = 0.5 * X.shape[0] * tt.log(2.0 * np.pi)
    logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B)))
    quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c))
    return -1.0 * (constant + logdet + quadratic + trace)

def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean((network_output - prediction_func) ** 2, axis=1)

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)).dot(J.T).dot(mse_for_each_sample)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)
    return updates

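# A hedged NumPy sketch of the Levenberg-Marquardt step assembled above: the
# damping term mu * I interpolates between a Gauss-Newton step (mu -> 0) and
# a small gradient-descent step (mu large). The Jacobian and residuals below
# are illustrative toy values.
import numpy as np

def lm_step(J, residuals, mu):
    n_params = J.shape[1]
    JtJ = np.dot(J.T, J)
    return -np.linalg.inv(JtJ + mu * np.eye(n_params)).dot(J.T).dot(residuals)

J = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # Jacobian of the residuals
r = np.array([0.5, -0.2, 0.1])
print(lm_step(J, r, mu=1e-3))  # near the Gauss-Newton step
print(lm_step(J, r, mu=1e3))   # heavily damped, ~ scaled gradient descent
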
def neumann_inv_batch(A, it=10, F=None, reg=0.0):
    N, d, _ = A.shape
    if F is None:
        F = T.sqrt(T.sum(A * A, axis=(1, 2)))
        F = T.reshape(F, (N, 1, 1))
        F = T.tile(F, (1, d, d))
    G = A / F
    Y = T.tile(T.eye(d), (N, 1, 1)) * (1 - reg / F) - G
    Z = [None] * it
    Z[0] = T.tile(T.eye(d), (N, 1, 1))
    for i in xrange(1, it):
        Z[i] = T.batched_dot(Z[i - 1], Y)
    output = sum(Z) / F
    return output

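# A NumPy check of the Neumann-series idea used above (sketch with a single,
# illustrative matrix rather than a batch): with G = A / F and Y = I - G,
# sum_k Y^k converges to G^{-1} when the spectral radius of Y is below 1,
# so (sum_k Y^k) / F approximates A^{-1}.
import numpy as np

A = np.array([[2.0, 0.3], [0.1, 1.5]])
F = np.sqrt((A * A).sum())      # Frobenius norm as the scaling factor
Y = np.eye(2) - A / F
Z, acc = np.eye(2), np.eye(2)
for _ in range(50):
    acc = acc.dot(Y)
    Z = Z + acc
print(np.allclose(Z / F, np.linalg.inv(A), atol=1e-6))  # True
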
def run_irl(world, car, reward, theta, data):
    def gen():
        for point in data:
            for c, x0, u in zip(world.cars, point['x0'], point['u']):
                c.traj.x0.set_value(x0)
                for cu, uu in zip(c.traj.u, u):
                    cu.set_value(uu)
            yield
    r = car.traj.reward(reward)
    g = utils.grad(r, car.traj.u)
    H = utils.hessian(r, car.traj.u)
    I = tt.eye(utils.shape(H)[0])
    reg = utils.vector(1)
    reg.set_value([1e-1])
    H = H - reg[0] * I
    L = tt.dot(g, tt.dot(tn.MatrixInverse()(H), g)) + tt.log(tn.Det()(-H))
    for _ in gen():
        pass
    optimizer = utils.Maximizer(L, [theta], gen=gen, method='gd', eps=0.1,
                                debug=True, iters=1000, inf_ignore=10)
    optimizer.maximize()
    print theta.get_value()

def compute_log_ei(self, x, incumbent):
    Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    KzzInv = T.nlinalg.MatrixInversePSD()(Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    covCavityInv = KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / \
        casting(self.n_points)
    covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv)
    meanCavity = T.dot(
        covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    KzzInvcovCavity = T.dot(KzzInv, covCavity)
    KzzInvmeanCavity = T.dot(KzzInv, meanCavity)
    Kxz = compute_kernel(self.lls, self.lsf, x, self.z)
    B = T.dot(KzzInvcovCavity, KzzInv) - KzzInv
    v_out = T.exp(self.lsf) + T.dot(Kxz * T.dot(Kxz, B),
                                    T.ones_like(self.z[:, 0:1]))
    m_out = T.dot(Kxz, KzzInvmeanCavity)
    s = (incumbent - m_out) / T.sqrt(v_out)
    log_ei = T.log((incumbent - m_out) * ratio(s) + T.sqrt(v_out)) + log_n_pdf(s)
    return log_ei

def log_path_probs(y_hat, y):
    eye = tensor.eye(y.shape[0])
    first = eye[0]
    mask0 = 1 - eye[0]
    mask1 = 1 - eye[1]
    alt_mask = tensor.cast(tensor.arange(y.shape[0]) % 2, theano.config.floatX)
    skip_mask = mask0 * mask1 * alt_mask
    prev_idx = tensor.arange(-1, y.shape[0] - 1)
    prev_prev_idx = tensor.arange(-2, y.shape[0] - 2)
    log_mask0 = log_(mask0)
    log_skip_mask = log_(skip_mask)
    log_first = log_(first)

    def step(log_p_curr, log_p_prev):
        log_after_trans = logplus_(
            log_p_prev,
            logplus_(log_mask0 + log_p_prev[prev_idx],
                     log_skip_mask + log_p_prev[prev_prev_idx]))
        log_p_next = log_p_curr + log_after_trans
        return log_p_next

    L = tensor.log(y_hat[:, y])
    log_f_probs, _ = theano.scan(step, sequences=[L], outputs_info=[log_first])
    log_b_probs, _ = theano.scan(step, sequences=[L[::-1, ::-1]],
                                 outputs_info=[log_first])
    log_probs = log_f_probs + log_b_probs[::-1, ::-1]
    return log_probs, prev_idx, prev_prev_idx

def feedback(self, outputs):
    assert self.output_dim == 0
    eye = tensor.eye(self.num_outputs)
    check_theano_variable(outputs, None, "int")
    output_shape = [outputs.shape[i]
                    for i in range(outputs.ndim)] + [self.feedback_dim]
    return eye[outputs.flatten()].reshape(output_shape)

def RBF(self, X1, X2=None):
    _X2 = X1 if X2 is None else X2
    dist = ((X1 / self.l)**2).sum(1)[:, None] + \
        ((_X2 / self.l)**2).sum(1)[None, :] - \
        2 * (X1 / self.l).dot((_X2 / self.l).T)
    RBF = self.sf2 * T.exp(-dist / 2.0)
    return (RBF + eps * T.eye(X1.shape[0])) if X2 is None else RBF

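# A hedged NumPy rendering of the RBF kernel above, including the jitter term
# eps * I that keeps the Gram matrix positive definite when X2 is None; the
# values of l, sf2, and eps below are illustrative assumptions.
import numpy as np

def rbf(X1, l=1.0, sf2=1.0, eps=1e-6):
    Xs = X1 / l
    d = (Xs**2).sum(1)[:, None] + (Xs**2).sum(1)[None, :] - 2 * Xs.dot(Xs.T)
    return sf2 * np.exp(-d / 2.0) + eps * np.eye(X1.shape[0])

X = np.random.RandomState(0).randn(5, 2)
print(np.all(np.linalg.eigvalsh(rbf(X)) > 0))  # True: jittered Gram matrix is PD
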
def _recurrence_relation(y, y_mask, blank_symbol):
    """
    Construct a permutation matrix and tensor for computing CTC transitions.

    Parameters
    ----------
    y : matrix (L, B)
        the target label sequences
    y_mask : matrix (L, B)
        indicates which values of y to use
    blank_symbol : integer
        indicates the symbol that signifies a blank label.

    Returns
    -------
    matrix (L, L)
    tensor3 (L, L, B)
    """
    n_y = y.shape[0]
    blanks = tensor.zeros((2, y.shape[1])) + blank_symbol
    ybb = tensor.concatenate((y, blanks), axis=0).T
    sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) *
                tensor.eq(ybb[:, 1:-1], blank_symbol) *
                y_mask.T)
    # r1: LxL
    # r2: LxL
    # r3: LxLxB
    eye2 = tensor.eye(n_y + 2)
    r2 = eye2[2:, 1:-1]  # tensor.eye(n_y, k=1)
    r3 = (eye2[2:, :-2].dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))
    return r2, r3

def tiled_eye(n1, n2, dtype="float32"):
    r1 = T.maximum((n1 - 1) / n2 + 1, 1)
    r2 = T.maximum((n2 - 1) / n1 + 1, 1)
    small_eye = T.eye(T.minimum(n1, n2), dtype=dtype)
    tiled_big = T.tile(small_eye, (r1, r2))
    tiled_part = tiled_big[:n1, :n2]
    return tiled_part

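# A NumPy picture of what tiled_eye produces for a non-square request: the
# smaller identity is tiled to cover the target shape and then cropped.
# (Integer division is written as // here; the symbolic version above relies
# on Python 2 integer division.)
import numpy as np

n1, n2 = 3, 7
r1 = max((n1 - 1) // n2 + 1, 1)
r2 = max((n2 - 1) // n1 + 1, 1)
print(np.tile(np.eye(min(n1, n2)), (r1, r2))[:n1, :n2])
# [[1. 0. 0. 1. 0. 0. 1.]
#  [0. 1. 0. 0. 1. 0. 0.]
#  [0. 0. 1. 0. 0. 1. 0.]]
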
def parser_loss(energies, heads, types, masks):
    """
    Compute the minus log-likelihood of the parser as the parser loss.
    :param energies: Theano 4D tensor
        energies of each edge. The shape is [batch_size, n_steps, n_steps, num_labels],
        where the dummy root is at index 0.
    :param heads: Theano 2D tensor
        heads in the shape [batch_size, n_steps].
    :param types: Theano 2D tensor
        types in the shape [batch_size, n_steps].
    :param masks: Theano 2D tensor
        masks in the shape [batch_size, n_steps].
    :return: Theano 1D tensor
        an expression for the minus log-likelihood loss.
    """
    input_shape = energies.shape
    batch_size = input_shape[0]
    length = input_shape[1]
    # take the exp of the energies and sum along the label axis.
    # the shape is [batch_size, n, n].
    E = T.exp(energies).sum(axis=3)
    # zero out the elements beyond the length of each sentence.
    if masks is not None:
        masks_shuffled = masks.dimshuffle(0, 1, 'x')
        E = E * masks_shuffled
        masks_shuffled = masks.dimshuffle(0, 'x', 1)
        E = E * masks_shuffled
    # compute the D tensor.
    # the shape is [batch_size, n, n]
    D = E.sum(axis=1)
    D = T.zeros_like(E) + D.dimshuffle(0, 1, 'x')
    # zero out all elements except the diagonal.
    D = D * T.eye(length, length, 0).dimshuffle('x', 0, 1)
    # compute lengths
    lengths = T.cast(masks, dtype='int32').sum(axis=1)
    # compute the Laplacian matrix
    L = D - E
    # compute the partition function Z(x)
    partitions, _ = theano.scan(
        fn=lambda laps, length: nlinalg.logabsdet(laps[1:length, 1:length]),
        outputs_info=None,
        sequences=[L, lengths])
    # compute the target energies;
    # first create the index matrix
    indices = T.zeros_like(heads) + T.arange(length).dimshuffle('x', 0)
    # compute the loss matrix, shape = [n_steps, batch_size]
    target_energy = energies[T.arange(batch_size), heads.T, indices.T, types.T]
    # shuffle the loss to [batch_size, n_steps]
    target_energy = target_energy.dimshuffle(1, 0)
    # remove the first element, shape = [batch_size, n_steps - 1]
    target_energy = target_energy[:, 1:]
    # sum over n_steps, shape = [batch_size]
    target_energy = target_energy.sum(axis=1)
    return partitions - target_energy  # , E, D, L, partitions, target_energy

def log_likelihood(self):
    # the sequence step for each sentence is word_vec*M + rest, where M moves
    # the word vector into sentence-embedding space
    def seq_step(words_vectors, seq_sum_vectors):
        return T.dot(words_vectors, self.M_word_to_sent) + seq_sum_vectors

    # 3-tensor of word embeddings for a minibatch. After the dimshuffle, the
    # first dimension is that of the word sequence for sentences, so we can
    # iterate over all of the sentences in the minibatch simultaneously.
    word_emb_tensor = T.concatenate([
        T.dot(T.eye(self.max_words, self.num_words)[self.one_hot_sent_matrix[i], :],
              self.Wemb)
        for i in xrange(self.one_hot_sent_matrix.get_value(
            borrow=True, return_internal_type=True).shape[0])
    ]).dimshuffle(2, 1, 0)

    # use scan to generate the sequence rnn
    sent_emb, _ = theano.scan(fn=seq_step,
                              sequences=[word_emb_tensor],
                              outputs_info=[np.zeros(self.sent_dim)])
    return T.mean(
        T.sum(T.log(
            np.ones(self.num_entities) - self.answer_matrix -
            T.nnet.softmax(T.dot(sent_emb[-1], self.M_softmax))),
            axis=1))

def tangent2ambient(self, X, Z):
    U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
    # U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
    S = tensor.eye(2 * self._k)
    V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
    # V = np.vstack((X.V, Z.Vp))
    return ManifoldElementShared.from_vars((U, S, V),
                                           shape=(self._m, self._n), r=self._k)

def generate_samples(self, name, X_new, n_samples=500):
    with self.model as model:
        Kuu = pm.gp.util.stabilize(self.cov(self.Xu))
        Kuf = self.cov(self.Xu, self.X)
        Luu = tt.slinalg.cholesky(Kuu)
        A = pm.gp.util.solve_lower(Luu, Kuf)
        Qff = tt.dot(tt.transpose(A), A)
        Kffd = self.cov(self.X, diag=True)
        Lamd_inv = tt.diag(1. / tt.clip(Kffd - tt.diag(Qff) + self.sigma**2,
                                        0, np.inf))
        Sigma = pm.gp.util.stabilize(Kuu + tt.dot(Kuf.dot(Lamd_inv),
                                                  tt.transpose(Kuf)))
        L_Sigma = tt.slinalg.cholesky(Sigma)
        Kus = self.cov(self.Xu, X_new)
        m1 = pm.gp.util.solve_lower(L_Sigma, Kus)
        m2 = pm.gp.util.solve_lower(L_Sigma, Kuf)
        mu_pred = tt.dot(tt.dot(tt.transpose(m1), m2), tt.dot(Lamd_inv, model.fp))
        Kss = self.cov(X_new) + 1e-6 * tt.eye(X_new.shape[0])
        As = pm.gp.util.solve_lower(Luu, Kus)
        Qss = tt.dot(tt.transpose(As), As)
        cov_pred = Kss - Qss + tt.dot(tt.transpose(m1), m1)
        f_pred = pm.MvNormal(name, mu=mu_pred, cov=cov_pred,
                             shape=pm.gp.util.infer_shape(X_new))
    with self.model:
        pred_samples = pm.sample_ppc(self.trace, vars=[f_pred], samples=n_samples)
    return pred_samples

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)
    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') -
                         activation.dimshuffle('x', 1, 2, 0)), axis=2) +
               1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))
    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale -
                              T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]
    f = T.sum(T.exp(-abs_dif), axis=2)
    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)
    return T.concatenate([input, f], axis=1)

def __init__(self, loss=None, inputs=None, C=None):
    symmetrize = False
    A, b = inputs
    if A.shape[0] <> A.shape[1]:
        symmetrize = True
    elif not T.allclose(A.T, A):
        print('not symmetric (theano)')
        symmetrize = True
    if symmetrize:
        print('symmetrize (theano)')
        self._A = T.dot(A.T, A)
        self._b = T.dot(A.T, b)
    else:
        self._A = A
        self._b = b
    # self._A = theano.shared(A)
    # self._b = theano.shared(b)
    if C is None:
        self._C = T.eye(self._A.shape[1])
    else:
        self._C = C
    b = self._b.eval()
    A = self._A.eval()
    self._x0 = np.zeros(b.shape[0])
    self._r0 = b - np.dot(A, self._x0)
    # self._z = T.dot(self._C, theano.shared(self._x0))
    self._t_x = theano.shared(self._x0)  # T.vector('x')
    self._output_tf = loss
    if loss is None:
        self._output_tf = self._tf_CG_loss()

def _initialize_posterior_distribution(self, RecognitionParams):
    ################## put together the total precision matrix ######################
    # The diagonal blocks must be PSD
    diagsquare = T.batched_dot(self.AAChol, self.AAChol.dimshuffle(0, 2, 1))
    odsquare = T.batched_dot(self.BBChol, self.BBChol.dimshuffle(0, 2, 1))
    self.AA = diagsquare + T.concatenate([
        T.shape_padleft(T.zeros([self.xDim, self.xDim])), odsquare
    ]) + 1e-6 * T.eye(self.xDim)
    self.BB = T.batched_dot(self.AAChol[:-1], self.BBChol.dimshuffle(0, 2, 1))

    # compute the Cholesky decomposition
    self.the_chol = blk_tridag_chol(self.AA, self.BB)
    # symbolic recipe for computing the diagonal (V) and
    # off-diagonal (VV) blocks of the posterior covariance
    self.V, self.VV, self.S = compute_sym_blk_tridiag(self.AA, self.BB)
    self.postX = self.Mu

    # The determinant of the covariance is the square of the determinant of
    # the Cholesky factor (twice the log). The determinant of the Cholesky
    # factor is the product of the diagonal elements of the block-diagonal.
    def comp_log_det(L):
        return T.log(T.diag(L)).sum()

    self.ln_determinant = -2 * theano.scan(
        fn=comp_log_det, sequences=self.the_chol[0])[0].sum()

def mk_training_fn(self):
    n = self.batch_size
    N = self.total_size
    q_size = self.q_size
    B = self.B
    gamma = self.gamma
    avg_I = self.avg_I
    t = self.t
    updates = self.updates
    epsilon = self.step_size / pow(2.0, t // self.step_size_decay)
    random = self.random
    inarray = self.inarray
    gt, dlog_prior = self.dlogp_elemwise, self.dlog_prior

    # 5. Calculate the mean dlogp
    avg_gt = gt.mean(axis=0)

    # 6. Calculate the approximate Fisher score
    gt_diff = (gt - avg_gt)
    V = (1. / (n - 1)) * tt.dot(gt_diff.T, gt_diff)

    # 7. Update the moving average
    I_t = (1. - 1. / t) * avg_I + (1. / t) * V

    if B is None:
        # if B is not specified:
        # B \propto I_t as given in
        # http://www.ics.uci.edu/~welling/publications/papers/SGFS_v10_final.pdf
        # after iterating over the data a few times to get a good approximation of I_N
        B = tt.switch(t <= int(N / n) * 50, tt.eye(q_size), gamma * I_t)

    # 8. Noise term
    # The noise term is sampled from a normal distribution with mean 0 and
    # std_dev = sqrt(4B / step_size). To generate it, a standard normal sample
    # is scaled by 2 * B_ch / sqrt(step_size), where B_ch is the Cholesky
    # decomposition of B, i.e. B = dot(B_ch, B_ch^T)
    B_ch = tt.slinalg.cholesky(B)
    noise_term = tt.dot((2. * B_ch) / tt.sqrt(epsilon),
                        random.normal((q_size,), dtype=theano.config.floatX))

    # 9. Inverse Fisher covariance matrix
    cov_mat = (gamma * I_t * N) + ((4. / epsilon) * B)
    inv_cov_mat = tt.nlinalg.matrix_inverse(cov_mat)

    # Noise coefficient
    noise_coeff = (dlog_prior + (N * avg_gt) + noise_term)
    dq = 2 * tt.dot(inv_cov_mat, noise_coeff)
    updates.update({avg_I: I_t, t: t + 1})
    f = theano.function(
        outputs=dq,
        inputs=inarray,
        updates=updates,
        allow_input_downcast=True)
    return f

def __init__(self, z0=None, dim=None, v=None, jitter=.1):
    super(HouseholderFlow, self).__init__(dim=dim, z0=z0, jitter=jitter)
    if v is None:
        v = self.add_param(dim, 'v')
    self.shared_params = dict(v=v)
    v = v.dimshuffle(0, 'x')
    self.H = tt.eye(dim) - 2. * v.dot(v.T) / ((v**2).sum() + 1e-10)

def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                       cov_total, mean_total):
    sigma2 = tt.square(sigma)
    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = cov_total(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
    else:  # VFE or DTC
        Lamd = tt.ones_like(Qffd) * sigma2
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - mean_total(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(Luu, Kus)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As),
                                       solve_upper(tt.transpose(L_B), c))
    C = solve_lower(L_B, As)
    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
        if pred_noise:
            var += sigma2
        return mu, var
    else:
        cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) +
               tt.dot(tt.transpose(C), C))
        if pred_noise:
            cov += sigma2 * tt.identity_like(cov)
        return mu, cov if pred_noise else stabilize(cov)

def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                       cov_total, mean_total):
    sigma2 = tt.square(sigma)
    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = cov_total(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
    else:  # VFE or DTC
        Lamd = tt.ones_like(Qffd) * sigma2
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - mean_total(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(Luu, Kus)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As),
                                       solve_upper(tt.transpose(L_B), c))
    C = solve_lower(L_B, As)
    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
        if pred_noise:
            var += sigma2
        return mu, var
    else:
        cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) +
               tt.dot(tt.transpose(C), C))
        if pred_noise:
            cov += sigma2 * tt.identity_like(cov)
        return mu, stabilize(cov)

def translate(x):
    """Returns a transform to represent a translation"""
    x = T.as_tensor_variable(x)
    m = T.eye(4, 4)
    m = T.set_subtensor(m[0, 3], x[0])
    m = T.set_subtensor(m[1, 3], x[1])
    m = T.set_subtensor(m[2, 3], x[2])
    mInv = T.eye(4, 4)
    mInv = T.set_subtensor(mInv[0, 3], -x[0])
    mInv = T.set_subtensor(mInv[1, 3], -x[1])
    mInv = T.set_subtensor(mInv[2, 3], -x[2])
    return Transform(m, mInv)

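# A quick check, with plain NumPy stand-ins, that the homogeneous transforms
# built by scale() and translate() above carry their own inverses: composing
# m with mInv yields the 4x4 identity. The vector x is an illustrative value.
import numpy as np

x = np.array([2.0, 3.0, 4.0])
m, mInv = np.eye(4), np.eye(4)
m[[0, 1, 2], [0, 1, 2]] = x                     # scale
mInv[[0, 1, 2], [0, 1, 2]] = 1. / x
print(np.allclose(np.dot(m, mInv), np.eye(4)))  # True

t, tInv = np.eye(4), np.eye(4)
t[:3, 3] = x                                    # translate
tInv[:3, 3] = -x
print(np.allclose(np.dot(t, tInv), np.eye(4)))  # True
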
def cal_encoder_step(self, encoder_val):
    '''
    Calculate the weight ratios in the encoder.
    :type encoder_val: class
    :param encoder_val: the class which stores the intermediate variables in the encoder
    :returns: R_h_x, R_h_h are theano variables, the weight ratios in the encoder
    '''
    encoder_val.x = encoder_val.x.dimshuffle(0, 1, 'x')
    R_state_in_x = (encoder_val.x * self.input_emb + self.input_emb_offset) / \
        (self.ep * TT.sgn(encoder_val.state_in) +
         encoder_val.state_in).dimshuffle(0, 'x', 1)
    R_state_in_x = R_state_in_x.dimshuffle(0, 2, 1)
    R_reset_in_x = encoder_val.x * self.reset_emb / (
        encoder_val.reset_in +
        self.ep * TT.sgn(encoder_val.reset_in)).dimshuffle(0, 'x', 1)
    R_reset_in_x = R_reset_in_x.dimshuffle(0, 2, 1)
    R_gate_in_x = encoder_val.x * self.gate_emb / (
        encoder_val.gate_in +
        self.ep * TT.sgn(encoder_val.gate_in)).dimshuffle(0, 'x', 1)
    R_gate_in_x = R_gate_in_x.dimshuffle(0, 2, 1)

    h_before = encoder_val.h_before.dimshuffle(0, 1, 'x')
    R_gate_h = h_before * self.gate_hidden / (
        encoder_val.gate + self.ep * TT.sgn(encoder_val.gate)).dimshuffle(0, 'x', 1)
    R_gate_x = R_gate_in_x * (encoder_val.gate_in / (
        encoder_val.gate + self.ep * TT.sgn(encoder_val.gate))).dimshuffle(0, 1, 'x')
    R_reset_h = h_before * self.reset_hidden / (
        encoder_val.reset + self.ep * TT.sgn(encoder_val.reset)).dimshuffle(0, 'x', 1)
    R_reset_x = R_reset_in_x * (encoder_val.reset_in / (
        encoder_val.reset + self.ep * TT.sgn(encoder_val.reset))).dimshuffle(0, 1, 'x')

    R_reseted_h = R_reset_h * self.weight + TT.eye(self.dim, self.dim) * self.weight
    R_reseted_x = R_reset_x * self.weight

    encoder_val.reseted = encoder_val.reseted.dimshuffle(0, 1, 'x')
    R_state_reseted = encoder_val.reseted * self.input_hidden / (
        encoder_val.state + self.ep * TT.sgn(encoder_val.state)).dimshuffle(0, 'x', 1)
    R_state_reseted = R_state_reseted.dimshuffle(0, 2, 1)
    R_state_h = TT.batched_dot(R_state_reseted, R_reseted_h)
    R_state_x = TT.batched_dot(R_state_reseted, R_reseted_x)
    R_state_x += R_state_in_x * ((encoder_val.state_in / (
        encoder_val.state + self.ep * TT.sgn(encoder_val.state))).dimshuffle(0, 1, 'x'))

    R_h = (encoder_val.gate * encoder_val.state /
           (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(0, 1, 'x') * \
        self.weight
    R_h_h = R_state_h * R_h + R_gate_h * R_h
    R_h2 = ((1 - encoder_val.gate) * encoder_val.h_before /
            (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(0, 1, 'x')
    # accumulate the direct (1 - gate) * h_before path onto the diagonal
    R_h_h += TT.identity_like(R_h_h[0]) * R_h2
    R_h_x = R_gate_x * R_h + R_state_x * R_h
    return R_h_x, R_h_h

def path_probability(self, queryseq_padded, scorematrix, queryseq_mask_padded=None,
                     scorematrix_mask=None, blank_symbol=None, align='pre'):
    """
    Compute p(l|x) using only the forward variable, in log scale
    :param queryseq_padded: (2L+1, B)
    :param scorematrix: (T, C+1, B)
    :param queryseq_mask_padded: (2L+1, B)
    :param scorematrix_mask: (T, B)
    :param blank_symbol: = C by default
    :return:
    """
    if blank_symbol is None:
        # blank_symbol = scorematrix.shape[1] - 1.0
        blank_symbol = tensor.cast(scorematrix.shape[1], floatX) - 1.0
    if queryseq_mask_padded is None:
        queryseq_mask_padded = tensor.ones_like(queryseq_padded, dtype=floatX)
    if scorematrix_mask is None:
        scorematrix_mask = tensor.ones(
            [scorematrix.shape[0], scorematrix.shape[2]])
    pred_y = self._class_batch_to_labeling_batch(
        queryseq_padded, scorematrix,
        scorematrix_mask)  # (T, 2L+1, B), reshaped scorematrix
    r2, r3 = self._recurrence_relation(
        queryseq_padded, queryseq_mask_padded,
        blank_symbol)  # r2: (2L+1, 2L+1), r3: (2L+1, 2L+1, B)

    def step(p_curr, p_prev):
        p1 = p_prev
        p2 = self._log_dot_matrix(p1, r2)
        p3 = self._log_dot_tensor(p1, r3)
        p123 = self._log_add(p3, self._log_add(p1, p2))
        return p_curr.T + p123 + self._epslog(queryseq_mask_padded.T)

    alphas, _ = theano.scan(
        step,
        sequences=[self._epslog(pred_y)],
        outputs_info=[
            self._epslog(
                tensor.eye(queryseq_padded.shape[0])[0] *
                tensor.ones(queryseq_padded.T.shape))
        ])

    B = alphas.shape[1]
    LL = tensor.sum(queryseq_mask_padded, axis=0, dtype='int32')
    if align == 'pre':
        TL = tensor.sum(scorematrix_mask, axis=0, dtype='int32')
        NLL = -self._log_add(
            alphas[TL - 1, tensor.arange(B), LL - 1],
            alphas[TL - 1, tensor.arange(B), LL - 2])
    else:  # align == 'post'
        NLL = -self._log_add(alphas[-1, tensor.arange(B), LL - 1],
                             alphas[-1, tensor.arange(B), LL - 2])
    return NLL, alphas

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)
    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') -
                         activation.dimshuffle('x', 1, 2, 0)), axis=2) +
               1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))
    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [
            (self.log_weight_scale,
             self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))
        ]
    f = T.sum(T.exp(-abs_dif), axis=2)
    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)
    return T.concatenate([input, f], axis=1)

def dlogp(inputs, gradients):
    g_logp, = gradients
    cov, delta = inputs

    g_logp.tag.test_value = floatX(np.array(1.))
    n, k = delta.shape

    chol_cov = cholesky(cov)
    diag = tt.nlinalg.diag(chol_cov)
    ok = tt.all(diag > 0)

    chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
    delta_trans = solve_lower(chol_cov, delta.T).T

    inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans)
    g_cov = solve_upper(chol_cov.T, inner)
    g_cov = solve_upper(chol_cov.T, g_cov.T)

    tau_delta = solve_upper(chol_cov.T, delta_trans.T)
    g_delta = tau_delta.T

    g_cov = tt.switch(ok, g_cov, -np.nan)
    g_delta = tt.switch(ok, g_delta, -np.nan)

    return [-0.5 * g_cov * g_logp, -g_delta * g_logp]

def _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol):
    """Compute the probabilities of the paths that are compatible
    with the sequence y.

    This function uses scan to get the forward probabilities (often
    denoted with the symbol alpha in the literature).

    See _log_path_probabs for a version that works in the log domain.
    """
    pred_y = _class_batch_to_labeling_batch(y, y_hat, y_hat_mask)
    pred_y = pred_y.dimshuffle(0, 2, 1)
    n_labels = y.shape[0]

    r2, r3 = _recurrence_relation(y, y_mask, blank_symbol)

    def step(p_curr, p_prev):
        # instead of a dot product, we multiply first
        # and then sum over one dimension.
        # objective: T.dot((p_prev) BxL, LxLxB)
        # solution: Lx1xB * LxLxB --> LxLxB --> (sum over L) x B
        dotproduct = p_prev + tensor.dot(p_prev, r2) + \
            (p_prev.dimshuffle(1, "x", 0) * r3).sum(axis=0).T
        return p_curr.T * dotproduct * y_mask.T  # B x L

    probabilities, _ = theano.scan(
        step,
        sequences=[pred_y],
        outputs_info=[tensor.eye(n_labels)[0] * tensor.ones(y.T.shape)]
    )
    return probabilities

def __init__(self, incoming, dimension,
             params_init=(GlorotUniform(), GlorotUniform(), Uniform([0, 0.1])),
             addition_parameters=[False], **kwargs):
    '''
    Init parameters.
    :param incoming: input to the LISTA layer
    :param dimension: a list of 2 numbers. dimension[0] is dict_size, the
        length of the dictionary vectors in LISTA; dimension[1] is T,
        a.k.a. the depth.
    :param params_init: init values or init methods for LISTA
    :param addition_parameters: [transposed]; transposed = True if the input
        dictionary D is the transpose of a theano.compile.SharedVariable V.
        In that case self.W = D^T = V^T^T = V.
    :param kwargs: parameters of the super class
    :return:
    '''
    super(LISTA, self).__init__(incoming, **kwargs)
    self.transposed = addition_parameters[0]
    num_inputs = incoming.output_shape[-1]
    self.dict_size = dimension[0]
    self.T = dimension[1]
    self.W = self.add_param(params_init[0], [num_inputs, self.dict_size],
                            name='W', lista=True, lista_weight_W=True,
                            sparse_dictionary=True, regularizable=True)
    # self.S = self.add_param(params_init[1], [self.dict_size, self.dict_size],
    #                         name='S', lista=True, lista_weight_W=True,
    #                         regularizable=True)
    if self.T > 0:  # self.T is the depth; the bare name T is the theano.tensor module
        self.S = T.eye(self.dict_size) - T.dot(self.get_dictionary(),
                                               self.get_dictionary().T)
        self.S = self.add_param(theano.shared(floatX(self.S.eval())),
                                [self.dict_size, self.dict_size], name='S',
                                lista=True, lista_weight_S=True,
                                regularizable=True)
    self.theta = self.add_param(
        theano.shared(floatX(0.01 * np.ones([self.dict_size, ]))),
        [self.dict_size, ], name='theta',
        lista=True, lista_fun_param=True, regularizable=False)
    self.eps = 1e-6
    self.clipped_theta = T.clip(self.theta, self.eps, 10)

def _log_ctc(self, ):
    _1000 = tt.eye(self.n, dtype=th.config.floatX)[0]
    prev_mask = 1 - _1000
    prev_mask = safe_log(prev_mask)
    prevprev_mask = tt.neq(self.labels[:-2], self.labels[2:]) * \
        tt.eq(self.labels[1:-1], self.blank)
    prevprev_mask = tt.concatenate(
        ([0, 0], prevprev_mask)).astype(th.config.floatX)
    prevprev_mask = safe_log(prevprev_mask)
    prev = tt.arange(-1, self.n - 1)
    prevprev = tt.arange(-2, self.n - 2)
    log_pred_y = tt.log(self.inpt[:, self.labels])

    def step(curr, accum):
        return logmul(
            curr,
            logadd(accum,
                   logmul(prev_mask, accum[prev]),
                   logmul(prevprev_mask, accum[prevprev])))

    log_fwd_pbblts, _ = th.scan(step,
                                sequences=[log_pred_y],
                                outputs_info=[safe_log(_1000)])
    # TODO: Add probabilities[-1, -2] only if the last label is blank.
    # If length = 1, skip the scan process.
    # log_liklihood = ifelse(tt.eq(self.n, 1), tt.sum(log_pred_y),
    #                        ifelse(tt.eq(self.labels[-1], self.blank),
    #                               logadd(log_fwd_pbblts[-1, -1],
    #                                      log_fwd_pbblts[-1, -2]),
    #                               log_fwd_pbblts[-1, -1]))
    log_liklihood = log_fwd_pbblts[-1, -1]
    self.cost = -log_liklihood
    self.debug = log_fwd_pbblts.T

def quadratic_saturating_loss(mx, Sx, target, Q, *args, **kwargs):
    '''
    Squashing loss penalty function:
        c(x) = 1 - e^(-0.5 * quadratic_loss(x, target))
    '''
    if Sx is None:
        # deterministic case
        if mx.ndim == 1:
            mx = mx[None, :]
        delta = mx - target[None, :]
        deltaQ = delta.dot(Q)
        cost = 1.0 - tt.exp(-0.5 * tt.batched_dot(deltaQ, delta))
        return cost
    else:
        # stochastic case (moment matching)
        delta = mx - target
        SxQ = Sx.dot(Q)
        EyeM = tt.eye(mx.shape[0])
        IpSxQ = EyeM + SxQ
        Ip2SxQ = EyeM + 2 * SxQ
        S1 = tt.dot(Q, matrix_inverse(IpSxQ))
        S2 = tt.dot(Q, matrix_inverse(Ip2SxQ))
        # S1 = solve(IpSxQ.T, Q.T).T
        # S2 = solve(Ip2SxQ.T, Q.T).T
        # mean
        m_cost = -tt.exp(-0.5 * delta.dot(S1).dot(delta)) / tt.sqrt(det(IpSxQ))
        # variance
        s_cost = tt.exp(-delta.dot(S2).dot(delta)) / tt.sqrt(det(Ip2SxQ)) - m_cost**2
        return 1.0 + m_cost, s_cost

def one_hot(x, m=None):
    """One-hot representation of an integer vector.

    Given a vector of integers from 0 to m-1, returns a matrix
    with a one-hot representation, where each row corresponds
    to an element of x.

    Parameters
    ----------
    x : integer vector
        The integer vector to convert to a one-hot representation.
    m : int, optional
        The number of different columns for the one-hot representation. This
        needs to be strictly greater than the maximum value of `x`. Defaults
        to ``max(x) + 1``.

    Returns
    -------
    Theano tensor variable
        A Theano tensor variable of shape (``n``, `m`), where ``n`` is the
        length of `x`, with the one-hot representation of `x`.

    Notes
    -----
    If your integer vector represents target class memberships, and you wish
    to compute the cross-entropy between predictions and the target class
    memberships, then there is no need to use this function, since the
    function :func:`lasagne.objectives.categorical_crossentropy()` can
    compute the cross-entropy from the integer vector directly.
    """
    if m is None:
        m = T.cast(T.max(x) + 1, 'int32')
    return T.eye(m)[T.cast(x, 'int32')]

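# A hedged usage sketch for one_hot() above; it assumes a standard Theano
# setup with `import theano` and `import theano.tensor as T` in scope.
import numpy as np
import theano
import theano.tensor as T

x = T.ivector('x')
f = theano.function([x], one_hot(x))
print(f(np.array([0, 2, 1], dtype='int32')))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]
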
def __init__(self, generative_model, recognition_model, z_dim, max_length,
             vocab_size, dist_z_gen, dist_x_gen, dist_z_rec, gen_nn_kwargs,
             rec_nn_kwargs, iwae=False):
    self.vocab_size = vocab_size
    self.max_length = max_length

    self.generative_model = generative_model(z_dim, max_length, vocab_size,
                                             dist_z_gen, dist_x_gen,
                                             gen_nn_kwargs)
    self.recognition_model = recognition_model(z_dim, max_length, vocab_size,
                                               dist_z_rec, rec_nn_kwargs)
    self.iwae = iwae

    self.one_hot_encoder = T.concatenate(
        [T.zeros((1, self.vocab_size)), T.eye(self.vocab_size)], axis=0)

def logp(self, Y, X=None):
    if X is None:
        X = self.X
    mu = self.M(X).squeeze()
    Sigma = self.K(X) + tt.eye(X.shape[0]) * self.sigma**2
    return MvNormal.dist(mu, Sigma).logp(Y)

def __call__(self, f):
    """
    Compute the following function:
        E(f) = ||f_l - y_l||^2 + mu f^T L f + mu eps ||f||^2
    :param f: Theano tensor
        Vector of N continuous elements.
    :return: Theano tensor
        Energy (cost) of the vector f.
    """
    # Compute the un-normalized graph Laplacian: L = D - W
    D = T.diag(self.W.sum(axis=0))
    L = D - self.W

    # Compute the label consistency
    S = T.diag(self.L)
    El = (f - self.y).T.dot(S.dot(f - self.y))

    # Compute the smoothness along the similarity graph
    I = T.eye(self.L.shape[0])
    Es = f.T.dot(L.dot(f)) + self.eps * f.T.dot(I.dot(f))

    # Compute the whole cost function
    E = El + self.mu * Es
    return E

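# A hedged NumPy sketch of the smoothness term above on a toy 3-node chain
# graph: f^T L f sums w_ij * (f_i - f_j)^2 over edges, so only edges whose
# endpoints disagree contribute. The graph and f are illustrative values.
import numpy as np

W = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])            # chain graph 0 - 1 - 2
L = np.diag(W.sum(axis=0)) - W          # un-normalized graph Laplacian
f = np.array([1.0, 1.0, 0.0])
print(f.dot(L).dot(f))                  # 1.0: only the (1, 2) edge disagrees
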
def one_hot(x, m=None):
    """One-hot representation of an integer vector.

    Given a vector of integers from 0 to m-1, returns a matrix
    with a one-hot representation, where each row corresponds
    to an element of x.

    Parameters
    ----------
    x : integer vector
        The integer vector to convert to a one-hot representation.
    m : int, optional
        The number of different columns for the one-hot representation. This
        needs to be strictly greater than the maximum value of `x`. Defaults
        to ``max(x) + 1``.

    Returns
    -------
    Theano tensor variable
        A Theano tensor variable of shape (``n``, `m`), where ``n`` is the
        length of `x`, with the one-hot representation of `x`.

    Notes
    -----
    If your integer vector represents target class memberships, and you wish
    to compute the cross-entropy between predictions and the target class
    memberships, then there is no need to use this function, since the
    function :func:`lasagne.objectives.categorical_crossentropy()` can
    compute the cross-entropy from the integer vector directly.
    """
    if m is None:
        m = T.cast(T.max(x) + 1, "int32")
    return T.eye(m)[T.cast(x, "int32")]

def log_path_probs(blanked_label, y, blank_symbol):
    '''
    table = feature probability table: T x (2L+1)
    '''
    table = feature_table(blanked_label, y)
    r2, r3 = recurrence(blanked_label, blank_symbol)

    '''
    log_p_curr: (2L+1) x 1
    log_p_prev: 1 x (2L+1)
    '''
    def step(log_p_curr, log_p_prev):
        p1 = log_p_prev
        p2 = _log_dot_matrix(p1, r2)
        p3 = _log_dot_matrix(p1, r3)
        p123 = _log_add(p3, _log_add(p1, p2))
        return log_p_curr + p123

    '''
    T.eye(blanked_label.shape[1])[0] = [1. 0. 0. 0. 0. 0. 0. 0. 0.]
    T.eye(blanked_label.shape[1])[0] * T.ones(blanked_label.T.shape): 1 x (2L+1)
    '''
    log_probabilities, _ = theano.scan(
        step,
        sequences=[_epslog(table)],
        outputs_info=[_epslog(T.eye(blanked_label.shape[1])[0])])
    return log_probabilities

def TzscorrCols(Xn):
    """
    Theano expression which returns Fisher-transformed correlation values
    between columns of a normalized input, `X_n`. The diagonal is set to zero.
    """
    C_X = T.dot(Xn.T, Xn) - T.eye(Xn.shape[1])
    return 0.5 * T.log((1 + C_X) / (1 - C_X))

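# A NumPy check of the identity behind the function above:
# 0.5 * log((1 + r) / (1 - r)) is arctanh(r), the Fisher z-transform, and
# subtracting the identity first keeps the self-correlations of 1 from
# mapping to infinity. The column normalization below is an assumed setup.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
Xn = (X - X.mean(0)) / (X.std(0) * np.sqrt(X.shape[0]))  # unit-norm columns
C = Xn.T.dot(Xn) - np.eye(4)
print(np.allclose(0.5 * np.log((1 + C) / (1 - C)), np.arctanh(C)))  # True
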
def get_blender_proj(self, camera):
    deg2rad = lambda angle: (angle / 180.) * np.pi
    sa = tensor.sin(deg2rad(-camera[0]))
    ca = tensor.cos(deg2rad(-camera[0]))
    se = tensor.sin(deg2rad(-camera[1]))
    ce = tensor.cos(deg2rad(-camera[1]))
    R_world2obj = tensor.eye(3)
    R_world2obj = tensor.set_subtensor(R_world2obj[0, 0], ca * ce)
    R_world2obj = tensor.set_subtensor(R_world2obj[0, 1], sa * ce)
    R_world2obj = tensor.set_subtensor(R_world2obj[0, 2], -se)
    R_world2obj = tensor.set_subtensor(R_world2obj[1, 0], -sa)
    R_world2obj = tensor.set_subtensor(R_world2obj[1, 1], ca)
    R_world2obj = tensor.set_subtensor(R_world2obj[2, 0], ca * se)
    R_world2obj = tensor.set_subtensor(R_world2obj[2, 1], sa * se)
    R_world2obj = tensor.set_subtensor(R_world2obj[2, 2], ce)
    R_obj2cam = np.array(
        ((1.910685676922942e-15, 4.371138828673793e-08, 1.0),
         (1.0, -4.371138828673793e-08, -0.0),
         (4.371138828673793e-08, 1.0, -4.371138828673793e-08))).T
    R_world2cam = tensor.dot(R_obj2cam, R_world2obj)
    cam_location = tensor.zeros((3, 1))
    cam_location = tensor.set_subtensor(cam_location[0, 0], camera[2] * 1.75)
    T_world2cam = -1 * tensor.dot(R_obj2cam, cam_location)
    R_camfix = np.array(((1, 0, 0), (0, -1, 0), (0, 0, -1)))
    R_world2cam = tensor.dot(R_camfix, R_world2cam)
    T_world2cam = tensor.dot(R_camfix, T_world2cam)
    RT = tensor.concatenate([R_world2cam, T_world2cam], axis=1)
    return RT

def _compute_local_cn_acts(self, input, W):
    # Without scan (faster than scan, but still way too slow)
    shuffledIn = input.dimshuffle(0, 1, 'x')
    shuffledMasks = self.localmask.dimshuffle('x', 0, 1)
    # cubeIn = T.repeat(shuffledIn, self.localmask.shape[1], 2)
    # cubeMasks = T.repeat(shuffledMasks, input.shape[0], 0)
    maskedIn = shuffledIn * shuffledMasks
    maskedInMean = T.sum(maskedIn, axis=1, keepdims=True) / \
        T.sum(shuffledMasks, axis=1, keepdims=True)
    maskedInVar = T.sum(T.sqr((maskedIn - maskedInMean) * shuffledMasks),
                        axis=1, keepdims=True) / \
        T.sum(shuffledMasks, axis=1, keepdims=True)
    maskedInSTD = T.sqrt(maskedInVar)
    maskedInSubMean = maskedIn - maskedInMean
    maskedCN = maskedInSubMean / maskedInSTD
    # maskedCN = maskedInSubMean
    shuffledInCN = maskedCN.dimshuffle(2, 0, 1)
    allOuts = T.dot(shuffledInCN, W)
    diagMask = T.eye(self.localmask.shape[1],
                     self.localmask.shape[1]).dimshuffle(0, 'x', 1)
    diagMaskAll = allOuts * diagMask
    activation = T.sum(diagMaskAll, axis=0)
    return activation

def tangent2ambient(self, X, Z):
    U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
    # U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
    S = tensor.eye(2 * self._k)
    V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
    # V = np.vstack((X.V, Z.Vp))
    return (U, S, V)
