def __init__(self, incoming, uvec=lasagne.init.Normal(1), b=lasagne.init.Constant(0), **kwargs):
    super(OrthogonalFlow, self).__init__(incoming, **kwargs)
    num_inputs = self.input_shape[1]
    n = num_inputs
    n_triu_entries = (n * (n + 1)) // 2
    r = T.arange(n)
    tmp_mat = r[np.newaxis, :] + (n_triu_entries - n - (r * (r + 1)) // 2)[::-1, np.newaxis]
    tmp_mat1 = T.tril(tmp_mat.T) - r[np.newaxis, :]
    # strictly lower-triangular index matrix into the packed parameter vector
    skew_index_mat = T.tril(tmp_mat1 - T.diag(T.diag(tmp_mat1)))

    self.uvec = self.add_param(uvec, ((num_inputs - 1) * num_inputs // 2,), name='uvec')
    vec0 = T.concatenate([T.zeros(1), self.uvec])
    # build a skew-symmetric matrix and map it to an orthogonal one via expm
    skw_matrix = vec0[skew_index_mat] - vec0[skew_index_mat].T
    self.U = expm(skw_matrix)
    self.b = self.add_param(b, (num_inputs,), name='b')
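# Hedged sketch (not part of the original layer): a NumPy/SciPy check of the
# idea used above -- the matrix exponential of a skew-symmetric matrix is
# orthogonal. The packed vector `uvec` and the index trick are mimicked with
# np.tril_indices; names here are illustrative only.
import numpy as np
from scipy.linalg import expm

n = 4
uvec = np.random.randn(n * (n - 1) // 2)
skw = np.zeros((n, n))
skw[np.tril_indices(n, k=-1)] = uvec
skw = skw - skw.T                        # skew-symmetric: skw.T == -skw
U = expm(skw)
assert np.allclose(U @ U.T, np.eye(n))   # orthogonal up to float error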
def __init__(self, rng, input1, input2, n_in1, n_in2, n_hidden_layers, d_hidden, W1=None, W2=None):
    self.input1 = input1
    self.input2 = input2

    CouplingFunc = WarpNetwork(rng, input1, n_hidden_layers, d_hidden, n_in1, n_in2)

    if W1 is None:
        W1_values = numpy.identity(n_in1, dtype=theano.config.floatX)
        W1 = theano.shared(value=W1_values, name='W1')
    if W2 is None:
        W2_values = numpy.identity(n_in2, dtype=theano.config.floatX)
        W2 = theano.shared(value=W2_values, name='W2')

    # LU-style parameterization: V = U . L with unit diagonal on L, so
    # log|det V| is the sum of log-magnitudes of U's diagonal
    V1u = T.triu(W1)
    V1l = T.tril(W1)
    V1l = T.extra_ops.fill_diagonal(V1l, 1.)
    V1 = T.dot(V1u, V1l)

    V2u = T.triu(W2)
    V2l = T.tril(W2)
    V2l = T.extra_ops.fill_diagonal(V2l, 1.)
    V2 = T.dot(V2u, V2l)

    self.output1 = T.dot(input1, V1)
    self.output2 = T.dot(input2, V2) + CouplingFunc.output

    self.log_jacobian = T.log(T.abs_(T.nlinalg.ExtractDiag()(V1u))).sum() \
        + T.log(T.abs_(T.nlinalg.ExtractDiag()(V2u))).sum()

    self.params = CouplingFunc.params
def lower_lower(self):
    '''Evaluates the intractable term in the lower bound, which itself
    must be lower bounded.'''
    a = self.get_aux_mult()

    reversed_cum_probs = T.extra_ops.cumsum(a[:, ::-1], 1)
    dot_prod_m = T.dot(reversed_cum_probs, self.digams_1p2)
    dot_prod_mp1 = T.dot(
        T.concatenate((reversed_cum_probs[:, 1:], T.zeros((self.K, 1))), 1),
        self.digams[:, 0])

    # final entropy term; the strictly-upper ones keep log() away from zeros
    triu_ones = T.triu(T.ones_like(a)) - T.eye(self.K)
    aloga = T.sum(T.tril(a) * T.log(T.tril(a) + triu_ones), 1)

    return T.dot(a, self.digams[:, 1]) + dot_prod_m + dot_prod_mp1 - aloga
def log_prob(self, X, Y):
    """
    Evaluate the log-probability for the given samples.

    Parameters
    ----------
    Y: T.tensor
        samples from the upper layer
    X: T.tensor
        samples from the lower layer

    Returns
    -------
    log_p: T.tensor
        log-probabilities for the samples in X and Y
    """
    n_X, n_Y = self.get_hyper_params(['n_X', 'n_Y'])
    b, W, U = self.get_model_params(['b', 'W', 'U'])

    W = T.tril(W, k=-1)
    prob_X = self.sigmoid(T.dot(X, W) + T.dot(Y, U) + T.shape_padleft(b))
    log_prob = X * T.log(prob_X) + (1 - X) * T.log(1 - prob_X)
    log_prob = T.sum(log_prob, axis=1)

    return log_prob
def logp(self, S):
    l = 0.0

    # prior over the time-0 states
    pi = self.pi
    zeroIndices = np.roll(self.T.cumsum(), 1)
    zeroIndices[0] = 0
    zeroIndices = zeroIndices.astype('int32')
    l += TT.sum(TT.log(pi[S[zeroIndices]]))

    # likelihood of the observed transitions
    Q = self.Q
    step_sizes = self.step_sizes

    C = self.computeC(S)

    n_step_sizes = len(self.step_sizes)
    for i in range(0, n_step_sizes):
        tau = step_sizes[i]
        P = TT.slinalg.expm(tau * Q)
        # pad the strictly lower triangle so log() never sees exact zeros
        stabilizer = TT.tril(TT.alloc(0.0, *P.shape) + 0.1, k=-1)
        logP = TT.log(P + stabilizer)

        # compute likelihood in terms of P(tau)
        l += TT.sum(C[i, :, :] * logP)

    return l
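# Hedged standalone sketch of the property logp() above relies on: for a CTMC
# generator Q (rows sum to 0, non-negative off-diagonals), P(tau) = expm(tau*Q)
# is a stochastic matrix, so its entries are valid probabilities to take
# elementwise logs of. Plain NumPy/SciPy, independent of the class.
import numpy as np
from scipy.linalg import expm

Q = np.array([[-1.0, 0.7, 0.3],
              [0.2, -0.5, 0.3],
              [0.1, 0.4, -0.5]])
P = expm(0.8 * Q)
assert np.allclose(P.sum(axis=1), 1.0)   # each row is a distribution
assert (P >= 0).all()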
def grad(self, inputs, output_gradients):
    r"""
    Reverse-mode gradient updates for matrix solve operation c = A \ b.

    Symbolic expression for updates taken from [1]_.

    References
    ----------
    .. [1] M. B. Giles, "An extended collection of matrix derivative results
           for forward and reverse mode automatic differentiation",
           http://eprints.maths.ox.ac.uk/1079/
    """
    A, b = inputs
    c = self(A, b)
    c_bar = output_gradients[0]

    trans_map = {
        'lower_triangular': 'upper_triangular',
        'upper_triangular': 'lower_triangular'
    }
    trans_solve_op = Solve(
        # update A_structure and lower to account for a transpose operation
        A_structure=trans_map.get(self.A_structure, self.A_structure),
        lower=not self.lower
    )
    b_bar = trans_solve_op(A.T, c_bar)
    # force outer product if vector second input
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

    if self.A_structure == 'lower_triangular':
        A_bar = tensor.tril(A_bar)
    elif self.A_structure == 'upper_triangular':
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
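# Hedged numeric check (not from the original Op) of the identities the grad
# above uses: for c = A^{-1} b with upstream gradient c_bar, reverse mode gives
# b_bar = A^{-T} c_bar and A_bar = -outer(b_bar, c), restricted to A's
# triangular structure. Verified against a finite difference in plain NumPy.
import numpy as np

rng = np.random.default_rng(0)
A = np.tril(rng.normal(size=(3, 3))) + 3 * np.eye(3)  # well-conditioned lower-tri
b = rng.normal(size=3)
c = np.linalg.solve(A, b)
c_bar = rng.normal(size=3)                            # upstream gradient
b_bar = np.linalg.solve(A.T, c_bar)
A_bar = np.tril(-np.outer(b_bar, c))                  # keep triangular structure

eps = 1e-6
E = np.zeros_like(A)
E[2, 1] = eps                                         # perturb one tril entry
fd = (c_bar @ np.linalg.solve(A + E, b) - c_bar @ c) / eps
assert np.isclose(fd, A_bar[2, 1], atol=1e-4)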
def L_op(self, inputs, outputs, gradients):
    # Modified from theano/tensor/slinalg.py
    # No handling for on_error = 'nan'
    dz = gradients[0]
    chol_x = outputs[0]

    # this is for nan mode
    #
    # ok = ~tensor.any(tensor.isnan(chol_x))
    # chol_x = tensor.switch(ok, chol_x, 1)
    # dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return gpu_solve_upper_triangular(
            outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
    return [grad]
def L_op(self, inputs, outputs, output_gradients):
    r"""
    Reverse-mode gradient updates for matrix solve operation c = A \ b.

    Symbolic expression for updates taken from [#]_.

    References
    ----------
    .. [#] M. B. Giles, "An extended collection of matrix derivative results
           for forward and reverse mode automatic differentiation",
           http://eprints.maths.ox.ac.uk/1079/
    """
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]

    trans_map = {
        "lower_triangular": "upper_triangular",
        "upper_triangular": "lower_triangular",
    }
    trans_solve_op = Solve(
        # update A_structure and lower to account for a transpose operation
        A_structure=trans_map.get(self.A_structure, self.A_structure),
        lower=not self.lower,
    )
    b_bar = trans_solve_op(A.T, c_bar)
    # force outer product if vector second input
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

    if self.A_structure == "lower_triangular":
        A_bar = tensor.tril(A_bar)
    elif self.A_structure == "upper_triangular":
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))
    return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))]
def rank_loss(scores):
    # Images: hinge margin against the diagonal (matching pairs)
    diag = T.diag(scores)
    diff_img = scores - diag.dimshuffle(0, 'x') + 1
    max_img = T.maximum(0, diff_img)
    triu_img = T.triu(max_img, 1)
    til_img = T.tril(max_img, -1)
    res_img = T.sum(triu_img) + T.sum(til_img)

    # Sentences: same hinge margin on the transposed score matrix
    diff_sent = scores.T - diag.dimshuffle(0, 'x') + 1
    max_sent = T.maximum(0, diff_sent)
    triu_sent = T.triu(max_sent, 1)
    til_sent = T.tril(max_sent, -1)
    res_sent = T.sum(triu_sent) + T.sum(til_sent)

    # NOTE: as written, the function returns a smoothed log-sum of the raw
    # scores; the hinge terms res_img and res_sent above are left unused.
    return T.log(T.sum(scores) + 0.01)
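# Hedged illustration of the masking used above: T.triu(m, 1) + T.tril(m, -1)
# selects everything except the diagonal, i.e. only non-matching pairs
# contribute to the ranking terms. Plain NumPy equivalent:
import numpy as np

m = np.arange(9.0).reshape(3, 3)
off_diag_sum = np.triu(m, 1).sum() + np.tril(m, -1).sum()
assert off_diag_sum == m.sum() - np.trace(m)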
def __init__(self, input, n_in, n_out):
    batchSize, seqLen, _ = input.shape

    import collections
    if isinstance(n_out, collections.Sequence):
        LRembedLayer = EmbeddingLayer(input, n_in, n_out[2])
        MRembedLayer = EmbeddingLayer(input, n_in, n_out[1])
        SRembedLayer = EmbeddingLayer(input, n_in, n_out[0])
        n_out_max = max(n_out)
    else:
        LRembedLayer = EmbeddingLayer(input, n_in, n_out)
        MRembedLayer = EmbeddingLayer(input, n_in, n_out)
        SRembedLayer = EmbeddingLayer(input, n_in, n_out)
        n_out_max = n_out
    self.layers = [LRembedLayer, MRembedLayer, SRembedLayer]

    # band masks selecting pairs by sequence separation (>=24, 12-24, 6-12)
    M1s = T.ones((seqLen, seqLen))
    Sep24Mat = T.triu(M1s, 24) + T.tril(M1s, -24)
    Sep12Mat = T.triu(M1s, 12) + T.tril(M1s, -12)
    Sep6Mat = T.triu(M1s, 6) + T.tril(M1s, -6)

    LRsel = Sep24Mat.dimshuffle('x', 0, 1, 'x')
    MRsel = (Sep12Mat - Sep24Mat).dimshuffle('x', 0, 1, 'x')
    SRsel = (Sep6Mat - Sep12Mat).dimshuffle('x', 0, 1, 'x')
    selections = [LRsel, MRsel, SRsel]

    self.output = T.zeros((batchSize, seqLen, seqLen, n_out_max),
                          dtype=theano.config.floatX)
    for emLayer, sel in zip(self.layers, selections):
        l_n_out = emLayer.n_out
        self.output = T.inc_subtensor(self.output[:, :, :, :l_n_out],
                                      T.mul(emLayer.output, sel))

    self.pcenters = 0
    self.params = []
    self.paramL1 = 0
    self.paramL2 = 0
    for layer in [LRembedLayer, MRembedLayer, SRembedLayer]:
        self.params += layer.params
        self.paramL1 += layer.paramL1
        self.paramL2 += layer.paramL2
        self.pcenters += layer.pcenters
    self.n_out = n_out_max
def get_aux_mult(self):
    a_first_row_unnorm = (self.digams_1_cumsum
                          - self.digams_1p2_cumsum
                          + self.digams[:, 1]).reshape((1, self.K))
    a_first_row_unnorm_rep = t_repeat(a_first_row_unnorm, self.K,
                                      axis=0).reshape((self.K, self.K))
    # mask to the lower triangle and normalize each row into a distribution
    a = T.exp(a_first_row_unnorm_rep) * T.tril(T.ones((self.K, self.K)))
    return a / T.sum(a, 1).reshape((self.K, 1))
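# Hedged NumPy analogue of the masking in get_aux_mult above: exponentiated
# scores masked by lower-triangular ones and row-normalized, so row k is a
# distribution supported on entries 0..k. Names here are illustrative only.
import numpy as np

K = 4
scores = np.random.randn(1, K).repeat(K, axis=0)
a = np.exp(scores) * np.tril(np.ones((K, K)))
a = a / a.sum(1, keepdims=True)
assert np.allclose(a.sum(1), 1.0)
assert np.allclose(np.triu(a, 1), 0.0)   # no mass above the diagonal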
def check_l(m, k=0):
    m_symb = T.matrix(dtype=m.dtype)
    k_symb = T.iscalar()
    f = theano.function([m_symb, k_symb],
                        T.tril(m_symb, k_symb),
                        mode=mode_with_gpu)
    result = f(m, k)
    assert np.allclose(result, np.tril(m, k))
    assert result.dtype == np.dtype(dtype)
    assert any([isinstance(node.op, GpuTri)
                for node in f.maker.fgraph.toposort()])
def get_model(self, X, Y, X_test):
    #initial_params = {'m':m,'S_b':S_b,'mu':mu,'Sigma_b':Sigma_b,'Z':Z,'lhyp':lhyp,'ls':ls}
    (M, D), N, Q = self.Z.shape, X.shape[0], X.shape[1]

    # constrain the variables to positive values
    beta, sf2, l = T.exp(self.ls), T.exp(self.lhyp[0]), T.exp(self.lhyp[1:])
    S = T.exp(self.S_b)
    #Sigma=T.exp(self.Sigma_b)

    # x's covariance is diagonal, so no square root is needed; u's is not,
    # so build a triangular (Cholesky-style) factor with positive diagonal
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = sf2**0.5 * self.mu, sf2**0.5 * Sigma

    # random numbers for the reparameterization trick
    srng = T.shared_randomstreams.RandomStreams(234)
    eps_NQ = srng.normal(self.m.shape)
    eps_M = srng.normal(self.mu.shape)

    # draw samples; with minibatches a single MC sample suffices
    Xtilda = self.m + S * eps_NQ
    U = mu_scaled + Sigma_scaled * eps_M

    Kmm = self.ker.RBF(sf2, l, self.Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet=theano.sandbox.linalg.det(Kmm)

    Kmn = self.ker.RBF(sf2, l, self.Z, Xtilda)
    Knn = self.ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta

    LL = self.log_mvn(X, mean_U, Covariance) - 0.5 * beta * T.sum((T.eye(N) * Ktilda))
    #KL_X = -0.5 * (-T.sum(T.log(T.sum(Sigma,0))) + T.dot(m.T,T.dot(KmmInv,m)).squeeze() + T.sum((Sigma*KmmInv)) - M)-0.5*T.log(KmmDet)
    KL_X = self.KLD_X(self.m, S)
    KL_U = self.KLD_U(mu_scaled, Sigma_scaled, Kmm)

    return KL_X, KL_U, LL
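# Hedged sketch of the covariance parameterization recurring above (and in
# several snippets below): an unconstrained square matrix maps to a
# lower-triangular factor whose diagonal passes through exp(), so it is
# always a valid Cholesky factor of a positive-definite matrix. Plain NumPy:
import numpy as np

P = np.random.randn(3, 3)
L = np.tril(P, k=-1) + np.diag(np.exp(np.diag(P)))
assert (np.diag(L) > 0).all()                     # positive diagonal
Sigma = L @ L.T                                   # symmetric positive definite
assert (np.linalg.eigvalsh(Sigma) > 0).all()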
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # Replace the cholesky decomposition with 1 if there are nans
    # or solve_upper_triangular will throw a ValueError.
    if self.on_error == 'nan':
        ok = ~tensor.any(tensor.isnan(chol_x))
        chol_x = tensor.switch(ok, chol_x, 1)
        dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

    if self.on_error == 'nan':
        return [tensor.switch(ok, grad, np.nan)]
    else:
        return [grad]
def L_op(self, inputs, outputs, output_gradients):
    # Modified from theano/tensor/slinalg.py
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]

    trans_solve_op = GpuCublasTriangularSolve(not self.lower)
    b_bar = trans_solve_op(A.T, c_bar)

    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

    if self.lower:
        A_bar = tensor.tril(A_bar)
    else:
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
def triangularize_network(layers, force_diag=False):
    n_layers, rem = divmod(len(layers) + 1, 4)
    assert(rem == 0)
    assert(n_layers > 0)
    assert((n_layers - 1, aL_PARAM) not in layers)

    layers_LU = layers.copy()
    for nn in xrange(n_layers):
        LL, UL = layers[(nn, LL_PARAM)], layers[(nn, UL_PARAM)]
        LL_diag = T.nlinalg.alloc_diag(T.nlinalg.extract_diag(LL))
        layers_LU[(nn, LL_PARAM)] = \
            ifelse(force_diag, LL_diag, T.tril(LL))
        layers_LU[(nn, UL_PARAM)] = \
            ifelse(force_diag, T.eye(UL.shape[0]), T.triu(UL))
    return layers_LU, n_layers
def grad(self, inputs, g_outputs):
    r"""The gradient function should return

    .. math:: \sum_n\left(W_n\frac{\partial\,w_n}{\partial a_{ij}} +
              \sum_k V_{nk}\frac{\partial\,v_{nk}}{\partial a_{ij}}\right),

    where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
    :math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.

    Analytic formulae for eigensystem gradients are well-known in
    perturbation theory:

    .. math:: \frac{\partial\,w_n}{\partial a_{ij}} = v_{in}\,v_{jn}

    .. math:: \frac{\partial\,v_{kn}}{\partial a_{ij}} =
              \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}

    Code derived from theano.nlinalg.Eigh and doi=10.1.1.192.9105
    """
    x, = inputs
    w, v = self(x)
    # Replace gradients wrt disconnected variables with
    # zeros. This is a work-around for issue #1063.
    W, V = _zero_disconnected([w, v], g_outputs)
    N = x.shape[0]

    # W part
    gW = T.tensordot(v, v * W[numpy.newaxis, :], (1, 1))

    # V part
    vv = v[:, :, numpy.newaxis, numpy.newaxis] \
        * v[numpy.newaxis, numpy.newaxis, :, :]
    minusww = -w[:, numpy.newaxis] + w[numpy.newaxis, :]
    minuswwinv = 1 / (minusww + T.eye(N))
    minuswwinv = T.triu(minuswwinv, 1) + T.tril(minuswwinv, -1)  # remove diagonal
    c = (vv * minuswwinv[numpy.newaxis, :, numpy.newaxis, :]).dimshuffle(
        (1, 3, 0, 2))
    vc = T.tensordot(v, c, (1, 0))
    gV = T.tensordot(V, vc, ((0, 1), (0, 1)))

    g = gW + gV
    res = (g.T + g) / 2
    return [res]
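# Hedged numeric check of the eigenvalue perturbation formula quoted in the
# docstring above: for symmetric A with eigh(A) = (w, v), the derivative of
# w_n with respect to a diagonal entry A_ii is v_in**2. Plain NumPy finite
# difference, independent of the Op; indices are arbitrary choices.
import numpy as np

rng = np.random.default_rng(1)
B = rng.normal(size=(4, 4))
A = (B + B.T) / 2                       # symmetric, distinct eigenvalues a.s.
w, v = np.linalg.eigh(A)

i, n, eps = 2, 0, 1e-6
E = np.zeros_like(A)
E[i, i] = eps                           # symmetric perturbation
w2, _ = np.linalg.eigh(A + E)
assert np.isclose((w2[n] - w[n]) / eps, v[i, n] ** 2, atol=1e-4)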
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)
    ok = tt.all(tt.nlinalg.diag(chol_x) > 0)
    chol_x = tt.switch(ok, chol_x, tt.fill_diagonal(chol_x, 1))
    dz = tt.switch(ok, dz, floatX(1))

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tt.tril(mtx) - tt.diag(tt.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        solve = tt.slinalg.Solve(A_structure="upper_triangular")
        return solve(outer.T, solve(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tt.tril(s + s.T) - tt.diag(tt.diagonal(s))
    else:
        grad = tt.triu(s + s.T) - tt.diag(tt.diagonal(s))
    return [tt.switch(ok, grad, floatX(np.nan))]
def test_autoregressor(dim=3, n_samples=5):
    ar = AutoRegressor(dim)
    ar.params['b'] += 0.1
    tparams = ar.set_tparams()

    X = T.matrix('X', dtype=floatX)
    nlp = ar.neg_log_prob(X)
    p = ar.get_prob(X, *ar.get_params())
    W = T.tril(ar.W, k=-1)
    z = T.dot(X, W) + ar.b

    x = np.random.randint(0, 2, size=(n_samples, dim)).astype(floatX)

    f = theano.function([X], [nlp, p, z, W])
    nlp_t, p_t, z_t, W_t = f(x)
    print x.shape, nlp_t.shape

    # reference computation in plain numpy
    z_np = np.zeros((n_samples, dim)).astype(floatX) + ar.params['b'][None, :]
    for i in xrange(dim):
        print i
        for j in xrange(i + 1, dim):
            print i, j
            z_np[:, i] += ar.params['W'][j, i] * x[:, j]
    assert np.allclose(z_t, z_np), (z_t, z_np)
    p_np = sigmoid(z_np)
    assert np.allclose(p_t, p_np), (p_t, p_np)

    p_np = np.clip(p_np, 1e-7, 1 - 1e-7)
    nlp_np = (- x * np.log(p_np) - (1 - x) * np.log(1 - p_np)).sum(axis=1)
    assert np.allclose(nlp_t, nlp_np), (nlp_t, nlp_np)

    samples, updates = ar.sample(n_samples=n_samples)
    f = theano.function([], samples, updates=updates)
    print f()
    assert False  # always fails; debugging leftover so the prints above show
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))]
    else:
        return [tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))]
def log_prob(self, X):
    """
    Evaluate the log-probability for the given samples.

    Parameters
    ----------
    X: T.tensor
        samples from X

    Returns
    -------
    log_p: T.tensor
        log-probabilities for the samples in X
    """
    n_X, = self.get_hyper_params(['n_X'])
    b, W = self.get_model_params(['b', 'W'])

    W = T.tril(W, k=-1)
    prob_X = self.sigmoid(T.dot(X, W) + b)
    log_prob = X * T.log(prob_X) + (1 - X) * T.log(1 - prob_X)
    log_prob = T.sum(log_prob, axis=1)

    return log_prob
def __init__(self, weights_init, biases_init, lower=False,
             weights_prec=0., biases_prec=0.,
             weights_mean=None, biases_mean=None):
    assert weights_init.ndim == 2, 'weights_init must be 2D array.'
    assert biases_init.ndim == 1, 'biases_init must be 1D array.'
    assert weights_init.shape[0] == biases_init.shape[0], \
        'Dimensions of weights_init and biases_init must be consistent.'
    self.lower = lower
    self.weights = th.shared(weights_init, name='W')
    self.weights_tri = (tt.tril(self.weights) if lower
                        else tt.triu(self.weights))
    self.biases = th.shared(biases_init, name='b')
    self.weights_prec = weights_prec
    self.biases_prec = biases_prec
    if weights_mean is None:
        weights_mean = np.eye(weights_init.shape[0])
    if biases_mean is None:
        biases_mean = np.zeros_like(biases_init)
    self.weights_mean = (np.tril(weights_mean) if lower
                         else np.triu(weights_mean))
    self.biases_mean = biases_mean
    super(TriangularAffineLayer, self).__init__(
        [self.weights, self.biases])
def test_autoregressor(dim=3, n_samples=5):
    ar = AutoRegressor(dim)
    ar.params['b'] += 0.1
    tparams = ar.set_tparams()

    X = T.matrix('X', dtype=floatX)
    nlp = ar.neg_log_prob(X)
    p = ar.get_prob(X, *ar.get_params())
    W = T.tril(ar.W, k=-1)
    z = T.dot(X, W) + ar.b

    x = np.random.randint(0, 2, size=(n_samples, dim)).astype(floatX)

    f = theano.function([X], [nlp, p, z, W])
    nlp_t, p_t, z_t, W_t = f(x)
    print x.shape, nlp_t.shape

    # reference computation in plain numpy
    z_np = np.zeros((n_samples, dim)).astype(floatX) + ar.params['b'][None, :]
    for i in xrange(dim):
        print i
        for j in xrange(i + 1, dim):
            print i, j
            z_np[:, i] += ar.params['W'][j, i] * x[:, j]
    assert np.allclose(z_t, z_np), (z_t, z_np)
    p_np = sigmoid(z_np)
    assert np.allclose(p_t, p_np, atol=1e-4), (p_t - p_np)

    p_np = np.clip(p_np, 1e-7, 1 - 1e-7)
    nlp_np = (- x * np.log(p_np) - (1 - x) * np.log(1 - p_np)).sum(axis=1)
    assert np.allclose(nlp_t, nlp_np, atol=1e-3), (nlp_t - nlp_np)

    samples, updates = ar.sample(n_samples=n_samples)
    f = theano.function([], samples, updates=updates)
    print f()
def __init__(self, rng, input, n_in, n_batch, d_bucket, activation,
             activation_deriv, w=None,
             index_permute=None, index_permute_reverse=None):
    srng = RandomStreams(seed=234)

    n_bucket = n_in / d_bucket + 1
    self.input = input

    # randomly permute input space
    if index_permute is None:
        index_permute = srng.permutation(n=n_in)
        index_permute_reverse = T.argsort(index_permute)
    self.index_permute = index_permute
    self.index_permute_reverse = index_permute_reverse

    permuted_input = input[:, index_permute]
    self.permuted_input = permuted_input

    # initialize reflection parameters
    if w is None:
        bound = numpy.sqrt(3. / d_bucket)
        w_values = numpy.asarray(
            rng.uniform(low=-bound, high=bound,
                        size=(n_bucket, d_bucket, d_bucket)),
            dtype=theano.config.floatX)
        w = theano.shared(value=w_values, name='w')
    self.w = w

    # compute outputs and Jacobians
    log_jacobian = T.alloc(0, n_batch)
    for b in xrange(n_bucket):
        bucket_size = d_bucket
        if b == n_bucket - 1:
            bucket_size = n_in - b * d_bucket
        x_b = self.permuted_input[:, b * d_bucket:b * d_bucket + bucket_size]
        w_b = self.w[b, :bucket_size, :bucket_size]

        # LU parameterization: W = Upper . Lower with unit diagonal on Lower,
        # so log|det W| is the sum of log-magnitudes of Upper's diagonal
        Upper = T.triu(w_b)
        Lower = T.tril(w_b)
        Lower = T.extra_ops.fill_diagonal(Lower, 1.)
        log_det_Upper = T.log(T.abs_(T.nlinalg.ExtractDiag()(Upper))).sum()

        W = T.dot(Upper, Lower)
        log_jacobian = log_jacobian + T.alloc(log_det_Upper, n_batch)

        lin_output_b = T.dot(x_b, W)
        if b > 0:
            lin_output = T.concatenate([lin_output, lin_output_b], axis=1)
        else:
            lin_output = lin_output_b
        if activation is not None:
            derivs = activation_deriv(lin_output_b)
            log_jacobian = log_jacobian + T.log(T.abs_(derivs)).sum(axis=1)

    self.log_jacobian = log_jacobian
    self.output = (
        lin_output if activation is None
        else activation(lin_output)
    )
    self.params = [w]
def log_single_component(c, mu, P, al, tr):
    # lower-triangular factor with positive diagonal via exp() on the diagonal
    L = T.tril(P[c, :, :], k=-1) + T.diag(T.exp(T.diagonal(P[c, :, :])))
    z = T.exp(-0.5 * T.sum(T.dot(T.transpose(L), (tr - mu[c, :]))**2)
              + T.log(al[c]) + T.log(T.nlinalg.det(L)) - D * log2pi / 2.)
    return z
def __init__(self, D, M, Q, Domain_number):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = np.random.randn(M, Q)
    self.test = Z_value
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=20,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                     n_in=20, n_out=Q, activation=T.nnet.relu,
                                     number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                     n_in=20, n_out=Q, activation=T.nnet.relu,
                                     number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    self.params.extend(mmd.params)

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    # mean and variance need different random draws, hence separate names
    eps_M = srng.normal((M, D))

    beta = T.exp(self.ls)
    # u's covariance is not diagonal, so build a triangular (Cholesky-style) factor
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, self.U)
    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = (self.log_mvn(self.X, mean_U, Covariance)
               - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, D, M, Q, Domain_number, D_Y, M_Y):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    self.Y = T.matrix('Y')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = np.random.randn(M, Q)
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=20,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                     n_in=20, n_out=Q, activation=T.nnet.relu,
                                     number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                     n_in=20, n_out=Q, activation=T.nnet.relu,
                                     number='_S')

    #################################################################################
    ### model computation, X side
    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    # mean and variance need different random draws, hence separate names
    eps_M = srng.normal((M, D))

    beta = T.exp(self.ls)
    # u's covariance is not diagonal, so build a triangular (Cholesky-style) factor
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, self.U)
    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    ##############################################################################################
    ### Y-side computation
    ker_Y = kernel(Q, number='_Y')

    muY_value = np.random.randn(M_Y, D_Y)
    SigmaY_b_value = np.zeros((M_Y, M_Y)) + np.log(0.01)
    ZY_value = np.random.randn(M_Y, Q)
    lsY_value = np.zeros(1) + np.log(0.1)

    self.muY = theano.shared(value=muY_value, name='muY', borrow=True)
    self.SigmaY_b = theano.shared(value=SigmaY_b_value, name='SigmaY_b', borrow=True)
    self.ZY = theano.shared(value=ZY_value, name='ZY', borrow=True)
    self.lsY = theano.shared(value=lsY_value, name='lsY', borrow=True)

    epsY_NQ = srng.normal((N, Q))
    epsY_M = srng.normal((M_Y, D_Y))

    betaY0 = T.exp(self.lsY)
    betaY = T.tile(betaY0, N)
    # as above, a triangular factor for the non-diagonal covariance of u
    SigmaY = T.tril(self.SigmaY_b - T.diag(T.diag(self.SigmaY_b))
                    + T.diag(T.exp(T.diag(self.SigmaY_b))))

    # scale transformation
    muY_scaled, SigmaY_scaled = ker_Y.sf2**0.5 * self.muY, ker_Y.sf2**0.5 * SigmaY

    XtildaY = m + S * epsY_NQ
    self.UY = muY_scaled + SigmaY_scaled.dot(epsY_M)

    KmmY = ker_Y.RBF(self.ZY)
    KmmInvY = sT.matrix_inverse(KmmY)

    KmnY = ker_Y.RBF(self.ZY, XtildaY)
    KnnY = ker_Y.RBF(XtildaY)

    KtildaY = KnnY - T.dot(KmnY.T, T.dot(KmmInvY, KmnY))
    KintervalY = T.dot(KmmInvY, KmnY)

    mean_UY = T.dot(KintervalY.T, self.UY)
    betaIY = T.diag(betaY)
    CovarianceY = betaIY

    ##############################################################################################
    ### store the parameters
    self.params = []
    self.params_X = [self.mu, self.Sigma_b, self.Z, self.ls]
    self.params_Y = [self.muY, self.SigmaY_b, self.ZY, self.lsY]

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params_X.extend(ker.params)
    self.params_X.extend(mmd.params)
    self.params_Y.extend(ker_Y.params)

    self.global_params_X = {}
    for i in self.params_X:
        self.global_params_X[str(i)] = i
    self.global_params_Y = {}
    for i in self.params_Y:
        self.global_params_Y[str(i)] = i

    self.params.extend(self.params_X)
    self.params.extend(self.params_Y)
    self.params.extend(self.loc_params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    ###############################################################################################
    ### final likelihood
    self.LL = (self.log_mvn(self.X, mean_U, Covariance)
               - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    self.LLY = (self.log_mvn(self.Y, mean_UY, CovarianceY)
                - 0.5 * T.sum(T.dot(betaIY, KtildaY)))
    self.KL_UY = -self.KLD_U(muY_scaled, SigmaY_scaled, KmmY, KmmInvY)

    self.KL_X = -self.KLD_X(m, S)
def __init__(self, params, correct, samples=20, batch_size=None):
    ker = kernel()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size

    # file the model is saved to
    model_file_name = 'model2' + '.save'

    # load a previously built model if one exists
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, m, S_b, mu, Sigma_b, Z, eps_NQ, eps_M =\
        T.dmatrices('X', 'Y', 'X_test', 'm', 'S_b', 'mu', 'Sigma_b', 'Z',
                    'eps_NQ', 'eps_M')

    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # constrain the variables to positive values
    beta = T.exp(ls[0])
    #beta=T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1 + Q])

    S = T.exp(S_b)
    #Sigma=T.exp(self.Sigma_b)

    # x's covariance is diagonal, so no square root is needed; u's is not,
    # so build a triangular (Cholesky-style) factor
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b)) + T.diag(T.exp(T.diag(Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print('Setting up cache...')

    Kmm = ker.RBF(sf2, l, Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet=theano.sandbox.linalg.det(Kmm)

    #KmmInv_cache = sT.matrix_inverse(Kmm)
    #self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    #self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # note: this compiles KmmInv_cache as a function of Z and lhyp, i.e. the
    # inverse becomes a function of Z and the hyperparameters
    #self.update_KmmInv_cache()  # evaluates KmmInv with concrete values
    # derivative functions of the inverse matrix:
    #self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #               'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Modeling...')

    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta

    LL = (self.log_mvn(X, mean_U, Covariance)
          - 0.5 * beta * T.sum((T.eye(N) * Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print('Compiling model ...')

    inputs = {'X': X, 'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
              'lhyp': lhyp, 'ls': ls, 'eps_M': eps_M, 'eps_NQ': eps_NQ}

    z = 0.0 * sum([T.sum(v) for v in inputs.values()])  # solve a bug with derivative wrt inputs not in the graph
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n,
                                 on_unused_input='ignore')
              for n, f in zip(['X', 'U', 'LL', 'KL_U', 'KL_X'],
                              [X, U, LL, KL_U, KL_X])}

    wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
           'lhyp': lhyp, 'ls': ls}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def tril_and_halve_diagonal(mtx):
    """Extracts lower triangle of square matrix and halves diagonal."""
    return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
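# Hedged illustration of the Phi(.) helper above on a concrete matrix: it
# keeps the lower triangle and halves the diagonal, which is exactly the form
# required by the reverse-mode Cholesky update in the snippets above.
import numpy as np

m = np.array([[2.0, 5.0],
              [3.0, 4.0]])
phi = np.tril(m) - np.diag(np.diag(m) / 2.0)
# phi == [[1.0, 0.0],
#         [3.0, 2.0]]
assert np.allclose(phi, [[1.0, 0.0], [3.0, 2.0]])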
def __init__(self, input, D_in, D_out, num_MC, inducing_number, fixed_z,
             Domain_number=None, Domain_consideration=True, number="1",
             kernel_name='X'):
    Xtilda = input
    self.N = Xtilda.shape[1]
    D = D_out
    Q = D_in
    M = inducing_number

    ################################################################################
    # set initial values
    ker = kernel(Q, kernel_name)
    self.kern = ker
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number, borrow=True)
    self.Z = theano.shared(value=fixed_z, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    ##############################################################################
    # parameter lists
    self.params = [self.mu, self.Sigma_b, self.ls]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls, ker.params]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    #############################################################################
    # set random seed
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    srng = RandomStreams(seed=234)
    # mean and variance need different random draws, hence separate names
    eps_M = srng.normal((num_MC, M, D))
    eps_ND = srng.normal((num_MC, self.N, D))

    #################################################################
    # set constraints
    self.beta = T.exp(self.ls)
    # u's covariance is not diagonal, so build a triangular (Cholesky-style) factor
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    ##################################################################
    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    ##################################################################
    # if the model is a latent variable model, we make MC samples of latent X
    #Xtilda = eps_NQ * S[None,:,:] + m[None,:,:]
    #Xtilda, updates = theano.scan(fn=lambda a: m+S*a,
    #                              sequences=[eps_NQ])

    ###############################
    # U holds the posterior samples
    self.U, updates = theano.scan(fn=lambda a: mu_scaled + Sigma_scaled.dot(a),
                                  sequences=[eps_M])

    ################################
    # inducing point prior
    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)

    ###############################
    # for the MC calculation, we copy the input X
    Knn, updates = theano.scan(fn=lambda a: self.kern.RBF(a),
                               sequences=[Xtilda])
    Kmn, updates = theano.scan(fn=lambda a: self.kern.RBF(self.Z, a),
                               sequences=[Xtilda])

    ########################################
    # make the posterior p(F|U) and its variance
    Ktilda, updates = theano.scan(fn=lambda a, b: a - T.dot(b.T, T.dot(KmmInv, b)),
                                  sequences=[Knn, Kmn])

    ##################################################
    # get the posterior samples from p(F|U), shape MC*N*D_out
    #F, updates = theano.scan(fn=lambda a,b,c,d: T.dot(a.T,T.dot(KmmInv,b)) + T.dot(T.maximum(c, 1e-16)**0.5,d),
    #                         sequences=[Kmn,self.U,Ktilda,eps_ND])
    F, updates = theano.scan(
        fn=lambda a, c, d: T.dot(a.T, T.dot(KmmInv, mu_scaled))
                           + T.dot(T.maximum(c, 1e-16)**0.5, d),
        sequences=[Kmn, Ktilda, eps_ND])

    ##################################################
    #Kinterval=T.dot(KmmInv,Kmn)
    self.mean_U = F
    #mean_U=T.dot(Kinterval.T,self.U)
    #A=Kinterval.T
    #Sigma_tilda=Ktilda+T.dot(A,T.dot(Sigma_scaled,A.T))
    #mean_tilda=T.dot(A,mu_scaled)
    #self.mean_U=mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5,eps_ND)

    ###################################################################
    eps_ND_F = srng.normal((num_MC, self.N, D))
    added_noise = T.tile(self.beta, (num_MC, self.N, D))
    self.output = added_noise * eps_ND_F + self.mean_U

    #self.KL_X = -self.KLD_X(m,S)
    self.KL_U = self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, rng, target, input_m, input_S, n_in, n_out, inducing_number,
             Domain_number, Xlabel, liklihood="Gaussian",
             Domain_consideration=True, number="1"):
    m = input_m
    S_0 = input_S
    N = m.shape[0]
    D = n_out
    Q = n_in
    M = inducing_number

    # set initial values
    ker = kernel(Q)
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    Z_value = np.random.randn(M, Q)
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number, borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    # mean and variance need different random draws, hence separate names
    eps_M = srng.normal((M, D))
    eps_ND = srng.normal((N, D))

    beta = T.exp(self.ls)
    # u's covariance is not diagonal, so build a triangular (Cholesky-style) factor
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    Knn = ker.RBF(Xtilda)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    #F = T.dot(Kmn.T,T.dot(KmmInv,self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5,eps_ND)

    Kinterval = T.dot(KmmInv, Kmn)
    A = Kinterval.T
    Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
    mean_tilda = T.dot(A, mu_scaled)
    #mean_U=F
    #mean_U=T.dot(Kinterval.T,self.U)
    mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)

    betaI = T.diag(T.dot(Xlabel, beta))
    Covariance = betaI

    self.output = mean_U
    self.LL = self.log_mvn(target, mean_U, Covariance) / N  # - 0.5*T.sum(T.dot(betaI,Ktilda))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, params, correct, Xinfo, samples=500, batch_size=None):
    ker = kernel()
    mmd = MMD()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size
    self.Xlabel_value = Xinfo["Xlabel_value"]
    self.Weight_value = Xinfo["Weight_value"]

    # file the model is saved to
    model_file_name = 'model_MMD_kernel' + '.save'

    # load a previously built model if one exists
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, m, S_b, mu, Sigma_b, Z, eps_NQ, eps_M =\
        T.dmatrices('X', 'Y', 'X_test', 'm', 'S_b', 'mu', 'Sigma_b', 'Z',
                    'eps_NQ', 'eps_M')
    Xlabel = T.dmatrix('Xlabel')
    Zlabel = T.dmatrix('Zlabel')

    # labels are probabilities, so they must be positive and normalized
    Zlabel_T = T.exp(Zlabel) / T.sum(T.exp(Zlabel), 1)[:, None]

    Weight = T.dmatrix('Weight')

    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')
    ga = T.dvector('ga')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # constrain the variables to positive values
    beta = T.exp(ls)
    gamma = T.exp(ga[0])
    #beta=T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1 + Q])

    S = T.exp(S_b)
    #Sigma=T.exp(self.Sigma_b)

    # x's covariance is diagonal, so no square root is needed; u's is not,
    # so build a triangular (Cholesky-style) factor
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b)) + T.diag(T.exp(T.diag(Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print('Setting up cache...')

    Kmm = ker.RBF(sf2, l, Z)
    Kmm = mmd.MMD_kenel_Xonly(gamma, Zlabel_T, Kmm, Weight)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet=theano.sandbox.linalg.det(Kmm)

    #KmmInv_cache = sT.matrix_inverse(Kmm)
    #self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    #self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # note: this compiles KmmInv_cache as a function of Z and lhyp, i.e. the
    # inverse becomes a function of Z and the hyperparameters
    #self.update_KmmInv_cache()  # evaluates KmmInv with concrete values
    # derivative functions of the inverse matrix:
    #self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #               'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Modeling...')

    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(gamma, Zlabel_T, Xlabel, Kmn, Weight)

    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Knn = mmd.MMD_kenel_Xonly(gamma, Xlabel, Knn, Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    betaI = T.diag(T.dot(Xlabel, beta))
    Covariance = betaI

    LL = (self.log_mvn(X, mean_U, Covariance)
          - 0.5 * T.sum(T.dot(betaI, Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print('Compiling model ...')

    inputs = {'X': X, 'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
              'lhyp': lhyp, 'ls': ls, 'eps_M': eps_M, 'eps_NQ': eps_NQ,
              'ga': ga, 'Zlabel': Zlabel, 'Weight': Weight, 'Xlabel': Xlabel}

    z = 0.0 * sum([T.sum(v) for v in inputs.values()])  # solve a bug with derivative wrt inputs not in the graph
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n,
                                 on_unused_input='ignore')
              for n, f in zip(['X', 'U', 'LL', 'KL_U', 'KL_X'],
                              [X, U, LL, KL_U, KL_X])}

    wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
           'lhyp': lhyp, 'ls': ls, 'ga': ga, 'Zlabel': Zlabel}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(10000)
        pickle.dump([self.f, self.g], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def get_prob(self, x, W, b):
    W = T.tril(W, k=-1)
    # clamp probabilities away from exactly 0 and 1 for numerical stability
    p = T.nnet.sigmoid(T.dot(x, W) + b) * 0.9999 + 0.000005
    return p
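# Hedged note on the clamp above: the sigmoid's range [0, 1] maps affinely
# into [5e-06, 0.999905], so log(p) and log(1 - p) stay finite downstream.
import numpy as np

p = np.array([0.0, 1.0]) * 0.9999 + 0.000005
assert np.allclose(p, [5.0e-06, 0.999905])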
def constructL(ltri):
    tmp = T.transpose(T.tril(T.ones((d, d)), -1))
    lower_tril_indices = tmp.nonzero()
    L = T.transpose(T.set_subtensor(tmp[lower_tril_indices], ltri))
    return L
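# Hedged NumPy analogue of constructL above (with d = 3 as an example):
# unpack a length d*(d-1)/2 vector into a strictly lower-triangular matrix.
# The transpose trick means entries fill the strictly lower triangle in
# column-major order.
import numpy as np

d = 3
ltri = np.array([1.0, 2.0, 3.0])
tmp = np.tril(np.ones((d, d)), -1).T   # strictly upper-triangular mask
tmp[tmp.nonzero()] = ltri              # filled in row-major order of tmp
L = tmp.T
# L == [[0., 0., 0.],
#       [1., 0., 0.],
#       [2., 3., 0.]]
assert np.allclose(L, [[0, 0, 0], [1, 0, 0], [2, 3, 0]])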
def __init__(self, input, D_in, D_out, num_MC, inducing_number,
             Domain_number=None, Domain_consideration=True, number="1",
             kernel_name='X'):
    Xtilda = input
    self.N = Xtilda.shape[1]
    D = D_out
    Q = D_in
    M = inducing_number

    ################################################################################
    # set initial values
    ker = kernel(Q, kernel_name)
    self.kern = ker
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    Z_value = np.random.randn(M, Q)
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number, borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    ##############################################################################
    # parameter lists
    self.params = [self.mu, self.Sigma_b, self.ls, self.Z]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls, ker.params]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    #############################################################################
    # set random seed
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    srng = RandomStreams(seed=234)
    # mean and variance need different random draws, hence separate names
    eps_M = srng.normal((num_MC, M, D))
    eps_ND = srng.normal((num_MC, self.N, D))

    #################################################################
    # set constraints
    self.beta = T.exp(self.ls)
    # u's covariance is not diagonal, so build a triangular (Cholesky-style) factor
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    ##################################################################
    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    ##################################################################
    # if the model is a latent variable model, we make MC samples of latent X
    #Xtilda = eps_NQ * S[None,:,:] + m[None,:,:]
    #Xtilda, updates = theano.scan(fn=lambda a: m+S*a,
    #                              sequences=[eps_NQ])

    ###############################
    # U holds the posterior samples
    self.U, updates = theano.scan(
        fn=lambda a: mu_scaled + Sigma_scaled.dot(a),
        sequences=[eps_M])

    ################################
    # inducing point prior
    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)

    ###############################
    # for the MC calculation, we copy the input X
    Knn, updates = theano.scan(fn=lambda a: self.kern.RBF(a),
                               sequences=[Xtilda])
    Kmn, updates = theano.scan(fn=lambda a: self.kern.RBF(self.Z, a),
                               sequences=[Xtilda])

    ########################################
    # make the posterior p(F|U) and its variance
    Ktilda, updates = theano.scan(
        fn=lambda a, b: a - T.dot(b.T, T.dot(KmmInv, b)),
        sequences=[Knn, Kmn])

    ##################################################
    # get the posterior samples from p(F|U), shape MC*N*D_out
    F, updates = theano.scan(
        fn=lambda a, b, c, d: T.dot(a.T, T.dot(KmmInv, b))
                              + T.dot(T.maximum(c, 1e-16)**0.5, d),
        sequences=[Kmn, self.U, Ktilda, eps_ND])

    ##################################################
    #Kinterval=T.dot(KmmInv,Kmn)
    self.mean_U = F
    #mean_U=T.dot(Kinterval.T,self.U)
    #A=Kinterval.T
    #Sigma_tilda=Ktilda+T.dot(A,T.dot(Sigma_scaled,A.T))
    #mean_tilda=T.dot(A,mu_scaled)
    #self.mean_U=mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5,eps_ND)

    ###################################################################
    self.output = self.mean_U
    #self.KL_X = -self.KLD_X(m,S)
    self.KL_U = self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, params, sx2=1, linear_model=False, samples=20, use_hat=False):
    ker, self.samples, self.params, self.KmmInv = kernel(), samples, params, {}
    self.use_hat = use_hat

    model_file_name = 'model' + ('_hat' if use_hat else '') \
        + ('_linear' if linear_model else '') + '.save'

    try:
        print 'Trying to load model...'
        with open(model_file_name, 'rb') as file_handle:
            obj = cPickle.load(file_handle)
            self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d = obj
            self.update_KmmInv_cache()
            print 'Loaded!'
        return
    except:
        print 'Failed. Creating a new model...'

    Y, Z, m, ls, mu, lL, eps_MK, eps_NQ, eps_NK, KmmInv = \
        T.dmatrices('Y', 'Z', 'm', 'ls', 'mu', 'lL', 'eps_MK', 'eps_NQ',
                    'eps_NK', 'KmmInv')
    lhyp = T.dvector('lhyp')
    (M, K), N, Q = mu.shape, m.shape[0], Z.shape[1]

    s, sl2, sf2, l = T.exp(ls), T.exp(lhyp[0]), T.exp(lhyp[1]), T.exp(lhyp[2:2 + Q])
    L = T.tril(lL - T.diag(T.diag(lL)) + T.diag(T.exp(T.diag(lL))))

    print 'Setting up cache...'
    Kmm = ker.RBF(sf2, l, Z) if not linear_model else ker.LIN(sl2, Z)
    KmmInv_cache = sT.matrix_inverse(Kmm)
    self.f_Kmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    self.update_KmmInv_cache()
    self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z),
                                        name='dKmm_dZ'),
                   'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp),
                                           name='dKmm_dlhyp')}

    print 'Setting up model...'
    if not self.use_hat:
        mu_scaled, L_scaled = sf2**0.5 * mu, sf2**0.5 * L
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)

        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        A = KmmInv.dot(Kmn)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = A.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(KmmInv.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                           + K * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T)) - M
                                  - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                                  + 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0
        #KL_U = -0.5 * T.sum(T.sum(mu_scaled * KmmInv.dot(mu_scaled), 0) + T.sum(KmmInv * L_scaled.dot(L_scaled.T)) - M
        #    - 2.0*T.sum(T.log(T.diag(L_scaled))) + 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm))))) if not linear_model else 0
    else:
        # mu_scaled, L_scaled = mu / sf2**0.5, L / sf2**0.5
        mu_scaled, L_scaled = mu / sf2, L / sf2
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)

        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = Kmn.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(Kmm.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                           + K * (T.sum(Kmm.T * L_scaled.dot(L_scaled.T)) - M
                                  - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                                  - 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0

    KL_X_all = -0.5 * T.sum((m**2.0 + s**2.0) / sx2 - 1.0 - 2.0 * ls + T.log(sx2), 1)
    KL_X = T.sum(KL_X_all)

    print 'Compiling...'
    inputs = {'Y': Y, 'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL,
              'lhyp': lhyp, 'KmmInv': KmmInv,
              'eps_MK': eps_MK, 'eps_NQ': eps_NQ, 'eps_NK': eps_NK}
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])  # solve a bug with derivative wrt inputs not in the graph
    f = zip(['X', 'U', 'S', 'LS', 'KL_U', 'KL_X', 'KL_X_all'],
            [X, U, S, LS, KL_U, KL_X, KL_X_all])
    self.f = {n: theano.function(inputs.values(), f + z, name=n,
                                 on_unused_input='ignore') for n, f in f}
    g = zip(['LS', 'KL_U', 'KL_X'], [LS, KL_U, KL_X])
    wrt = {'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp,
           'KmmInv': KmmInv}
    self.g = {vn: {gn: theano.function(inputs.values(), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore') for gn, gv in g}
              for vn, vv in wrt.iteritems()}

    with open(model_file_name, 'wb') as file_handle:
        print 'Saving model...'
        sys.setrecursionlimit(2000)
        cPickle.dump([self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d],
                     file_handle, protocol=cPickle.HIGHEST_PROTOCOL)
def constructL(ltri):
    # Build a strictly lower-triangular matrix whose below-diagonal entries
    # come from the flat vector ltri; the dimension d is a free variable
    # assumed to be defined in the enclosing scope.
    tmp = T.transpose(T.tril(T.ones((d, d)), -1))
    lower_tril_indices = tmp.nonzero()
    L = T.transpose(T.set_subtensor(tmp[lower_tril_indices], ltri))
    return L
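# Hypothetical usage sketch for constructL, assuming d = 3 is in scope.
# The entries of ltri are consumed in the row-major order of the transposed
# mask's nonzero positions, which corresponds to filling L column by column.
import numpy as np
import theano
import theano.tensor as T

d = 3
ltri = T.dvector('ltri')
f = theano.function([ltri], constructL(ltri))
print(f(np.array([1., 2., 3.])))
# [[ 0.  0.  0.]
#  [ 1.  0.  0.]
#  [ 2.  3.  0.]]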
def createGradientFunctions(self):
    # Create the Theano variables
    W1, W2, W3, W4, W5, W7, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W7", "x", "eps")
    # Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b7 = T.dcols("b1", "b2", "b3", "b4", "b5", "b7")

    if self.continuous_data:
        h_encoder = T.nnet.softplus(T.dot(W1, x) + b1)
    else:
        h_encoder = T.tanh(T.dot(W1, x) + b1)

    mu_encoder = T.dot(W2, h_encoder) + b2
    log_sigma_encoder = 0.5 * (T.dot(W3, h_encoder) + b3)

    # NB: several symbols below (log_L_u, sf2, ell, X, X_u, K_fu, m_u,
    # sigma_z, eps_u, eps_z, W6, b6, W8, b8, self.dimZ, nlinalg, slinalg,
    # th, np) are assumed to be defined/imported elsewhere in the class;
    # this snippet is not self-contained.
    L_u = T.tril(log_L_u - T.diag(T.diag(log_L_u)) + T.diag(T.exp(T.diag(log_L_u))))
    # To do: better ways of parameterising the covariance
    # (see: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.31.494&rep=rep1&type=pdf)

    # Compute GP objects
    K_ff = self.ker.RBF(sf2, ell, X)
    K_uu = self.ker.RBF(sf2, ell, X_u)
    K_uu_inv = nlinalg.matrix_inverse(K_uu)
    L_f = slinalg.cholesky(K_ff - T.dot(K_fu, T.dot(K_uu_inv, K_fu.T)))

    # f_i make up the columns of f, similarly for m_u_i
    u = m_u + T.dot(L_u, eps_u)  # n_induce iid pseudo inducing sets
    f = T.dot(K_fu, T.dot(K_uu_inv, u)) + T.dot(L_f, X)

    # Find the hidden variable z
    # log_sigma_lhood = 0.5*(T.dot(W9, f) + b9)  # the var GP maps to both mean *and* covariance
    sigma_var_lhood = sigma_z**2 * T.eye(self.dimZ)
    L_z = slinalg.cholesky(sigma_var_lhood)
    z = f + T.dot(L_z, eps_z)
    # z = mu_encoder + T.exp(log_sigma_encoder)*eps

    prior = 0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder**2 - T.exp(2 * log_sigma_encoder))

    # Set up decoding layer
    if self.continuous_data:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        mu_decoder = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W7, h_decoder) + b7)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder))**2)
        gradvariables = [W1, W2, W3, W4, W5, W7, b1, b2, b3, b4, b5, b7, sf2, ell, X_u, m_u, L_u]
    else:
        h_decoder = T.tanh(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5, sf2, ell, X_u, m_u, L_u]

    # Set up auxiliary layer; the original passed the raw list [x, z] to
    # T.dot, which is assumed to mean concatenation of x and z
    if self.continuous_data:
        h_auxiliary = T.nnet.softplus(T.dot(W6, T.concatenate([x, z], axis=0)) + b6)
        mu_auxiliary = T.nnet.sigmoid(T.dot(W7, h_auxiliary) + b7)
        log_sigma_auxiliary = 0.5 * (T.dot(W8, h_auxiliary) + b8)
    else:
        pass  # to do

    logp = logpxz + prior

    # Compute KL terms
    # KL_qp = -0.5*T.sum(1.0 + 2*log_sigma_lhood - f**2 - T.exp(2*log_sigma_lhood))
    KL_qp = 0.5 * (T.dot(f.T, f)
                   + T.trace(sigma_var_lhood + T.log(T.eye(self.dimZ)) - T.log(sigma_var_lhood))
                   - self.dimZ)
    # the original mixed mu_decoder into the second factor of the quadratic
    # form; mu_encoder is assumed here so both factors use the same difference
    KL_qr = 0.5 * (T.dot((mu_auxiliary - mu_encoder).T,
                         T.dot(T.diag(1.0 / T.exp(log_sigma_auxiliary)), mu_auxiliary - mu_encoder))
                   + T.trace(T.dot(T.diag(1.0 / T.exp(log_sigma_auxiliary)), T.dot(L_u, L_u.T))
                             + log_sigma_auxiliary - log_sigma_encoder)
                   - self.dimXf - self.dimf)

    # Compute bound and all the gradients
    stoch_bound = logpxz - KL_qp - KL_qr
    derivatives = T.grad(stoch_bound, gradvariables)

    # Add the lowerbound so we can keep track of results
    derivatives.append(stoch_bound)

    self.gradientfunction = th.function(gradvariables + [x, eps_u, eps_z, X], derivatives,
                                        on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps_u, eps_z, X], stoch_bound,
                                          on_unused_input='ignore')
    self.zfunction = th.function(gradvariables + [x, eps_u, eps_z, X], z, on_unused_input='ignore')
def neg_log_prob(self, x, c):
    # The strictly lower-triangular mask (k=-1) zeroes the diagonal and
    # upper triangle of War, so each output unit sees only a fixed subset
    # of the other inputs (the autoregressive structure).
    W = T.tril(self.War, k=-1)
    p = T.nnet.sigmoid(T.dot(x, W) + self.bar + c)
    return self.f_neg_log_prob(x, p)
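# A small, hypothetical check (separate from the class above) of what the
# strictly-lower-triangular mask buys: with W = T.tril(War, k=-1), column j
# of T.dot(x, W) only receives contributions from inputs i > j, so
# perturbing input unit j leaves outputs j, j+1, ... unchanged.
import numpy as np
import theano
import theano.tensor as T

x = T.dmatrix('x')
War = T.dmatrix('War')
p = T.nnet.sigmoid(T.dot(x, T.tril(War, k=-1)))
f = theano.function([x, War], p)

rng = np.random.RandomState(0)
xv, Wv = rng.randn(4, 5), rng.randn(5, 5)
xv2 = xv.copy()
xv2[:, 2] += 1.0                           # perturb input unit 2
p1, p2 = f(xv, Wv), f(xv2, Wv)
assert np.allclose(p1[:, 2:], p2[:, 2:])   # output units 2..4 are unaffected
assert not np.allclose(p1[:, :2], p2[:, :2])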
def __init__(self, params, correct, samples=500, batch_size=None):
    ker = kernel()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size

    # file the model is saved to
    model_file_name = 'model2' + '.save'

    # load a previously built model if one exists
    try:
        print ('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print ('Loaded!')
        return
    except:
        print ('Failed. Creating a new model...')

    X, Y, X_test, mu, Sigma_b, Z, eps_NQ, eps_M = \
        T.dmatrices('X', 'Y', 'X_test', 'mu', 'Sigma_b', 'Z', 'eps_NQ', 'eps_M')
    Wx, Ws, Wu = T.dmatrices('Wx', 'Ws', 'Wu')
    bx, bs, bu = T.dvectors('bx', 'bs', 'bu')
    gamma_x, beta_x, gamma_u, beta_u, gamma_s, beta_s = \
        T.dvectors("gamma_x", "beta_x", "gamma_u", "beta_u", "gamma_s", "beta_s")
    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # constrain variables to be positive by optimising their logarithms
    beta = T.exp(ls[0])
    # beta = T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1+Q])
    # Sigma = T.exp(self.Sigma_b)

    # The covariance of X is diagonal, so no square root is needed there.
    # The covariance of U is not diagonal, so we need a triangular factor,
    # built Cholesky-style from the unconstrained matrix Sigma_b.
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b)) + T.diag(T.exp(T.diag(Sigma_b))))

    # rescale by the kernel variance
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    # build the hidden layers
    out1 = self.neural_net_predict(Wx, bx, gamma_x, beta_x, X)
    m = self.neural_net_predict(Wu, bu, gamma_u, beta_u, out1)
    S = self.neural_net_predict(Ws, bs, gamma_s, beta_s, out1)
    # out1 = T.dot(X, Wx) + bx
    # m = T.dot(out1, Wu) + bu
    # S = T.dot(out1, Ws) + bs
    S = T.exp(S)
    S = T.sqrt(S)

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print ('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z)
    KmmInv = sT.matrix_inverse(Kmm)
    # KmmDet = theano.sandbox.linalg.det(Kmm)
    # KmmInv_cache = sT.matrix_inverse(Kmm)
    # self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    # self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # Review: this compiles KmmInv_cache with Z and lhyp as arguments, i.e.
    # the inverse becomes a function of Z and the hyperparameters.
    # self.update_KmmInv_cache()  # actually evaluates KmmInv with numeric values
    # Derivative functions for the inverse matrix:
    # self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #                'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print ('Modeling...')
    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta
    LL = (self.log_mvn(X, mean_U, Covariance) - 0.5 * beta * T.sum((T.eye(N) * Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print ('Compiling model ...')
    inputs = {'X': X, 'Z': Z, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls,
              'eps_M': eps_M, 'eps_NQ': eps_NQ,
              "Wx": Wx, "bx": bx, "Wu": Wu, "bu": bu, "Ws": Ws, "bs": bs,
              "gamma_x": gamma_x, "beta_x": beta_x, "gamma_u": gamma_u, "beta_u": beta_u,
              "gamma_s": gamma_s, "beta_s": beta_s}
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])  # workaround: keeps every input in the graph so T.grad does not fail on unused inputs
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n, on_unused_input='ignore')
              for n, f in zip(['Xtilda', 'U', 'LL', 'KL_U', 'KL_X'], [Xtilda, U, LL, KL_U, KL_X])}
    wrt = {'Z': Z, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls,
           "Wx": Wx, "bx": bx, "Wu": Wu, "bu": bu, "Ws": Ws, "bs": bs,
           "gamma_x": gamma_x, "beta_x": beta_x, "gamma_u": gamma_u, "beta_u": beta_u,
           "gamma_s": gamma_s, "beta_s": beta_s}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn, on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print ('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g], file_handle, protocol=pickle.HIGHEST_PROTOCOL)
def step_neg_log_prob(self, x, c, War, bar):
    # Same masked autoregressive computation as neg_log_prob, but with the
    # parameters passed in explicitly (e.g. for use as a scan step).
    W = T.tril(War, k=-1)
    p = T.nnet.sigmoid(T.dot(x, W) + bar + c)
    return self.f_neg_log_prob(x, p)
def get_output_for(self, input_, **kwargs):
    W = T.tril(self.W, -1)
    interactions = T.batched_dot(T.dot(input_, W), input_)
    # clamp elementwise before the square root; the original used
    # T.max(interactions, 1e-6), which treats 1e-6 as an axis argument
    # rather than a floor value
    interactions = T.sqrt(T.maximum(interactions, 1e-6))
    return self.nonlinearity(input_ + interactions)
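# A hypothetical side-by-side (separate from the layer above) of why the fix
# matters: T.maximum is an elementwise clamp, while T.max reduces over an
# axis, so the two are not interchangeable.
import numpy as np
import theano
import theano.tensor as T

v = T.dmatrix('v')
clamped = theano.function([v], T.maximum(v, 1e-6))  # elementwise floor
reduced = theano.function([v], T.max(v, axis=1))    # per-row maximum
a = np.array([[-1., 2.], [3., -4.]])
assert np.allclose(clamped(a), [[1e-6, 2.], [3., 1e-6]])
assert np.allclose(reduced(a), [2., 3.])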
def tril_and_halve_diagonal(mtx):
    """Extracts lower triangle of square matrix and halves diagonal."""
    return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.0)
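# A quick, hypothetical round-trip check: for a symmetric matrix S, halving
# the diagonal makes L + L.T reconstruct S exactly, which is the usual
# reason for this helper.
import numpy as np
import theano
import theano.tensor as tensor

mtx = tensor.dmatrix('mtx')
L = tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.0)
roundtrip = theano.function([mtx], L + L.T)

A = np.random.randn(4, 4)
S = A + A.T
assert np.allclose(roundtrip(S), S)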
def __init__(self, rng, input, n_in, n_batch, d_bucket, activation, activation_deriv,
             w=None, index_permute=None, index_permute_reverse=None):
    srng = RandomStreams(seed=234)
    n_bucket = n_in / d_bucket + 1  # integer division (Python 2)
    self.input = input

    # randomly permute input space
    if index_permute is None:
        index_permute = srng.permutation(n=n_in)  # numpy.random.permutation(n_in)
        index_permute_reverse = T.argsort(index_permute)
    self.index_permute = index_permute
    self.index_permute_reverse = index_permute_reverse

    permuted_input = input[:, index_permute]
    self.permuted_input = permuted_input

    # initialize matrix parameters
    if w is None:
        bound = numpy.sqrt(3. / d_bucket)
        w_values = numpy.asarray(rng.uniform(low=-bound, high=bound,
                                             size=(n_bucket, d_bucket, d_bucket)),
                                 dtype=theano.config.floatX)
        w = theano.shared(value=w_values, name='w')
    self.w = w

    # compute outputs and Jacobians
    log_jacobian = T.alloc(0, n_batch)
    for b in xrange(n_bucket):
        bucket_size = d_bucket
        if b == n_bucket - 1:
            bucket_size = n_in - b * d_bucket
        if b > 0:
            prev_input = x_b
        # here we warp the previous bucket of inputs and add to the new
        # input; m_b is assumed to be the warped prev_input, and its
        # computation is omitted in this snippet
        x_b = self.permuted_input[:, b * d_bucket:b * d_bucket + bucket_size]
        w_b = self.w[b, :bucket_size, :bucket_size]
        if b > 0:
            x_b_plus = x_b + m_b
        else:
            x_b_plus = x_b

        # LU-style parameterization: W = Upper . Lower with unit-diagonal
        # Lower, so log|det W| = sum(log|diag(Upper)|)
        Upper = T.triu(w_b)
        Lower = T.tril(w_b)
        Lower = T.extra_ops.fill_diagonal(Lower, 1.)
        log_det_Upper = T.log(T.abs_(T.nlinalg.ExtractDiag()(Upper))).sum()

        W = T.dot(Upper, Lower)
        log_jacobian = log_jacobian + T.alloc(log_det_Upper, n_batch)
        lin_output_b = T.dot(x_b_plus, W)
        if b > 0:
            lin_output = T.concatenate([lin_output, lin_output_b], axis=1)
        else:
            lin_output = lin_output_b
        if activation is not None:
            derivs = activation_deriv(lin_output_b)
            log_jacobian = log_jacobian + T.log(T.abs_(derivs)).sum(axis=1)

    self.log_jacobian = log_jacobian
    self.output = (lin_output[:, index_permute_reverse] if activation is None
                   else activation(lin_output[:, index_permute_reverse]))
    self.params = [w]
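# Hypothetical numeric check (outside the class above) of the LU-style
# parameterization: with Lower forced to a unit diagonal,
# det(Upper . Lower) = prod(diag(Upper)), so the log|det| of the linear map
# is just sum(log|diag(Upper)|).
import numpy as np
import theano
import theano.tensor as T
import theano.tensor.extra_ops
import theano.tensor.nlinalg

w_b = T.dmatrix('w_b')
Upper = T.triu(w_b)
Lower = T.extra_ops.fill_diagonal(T.tril(w_b), 1.)
W = T.dot(Upper, Lower)
log_det = T.log(T.abs_(T.nlinalg.ExtractDiag()(Upper))).sum()
f = theano.function([w_b], [W, log_det])

W_val, ld = f(np.random.randn(5, 5))
sign, ref = np.linalg.slogdet(W_val)
assert np.isclose(ld, ref)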
def __init__(self, rng, input_m, input_S, n_in, n_out, inducing_number, Domain_number=None,
             liklihood="Gaussian", Domain_consideration=True, number="1", kernel_name='X'):
    m = input_m
    self.cal = input_m
    S_0 = input_S
    self.N = m.shape[0]
    D = n_out
    Q = n_in
    M = inducing_number

    # set initial values
    ker = kernel(Q, kernel_name)
    self.kern = ker
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    Z_value = np.random.randn(M, Q)
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number, borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((100, self.N, Q))
    eps_M = srng.normal((100, M, D))  # the mean and the variance need different random draws, so they are drawn separately
    eps_ND = srng.normal((100, self.N, D))

    self.beta = T.exp(self.ls)
    # The covariance of u is not diagonal, so we need a triangular factor,
    # built Cholesky-style from the unconstrained matrix Sigma_b.
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b))))

    # rescale by the kernel variance
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    # Xtilda = m[None, :, :] + S[None, :, :] * eps_NQ
    Xtilda, updates = theano.scan(fn=lambda a: m + S * a, sequences=[eps_NQ])

    # self.U = mu_scaled[None, :, :] + Sigma_scaled[None, :, :].dot(eps_M)
    self.U, updates = theano.scan(fn=lambda a: mu_scaled + Sigma_scaled.dot(a), sequences=[eps_M])

    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)

    Knn, updates = theano.scan(fn=lambda a: self.kern.RBF(a), sequences=[Xtilda])
    Kmn, updates = theano.scan(fn=lambda a: self.kern.RBF(self.Z, a), sequences=[Xtilda])
    # Kmn = ker.RBF(self.Z, Xtilda)
    # Knn = ker.RBF(Xtilda)

    Ktilda, updates = theano.scan(fn=lambda a, b: a - T.dot(b.T, T.dot(KmmInv, b)),
                                  sequences=[Knn, Kmn])
    # Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    F, updates = theano.scan(fn=lambda a, b, c, d: T.dot(a.T, T.dot(KmmInv, b))
                                                   + T.dot(T.maximum(c, 1e-16)**0.5, d),
                             sequences=[Kmn, self.U, Ktilda, eps_ND])
    # F = T.dot(Kmn.T, T.dot(KmmInv, self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5, eps_ND)

    # Kinterval = T.dot(KmmInv, Kmn)
    self.mean_U = F
    # mean_U = T.dot(Kinterval.T, self.U)
    # A = Kinterval.T
    # Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
    # mean_tilda = T.dot(A, mu_scaled)
    # self.mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)

    self.output = self.mean_U
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
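# A minimal, hypothetical sketch of the scan pattern above: theano.scan maps
# a per-sample function over the leading axis of the stacked noise draws,
# which for this elementwise case matches plain broadcasting.
import numpy as np
import theano
import theano.tensor as T

m = T.dmatrix('m')       # (N, Q) mean
S = T.dmatrix('S')       # (N, Q) standard deviation
eps = T.dtensor3('eps')  # (samples, N, Q) noise

scanned, _ = theano.scan(fn=lambda a: m + S * a, sequences=[eps])
broadcast = m[None, :, :] + S[None, :, :] * eps
f = theano.function([m, S, eps], [scanned, broadcast])

rng = np.random.RandomState(0)
out_scan, out_bcast = f(rng.randn(4, 2), np.abs(rng.randn(4, 2)), rng.randn(3, 4, 2))
assert np.allclose(out_scan, out_bcast)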
def __init__(self, D, M, Q, Domain_number, m, pre_params, Pre_U, Hiddenlayerdim1, Hiddenlayerdim2):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    # mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = m[:M]
    self.test = Z_value
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    # rng is assumed to be a RandomState defined at module scope
    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=Hiddenlayerdim1,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_hidden = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                          n_in=Hiddenlayerdim1, n_out=Hiddenlayerdim2,
                                          activation=T.nnet.relu, number='_h')
    self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_hidden.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    # self.params.extend(mmd.params)

    self.hyp_params = {}
    for i in [self.mu, self.Sigma_b, self.ls]:
        self.hyp_params[str(i)] = i

    self.Z_params = {}
    for i in [self.Z]:
        self.Z_params[str(i)] = i

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    self.params.extend(self.hiddenLayer_hidden.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    for i, j in pre_params.items():
        self.wrt[i].set_value(j)
    for i, j in Pre_U.items():
        self.wrt[i].set_value(j)

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    eps_M = srng.normal((M, D))  # the mean and the variance need different random draws, so they are drawn separately

    beta = T.exp(self.ls)
    # The covariance of u is not diagonal, so we need a triangular factor,
    # built Cholesky-style from the unconstrained matrix Sigma_b.
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b))))

    # rescale by the kernel variance
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    # Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    # Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    # Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, self.U)
    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = (self.log_mvn(self.X, mean_U, Covariance) - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def contaminate_mixture(data, fit_for='z', fit_data=None):
    # stick-breaking problems
    steps = []

    # shapes and sizes
    n_epochs = data['epoch_i'].max() + 1  # each epoch indexed by epoch_i
    n_raters = data['rater_i'].max() + 1
    n_obs = data.shape[0]                 # each spindle marker indexed by t

    # static prior variables
    trust_purcell = 0.1  # crank up to give more weight to Purcell et al., 2017
    purcell = np.array([0.3587, 0.6387, 0.0026, 0., 0., 0.]) + (1 - trust_purcell)
    s_number_prior = purcell / purcell.sum()
    max_s = len(s_number_prior) - 1
    gss_spindle_testvals = [1., 5., 10., 15., 20.]

    with pm.Model() as model:
        # True s
        gss = pm.Uniform('gss', lower=0., upper=25., shape=(n_epochs, max_s),
                         testval=np.tile(np.array(gss_spindle_testvals).T, reps=(n_epochs, 1)))

        # Real spindles
        gss_per_obs = gss[data['epoch_i'], :]

        # The number of spindles per epoch:
        if fit_for == 'z':
            gss_prior = pm.Dirichlet('gss_prior', a=s_number_prior)
            if n_epochs > 1:
                z = pm.Categorical('z', p=gss_prior, shape=n_epochs)
            else:
                z = pm.Categorical('z', p=gss_prior)
        else:
            z = fit_data['z']
        z_rs = z.reshape((n_epochs, 1))

        if fit_for in ['w', 'z']:  # when we are finding z or w
            # row z of this lower-triangular ones matrix puts equal mass on {0, ..., z}
            w_prior_possibilities = tt.tril(tt.ones((max_s + 1, max_s + 1)))
            w = pm.Categorical('w', p=w_prior_possibilities[z_rs[data['epoch_i'], 0], :],
                               shape=n_obs)
        else:  # fit for gss
            w = fit_data['w']

        # --- Raters' ability to detect markers --- #
        r_E = pm.Bound(pm.Normal, lower=0.)('r_E', mu=0.5, sd=0.5, shape=n_raters)
        r_E_per_obs = r_E[data['rater_i']]
        # r_E = pm.Bound(pm.Normal, lower=0.)('r_E', mu=0.5, sd=0.5)

        # --- Behaviour --- #
        contaminate_dist_s = pm.Uniform.dist(lower=0., upper=25., shape=n_obs)
        contaminate_dist_s.mean = 12.5
        possible_dists = [contaminate_dist_s]
        for i in range(0, 5):
            dist = pm.Normal.dist(mu=gss_per_obs[:, i], sd=r_E_per_obs)
            dist.mean = gss_spindle_testvals[i]
            possible_dists.append(dist)
        w_array = tt.extra_ops.to_one_hot(w, nb_class=max_s + 1)
        s = pm.Mixture('s', w=w_array, comp_dists=possible_dists, observed=data['s'])

        # STEP methods for vars:
        if fit_for == 'z':
            steps = [pm.CategoricalGibbsMetropolis([z, w]),
                     pm.NUTS([gss_prior, gss, r_E], target_accept=0.9)]
        if fit_for == 'w':
            steps = [pm.CategoricalGibbsMetropolis([w]),
                     pm.NUTS([gss, r_E], target_accept=0.9)]
        # else: everything is sampled with NUTS

    return model, steps
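# A hypothetical illustration (outside the model above) of the tril trick:
# row z of tt.tril(tt.ones((K, K))) is 1 on columns 0..z and 0 afterwards,
# so once normalised (PyMC3's Categorical normalises p internally) it acts
# as a uniform prior over w in {0, ..., z}.
import theano.tensor as tt

K = 4
print(tt.tril(tt.ones((K, K))).eval())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]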