def step(self, i_t, x_t, z_t, att_p, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    if self.recurrent_transform:
        state_vars = other_args[:len(self.recurrent_transform.state_vars)]
        self.recurrent_transform.set_sorted_state_vars(state_vars)
        z_r, r_updates = self.recurrent_transform.step(y_p)
        z_t += z_r
        for v in self.recurrent_transform.get_sorted_state_vars():
            other_outputs += [r_updates[v]]
    maxatt = att_p.repeat(z_t.shape[1]).reshape((z_t.shape[0], z_t.shape[1]))
    # only add the recurrent contribution where there is no attention
    z_t = T.switch(maxatt > 0, z_t, z_t + T.dot(y_p, self.W_re))
    # z_t += T.dot(y_p, self.W_re)
    partition = z_t.shape[1] // 4
    ingate = T.nnet.sigmoid(z_t[:, :partition])
    forgetgate = ((T.nnet.sigmoid(z_t[:, partition:2 * partition])).T * (1. - att_p)).T
    outgate = T.nnet.sigmoid(z_t[:, 2 * partition:3 * partition])
    input = T.tanh(z_t[:, 3 * partition:4 * partition])
    # c_t = ((forgetgate * c_p + ingate * input).T * (1. - T.max(att_p, axis=-1))).T
    c_t = forgetgate * c_p + ingate * input
    y_t = outgate * T.tanh(c_t)
    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
def step(self, i_t, x_t, z_t, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    if self.recurrent_transform:
        state_vars = other_args[:len(self.recurrent_transform.state_vars)]
        self.recurrent_transform.set_sorted_state_vars(state_vars)
        z_r, r_updates = self.recurrent_transform.step(y_p)
        z_t += z_r
        for v in self.recurrent_transform.get_sorted_state_vars():
            other_outputs += [r_updates[v]]
    z_t += T.dot(y_p, self.W_re)
    partition = z_t.shape[1] // 4  # number of units
    forgetgate = T.nnet.sigmoid(z_t[:, :partition])
    propgate = T.nnet.sigmoid(z_t[:, partition:2 * partition])
    diffgate = T.nnet.sigmoid(z_t[:, 2 * partition:3 * partition])
    input = T.tanh(z_t[:, 3 * partition:4 * partition])
    # c(t) = (1 - FG(t)) * IN(t) + FG(t) * c(t-1)
    c_t = (1 - forgetgate) * input + forgetgate * c_p
    # y(t) = tanh(PG(t) * c(t) + DG(t) * (c(t) - c(t-1)))
    # HINT: the additional nonlinearity may not have a significant effect
    y_t = T.tanh(propgate * c_t + diffgate * (c_t - c_p))
    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
def full(self, X, Xs=None):
    X, Xs = self._slice(X, Xs)
    scf_x = self.scaling_func(X, self.args)
    if Xs is None:
        return tt.outer(scf_x, scf_x) * self.cov_func(X)
    else:
        scf_xs = self.scaling_func(Xs, self.args)
        return tt.outer(scf_x, scf_xs) * self.cov_func(X, Xs)
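# For reference, full() above implements a scaling covariance: a base kernel
# multiplied on both sides by a deterministic scaling function, i.e.
#   k_scaled(x, x') = s(x) * k(x, x') * s(x')
# where s is self.scaling_func and k is self.cov_func; the tt.outer call
# builds the matrix of pairwise products s(x_i) * s(x'_j).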
def contrastive_divergence_1(self, v1):
    '''Determine the weight updates according to CD-1.'''
    h1 = self.sample_h_given_v(v1)
    v2 = self.sample_v_given_h(h1)
    h2p = self.propup(v2)
    return (T.outer(v1, h1) - T.outer(v2, h2p), v1 - v2, h1 - h2p)
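# In matrix form, the CD-1 statistics returned above are the usual update
# directions, with v1 the data vector, h1 a sampled hidden vector, v2 the
# reconstruction and h2p the reconstruction's hidden probabilities:
#   dW   ~ v1 h1^T - v2 h2p^T
#   db_v ~ v1 - v2
#   db_h ~ h1 - h2p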
def image_step_val(Imat, htm1mat, ctm1mat, Wcnn, Wxi, Whi, bi, Wxf, Whf, bf,
                   Wxc, Whc, bc, Wxo, Who, bo, Why, by, forbatch):
    xtmat = theano.dot(Imat, Wcnn)
    itmat = sigma(theano.dot(xtmat, Wxi) + theano.dot(htm1mat, Whi) + T.outer(forbatch, bi))
    ftmat = sigma(theano.dot(xtmat, Wxf) + theano.dot(htm1mat, Whf) + T.outer(forbatch, bf))
    ctmat = ftmat * ctm1mat + itmat * act(theano.dot(xtmat, Wxc) + theano.dot(htm1mat, Whc) + T.outer(forbatch, bc))
    otmat = sigma(theano.dot(xtmat, Wxo) + theano.dot(htm1mat, Who) + T.outer(forbatch, bo))
    htmat = otmat * act(ctmat)
    # yt = T.concatenate([addzero, tempyt], axis=0)
    return htmat, ctmat
def psb(inverse_hessian, weight_delta, gradient_delta, **options):
    gradient_delta_t = gradient_delta.T
    param = weight_delta - inverse_hessian.dot(gradient_delta)
    divider = 1. / T.dot(gradient_delta, gradient_delta)
    param1 = T.outer(param, gradient_delta) + T.outer(gradient_delta, param)
    param2 = (
        T.dot(gradient_delta, param) *
        T.outer(gradient_delta, gradient_delta_t)
    )
    return inverse_hessian + param1 * divider - param2 * divider ** 2
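# A minimal sketch of how psb() could be compiled and evaluated on concrete
# arrays (illustrative only; it assumes theano and numpy are installed and
# that psb is in scope):
import numpy as np
import theano
import theano.tensor as T

H = T.matrix('inverse_hessian')
s = T.vector('weight_delta')
g = T.vector('gradient_delta')
psb_step = theano.function([H, s, g], psb(H, s, g),
                           allow_input_downcast=True)

H_next = psb_step(np.eye(2), np.array([0.1, -0.2]), np.array([0.3, 0.05]))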
def train():
    train_set, valid_set, test_set = loadData()
    x, y = train_set
    m, n_input = x.shape
    width = 28
    height = 28
    n_hidden = 49
    learning_rate = .1

    # set up shared variables
    W = theano.shared(numpy.random.uniform(-4 * numpy.sqrt(6. / (n_hidden + n_input)),
                                           4 * numpy.sqrt(6. / (n_hidden + n_input)),
                                           (n_hidden, width * height)), name="W")
    b_v = theano.shared(numpy.zeros((width * height,)), name="b_v")
    b_h = theano.shared(numpy.zeros((n_hidden,)), name="b_h")
    theano_rng = T.shared_randomstreams.RandomStreams(numpy.random.randint(2 ** 30))
    v_input = T.fvector("v_input")

    # 1. sample hidden units
    h_prob = T.nnet.sigmoid(T.dot(v_input, W.T) + b_h)
    h_sample = theano_rng.binomial(size=(n_hidden,), n=1, p=h_prob)
    # 2. calculate positive gradient
    g_p = T.outer(v_input, h_sample)
    # 3. make reconstruction
    v_prob_reconstruction = T.nnet.sigmoid(T.dot(h_sample, W) + b_v)
    v_reconstruction = theano_rng.binomial(size=(n_input,), n=1, p=v_prob_reconstruction)
    h_prob_reconstruction = T.nnet.sigmoid(T.dot(v_reconstruction, W.T) + b_h)
    h_reconstruction = theano_rng.binomial(size=(n_hidden,), n=1, p=h_prob_reconstruction)
    # 4. calculate negative gradient
    g_n = T.outer(v_reconstruction, h_reconstruction)

    # FUNCTIONS FOR TESTING
    # f_h_prob = theano.function(inputs=[v_input], outputs=[h_prob])
    # f_h_sample = theano.function(inputs=[v_input], outputs=[h_sample])
    # f_g_p = theano.function(inputs=[v_input], outputs=[g_p])
    # f_v_prob_reconstruction = theano.function(inputs=[v_input], outputs=[v_prob_reconstruction])
    # f_v_reconstruction = theano.function(inputs=[v_input], outputs=[v_reconstruction])
    # f_h_prob_reconstruction = theano.function(inputs=[v_input], outputs=[h_prob_reconstruction])
    # f_h_reconstruction = theano.function(inputs=[v_input], outputs=[h_reconstruction])
    # f_g_n = theano.function(inputs=[v_input], outputs=[g_n])

    learn = theano.function(inputs=[v_input],
                            updates=[(W, W + learning_rate * (g_p - g_n).T)])

    for i in range(300001):
        if i > 0 and i % 10000 == 0:
            print "Epoch: ", i
            display_weights(W, width, height, i)
        learn(x[i % m, :])

    with open('weights.pkl', 'wb') as output:
        pickle.dump(W.get_value(), output, pickle.HIGHEST_PROTOCOL)
def times_reflection(input, n_hidden, reflection):
    input_re = input[:, :n_hidden]
    input_im = input[:, n_hidden:]
    reflect_re = reflection[n_hidden:]
    reflect_im = reflection[:n_hidden]

    vstarv = (reflect_re**2 + reflect_im**2).sum()
    input_re_reflect = input_re - 2 / vstarv * (
        T.outer(T.dot(input_re, reflect_re), reflect_re)
        + T.outer(T.dot(input_im, reflect_im), reflect_im))
    input_im_reflect = input_im - 2 / vstarv * (
        -T.outer(T.dot(input_re, reflect_im), reflect_im)
        + T.outer(T.dot(input_im, reflect_re), reflect_re))

    return T.concatenate([input_re_reflect, input_im_reflect], axis=1)
def _step(x_t, i_t, c_tm1, y_tm1):
    # z_t = T.dot(x_t, W) + T.dot(y_tm1, V_h) + b
    z_t = x_t + T.dot(y_tm1, V_h)
    partition = z_t.shape[1] // 4
    ingate = T.nnet.sigmoid(z_t[:, :partition])
    forgetgate = T.nnet.sigmoid(z_t[:, partition:2 * partition])
    outgate = T.nnet.sigmoid(z_t[:, 2 * partition:3 * partition])
    input = T.tanh(z_t[:, 3 * partition:4 * partition])
    c_t = forgetgate * c_tm1 + ingate * input
    y_t = outgate * T.tanh(c_t)
    i_output = T.outer(i_t, o_output)
    i_h = T.outer(i_t, o_h)
    return c_t * i_h + c_tm1 * (1 - i_h), y_t * i_output
def learningstep_m1(self, Y, L, M, W, epsilon):
    """Perform a single learning step.

    This is a faster learning step for the case of
    mini-batch-size = 1.

    Keyword arguments:
    the keyword arguments must be the same as given in
    self.input_parameters(mode) for mode='train'.
    """
    # input integration:
    I = T.dot(T.log(W), Y)
    # recurrent term:
    vM = theano.ifelse.ifelse(
        T.eq(L, -1),  # if no label is provided
        T.sum(M, axis=0),
        M[L, :])
    # numeric trick to prevent overflow in the exp-function:
    max_exponent = 88. - T.log(I.shape[0]).astype('float32')
    scale = theano.ifelse.ifelse(T.gt(I[T.argmax(I)], max_exponent),
                                 I[T.argmax(I)] - max_exponent, 0.)
    # activation: recurrent softmax with overflow protection
    s = vM * T.exp(I - scale) / T.sum(vM * T.exp(I - scale))
    s.name = 's_%d.%d[t]' % (self._nmultilayer, self._nlayer)
    # weight update
    W_new = W + epsilon * (T.outer(s, Y) - s[:, np.newaxis] * W)
    W_new.name = 'W_%d.%d[t]' % (self._nmultilayer, self._nlayer)
    return s, W_new
def one_iter(W_i, V_i, b_i, a, v_lt_i, p_lt_i, log_likelihood):
    h_i = self.sigmoid(a)
    p_i = self.sigmoid(T.dot(h_i, V_i) + b_i)
    v_i = 1. * (theano_rng.uniform([num_samples]) <= p_i)
    log_likelihood += v_i * T.log(p_i) + (1 - v_i) * T.log(1 - p_i)
    a += T.outer(v_i, W_i)
    return a, v_i, p_i, log_likelihood
def free_energy(self, visible):
    r"""Computes the free energy of the model.

    :type visible: theano.tensor.TensorType
    :param visible: The state of the visible units (either 1/0, or mean -
        not important).

    :rtype: theano.tensor.var.TensorVariable
    :returns: The free energy of the model, given the visible activation.
        Computed as

        .. math::
           :label: free_energy

           \mathcal{F}(x) = - \log \sum_h e^{-E(x,h)}
    """
    print 'Running free energy.'
    D = TT.sum(visible, axis=1)
    # D is the document length; the hidden bias is scaled by it.
    exponent_term = TT.dot(visible, self.W) + TT.outer(D, self.b_hidden)
    # This is the crucial difference between an RBM and an RSM:
    # multiplying the hidden bias by the "document length".
    hidden_term = TT.sum(TT.log(1 + TT.exp(exponent_term)), axis=1)
    b_visible_term = TT.dot(visible, self.b_visible)
    free_energy = - hidden_term - b_visible_term
    return free_energy
def compute_probabilistic_matrix(self, X, y, num_cases, k=5):
    z = T.dot(X, self.A)  # transform x into z space
    dists = T.sqr(dist2hy(z, z))
    dists = T.extra_ops.fill_diagonal(dists, T.max(dists) + 1)
    nv = T.min(dists, axis=1)  # value of nearest neighbour
    dists = (dists.T - nv).T
    d = T.extra_ops.fill_diagonal(dists, 0)

    # take only the k nearest
    num = T.zeros((num_cases, self.num_classes))
    denom = T.zeros((num_cases,))
    for c_i in xrange(self.num_classes):
        # mask for class i
        mask_i = T.eq(T.outer(T.ones_like(y), y), c_i)
        # k nearest neighbours within class i
        dim_ci = T.sum(mask_i[0])
        d_c_i = T.reshape(d[mask_i.nonzero()], (num_cases, dim_ci))
        k_indice = T.argsort(d_c_i, axis=1)[:, 0:k]
        kd = T.zeros((num_cases, k))
        for it in xrange(k):
            kd = T.set_subtensor(kd[:, it], d_c_i[T.arange(num_cases), k_indice[:, it]])
        # numerator
        value = T.exp(-T.mean(kd, axis=1))
        num = T.set_subtensor(num[:, c_i], value)
        denom += value

    # probability that point i will be correctly classified
    p = num / denom.dimshuffle(0, 'x')
    return p
def kron(a, b):
    """Kronecker product.

    Same as scipy.linalg.kron(a, b).

    :note: numpy.kron(a, b) != scipy.linalg.kron(a, b)!
        They don't have the same shape and order when
        a.ndim != b.ndim != 2.

    :param a: array_like
    :param b: array_like
    :return: array_like with a.ndim + b.ndim - 2 dimensions.
    """
    a = tensor.as_tensor_variable(a)
    b = tensor.as_tensor_variable(b)
    if a.ndim + b.ndim <= 2:
        raise TypeError('kron: inputs dimensions must sum to 3 or more. '
                        'You passed %d and %d.' % (a.ndim, b.ndim))
    o = tensor.outer(a, b)
    o = o.reshape(tensor.concatenate((a.shape, b.shape)), a.ndim + b.ndim)
    shf = o.dimshuffle(0, 2, 1, *range(3, o.ndim))
    if shf.ndim == 3:
        shf = o.dimshuffle(1, 0, 2)
        o = shf.flatten()
    else:
        o = shf.reshape((o.shape[0] * o.shape[2],
                         o.shape[1] * o.shape[3]) +
                        tuple([o.shape[i] for i in range(4, o.ndim)]))
    return o
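# A quick self-check of kron() against scipy (a sketch; it assumes the kron
# defined above, plus theano, numpy and scipy, are all importable):
import numpy as np
import scipy.linalg
import theano
from theano import tensor

a = tensor.dmatrix('a')
b = tensor.dmatrix('b')
kron_fn = theano.function([a, b], kron(a, b))

av = np.arange(4.).reshape(2, 2)
bv = np.arange(6.).reshape(2, 3)
assert np.allclose(kron_fn(av, bv), scipy.linalg.kron(av, bv))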
def __init__(self, C, D, use_unlabeled):
    self.W = theano.shared(np.ones((C, D), dtype='float32'))
    t_eps = T.scalar('epsilon', dtype='float32')
    t_Y = T.vector('Y', dtype='float32')
    t_s = T.vector('s', dtype='float32')
    self.activation_unlabeled = theano.function(
        [t_Y],
        T.sum(t_Y * self.W / T.sum(self.W, axis=0), axis=1),
        allow_input_downcast=True)
    self.activation_normalization = theano.function(
        [t_s],
        t_s / T.sum(t_s),
        allow_input_downcast=True)
    self.weight_update = theano.function(
        [t_Y, t_s, t_eps],
        self.W,
        updates={self.W: self.W + t_eps * (T.outer(t_s, t_Y) - t_s[:, np.newaxis] * self.W)},
        allow_input_downcast=True)
    self.epsilon = None
    self._Y = None
    self._s = None
    self._delta = np.eye(C, dtype='float32')
    self._C = C
    self._use_unlabeled = use_unlabeled
    self._skipupdate = False
def compute_psi1(lls, lsf, xmean, xvar, z):
    if xmean.ndim == 1:
        xmean = xmean[None, :]

    ls = T.exp(lls)
    sf = T.exp(lsf)
    lspxvar = ls + xvar
    constterm1 = ls / lspxvar
    constterm2 = T.prod(T.sqrt(constterm1), 1)
    r2_psi1 = T.outer(T.sum(xmean * xmean / lspxvar, 1),
                      T.ones_like(z[:, 0:1])) \
        - np.float32(2) * T.dot(xmean / lspxvar, T.transpose(z)) \
        + T.dot(np.float32(1.0) / lspxvar, T.transpose(z)**2)
    psi1 = sf * T.outer(constterm2, T.ones_like(z[:, 0:1])) * T.exp(-np.float32(0.5) * r2_psi1)
    return psi1
def forward(self):
    z = self.z0  # sxd
    u = self.u_  # d
    w = self.w_  # d
    b = self.b   # .
    h = self.h   # f
    # h(sxd \dot d + .) = s
    if not self.batched:
        hwz = h(z.dot(w) + b)  # s
        # sxd + (s \outer d) = sxd
        z1 = z + tt.outer(hwz, u)  # sxd
        return z1
    else:
        z = z.swapaxes(0, 1)  # z: bxsxd, u: bxd, w: bxd
        b = b.dimshuffle(0, 'x')  # b: bx-
        hwz = h(tt.batched_dot(z, w) + b)  # bxs
        # bxsxd + (bxsx- * bx-xd) = bxsxd
        hwz = hwz.dimshuffle(0, 1, 'x')  # bxsx-
        u = u.dimshuffle(0, 'x', 1)      # bx-xd
        z1 = z + hwz * u                 # bxsxd
        return z1.swapaxes(0, 1)         # sxbxd
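# The non-batched branch above is the planar normalizing-flow transform,
# applied row-wise to z (shape s x d):
#   f(z) = z + u * h(w^T z + b)
# with u and w vectors of length d, scalar b, and elementwise nonlinearity h.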
def __init__(self, C, D):
    self.W = theano.shared(np.ones((C, D), dtype='float32'))
    t_M = T.matrix('M', dtype='float32')
    t_vM = T.vector('M', dtype='float32')
    t_Y = T.vector('Y', dtype='float32')
    t_I = T.vector('I', dtype='float32')
    t_s = T.vector('s', dtype='float32')
    t_eps = T.scalar('epsilon', dtype='float32')
    self.input_integration = theano.function(
        [t_Y],
        T.dot(T.log(self.W), t_Y),
        allow_input_downcast=True)
    self.M_summation = theano.function(
        [t_M],
        T.sum(t_M, axis=0),
        allow_input_downcast=True)
    self.recurrent_softmax = theano.function(
        [t_I, t_vM],
        t_vM * T.exp(t_I) / T.sum(t_vM * T.exp(t_I)),
        allow_input_downcast=True)
    self.weight_update = theano.function(
        [t_Y, t_s, t_eps],
        self.W,
        updates={self.W: self.W + t_eps * (T.outer(t_s, t_Y) - t_s[:, np.newaxis] * self.W)},
        allow_input_downcast=True)
    self.epsilon = None
    self._Y = None
    self._s = None
def one_iter(Wi, Vi, bi, rand_i, a, vis_i, post):
    hid = self.sigmoid(a)
    pi = self.sigmoid(T.dot(hid, Vi) + bi)
    vis_i = T.cast(rand_i <= pi, floatX)
    post = post + T.log(pi * vis_i + (1 - pi) * (1 - vis_i))
    a = a + T.outer(vis_i, Wi)
    return a, vis_i, post
def get_square_norm_gradients_scan(D_by_layer, cost, accum=0):
    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated square-norm
    # of the total gradient.
    # If you take the element-wise square-root afterwards, you will get
    # the associated 2-norms, which is what you want for importance sampling.
    for (layer_name, D) in D_by_layer.items():
        backprop_output = tensor.grad(cost, D['output'])
        if 'weight' in D:
            A = D['input']
            B = backprop_output
            S, _ = theano.scan(fn=lambda A, B: tensor.sqr(tensor.outer(A, B)).sum(),
                               sequences=[A, B])
            accum = accum + S
        if 'bias' in D:
            B = backprop_output
            S, _ = theano.scan(fn=lambda B: tensor.sqr(B).sum(),
                               sequences=[B])
            accum = accum + S
    return accum
def grad(self, inputs, output_gradients):
    """
    Reverse-mode gradient updates for matrix solve operation c = A \\ b.

    Symbolic expression for updates taken from [1]_.

    References
    ----------
    .. [1] M. B. Giles, "An extended collection of matrix derivative results
           for forward and reverse mode automatic differentiation",
           http://eprints.maths.ox.ac.uk/1079/
    """
    A, b = inputs
    c = self(A, b)
    c_bar = output_gradients[0]
    trans_map = {
        'lower_triangular': 'upper_triangular',
        'upper_triangular': 'lower_triangular'
    }
    trans_solve_op = Solve(
        # update A_structure and lower to account for a transpose operation
        A_structure=trans_map.get(self.A_structure, self.A_structure),
        lower=not self.lower
    )
    b_bar = trans_solve_op(A.T, c_bar)
    # force outer product if vector second input
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    if self.A_structure == 'lower_triangular':
        A_bar = tensor.tril(A_bar)
    elif self.A_structure == 'upper_triangular':
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    ident_matrix = T.eye(inverse_hessian.shape[0])
    maxrho = asfloat(maxrho)
    rho = asfloat(1.) / gradient_delta.dot(weight_delta)
    rho = ifelse(
        T.isinf(rho),
        maxrho * T.sgn(rho),
        rho,
    )
    param1 = ident_matrix - T.outer(weight_delta, gradient_delta) * rho
    param2 = ident_matrix - T.outer(gradient_delta, weight_delta) * rho
    param3 = rho * T.outer(weight_delta, weight_delta)
    return param1.dot(inverse_hessian).dot(param2) + param3
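# For reference, bfgs() above is the standard BFGS inverse-Hessian update;
# writing s = weight_delta, y = gradient_delta and rho = 1 / (y^T s):
#   H_new = (I - rho * s y^T) H (I - rho * y s^T) + rho * s s^T
# (the ifelse only guards against rho overflowing to +/-inf when y^T s ~ 0).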
def dfp(inverse_hessian, weight_delta, gradient_delta, maxnum=1e5):
    maxnum = asfloat(maxnum)
    quasi_dot_gradient = inverse_hessian.dot(gradient_delta)
    param1 = (
        T.outer(weight_delta, weight_delta)
    ) / (
        T.dot(gradient_delta, weight_delta)
    )
    param2_numerator = T.clip(
        T.outer(quasi_dot_gradient, gradient_delta) * inverse_hessian,
        -maxnum, maxnum
    )
    param2_denominator = gradient_delta.dot(quasi_dot_gradient)
    param2 = param2_numerator / param2_denominator
    return inverse_hessian + param1 - param2
def added_part_f(sen_part=T.matrix("sen_part")):
    inter_sen_part0 = T.zeros_like(T.outer(sen_part[0], sen_part[1]))
    inter_sen_part, updates = theano.scan(
        fn=inter_accu,
        sequences=dict(input=sen_part, taps=[-1, 0]),
        outputs_info=dict(initial=inter_sen_part0, taps=[-1]))
    added_part = T.dot(inter_sen_part[-1], 1.0 / (sen_part.shape[0] - 1))
    added_part = added_part[0:pca_dim]
    new_sen_part = T.concatenate([sen_part, added_part], axis=0)
    return new_sen_part
def GradientForOneObject(sample, dream, h_lids, vBias, hBias):
    energy = regularization * self.bm.cleverAddingToFreeEnergy(sample, self.W, vBias, hBias) + \
        self.bm.freeEnergy(sample, self.W, vBias, hBias) - \
        self.bm.freeEnergy(dream, self.W, vBias, hBias)
    grad = theano.grad(energy, [self.W, vBias, hBias],
                       consider_constant=[sample, dream])
    gradUByW1 = T.outer(grad[1], h_lids)
    gradUByW2 = T.outer(grad[2], h_lids)
    gradUByhBias = grad[2]
    gradUByvBias = grad[1]
    gradUByW = grad[0]
    gradHLid0 = h_lids
    return [energy, gradHLid0, gradUByW1, gradUByW2, gradUByhBias, gradUByvBias, gradUByW]
def compute_kernel(lls, lsf, x, z):
    ls = T.exp(lls)
    sf = T.exp(lsf)

    if x.ndim == 1:
        x = x[None, :]
    if z.ndim == 1:
        z = z[None, :]

    lsre = T.outer(T.ones_like(x[:, 0]), ls)
    r2 = T.outer(T.sum(x * x / lsre, 1), T.ones_like(z[:, 0:1])) \
        - np.float32(2) * T.dot(x / lsre, T.transpose(z)) \
        + T.dot(np.float32(1.0) / lsre, T.transpose(z)**2)
    k = sf * T.exp(-np.float32(0.5) * r2)
    return k
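# For reference, compute_kernel() evaluates an ARD squared-exponential
# kernel, with lls the per-dimension log lengthscale parameters and lsf the
# log signal variance:
#   k(x, z) = exp(lsf) * exp(-0.5 * sum_d (x_d - z_d)^2 / exp(lls_d))
# r2 expands the scaled squared distance as x^2 - 2*x*z + z^2, each term
# divided by exp(lls).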
def bp_mll(pred, target):
    # From: Multi-Label Neural Networks with Applications to
    # Functional Genomics and Text Categorization
    # https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/tkde06a.pdf
    y_i = pred * target
    not_y_i = pred * (1 - target)
    matrices, updates = theano.scan(fn=lambda p, t: T.outer(p, t),
                                    sequences=[y_i, not_y_i])
    cost = matrices.sum(axis=(1, 2))
    return cost, updates
def lstm(z, i_t, s_p, h_p):
    z += T.dot(h_p, self.N_re)
    i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
    ingate = T.nnet.sigmoid(z[:, n_out:2 * n_out])
    forgetgate = T.nnet.sigmoid(z[:, 2 * n_out:3 * n_out])
    outgate = T.nnet.sigmoid(z[:, 3 * n_out:])
    input = T.tanh(z[:, :n_out])
    s_t = input * ingate + s_p * forgetgate
    h_t = T.tanh(s_t) * outgate
    return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
def get_weight(self, encOutputs1, encMask1, encOutput2):
    e = T.alloc(1, encOutputs1.shape[0])
    tiledEncOutput2 = T.outer(e, encOutput2.flatten()).reshape(
        [e.shape[0], encOutput2.shape[0], encOutput2.shape[1]])
    attInput = T.concatenate([encOutputs1, tiledEncOutput2], axis=2)
    A = self.h2.get_output(self.h1.get_output(attInput))[:, :, 0]
    maskedExpA = T.exp(A) * encMask1
    weight = maskedExpA / T.sum(maskedExpA, axis=0)
    if self.sharpen is not None:
        weight = self.sharpen(weight) / T.sum(self.sharpen(weight), axis=0)
    return weight
def L_op(self, inputs, outputs, output_gradients):
    # Modified from theano/tensor/slinalg.py
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]
    # FIXME: triangular structure would use GpuCublasTriangularsolve?
    # no need to handle A_structure like slinalg.py?
    trans_solve_op = GpuCusolverSolve('general')
    b_bar = trans_solve_op(A.T, c_bar)
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    return [A_bar, b_bar]
def compute_output(self):
    # We compute the output mean
    self.Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    self.KzzInv = T.nlinalg.MatrixInversePSD()(self.Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    self.covCavityInv = self.KzzInv + LLt * casting(
        self.n_points - self.set_for_training) / casting(self.n_points)
    self.covCavity = T.nlinalg.MatrixInversePSD()(self.covCavityInv)
    self.meanCavity = T.dot(
        self.covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    self.KzzInvcovCavity = T.dot(self.KzzInv, self.covCavity)
    self.KzzInvmeanCavity = T.dot(self.KzzInv, self.meanCavity)
    self.covPosteriorInv = self.KzzInv + LLt
    self.covPosterior = T.nlinalg.MatrixInversePSD()(self.covPosteriorInv)
    self.meanPosterior = T.dot(self.covPosterior, self.mParamPost)
    self.Kxz = compute_kernel(self.lls, self.lsf, self.input_means, self.z)
    self.B = T.dot(self.KzzInvcovCavity, self.KzzInv) - self.KzzInv
    v_out = T.exp(self.lsf) + T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                                    T.ones_like(self.z[:, 0:1]))

    if self.ignore_variances:
        self.output_means = T.dot(self.Kxz, self.KzzInvmeanCavity)
        self.output_vars = abs(v_out) + casting(0) * T.sum(self.input_vars)
    else:
        self.EKxz = compute_psi1(self.lls, self.lsf, self.input_means,
                                 self.input_vars, self.z)
        self.output_means = T.dot(self.EKxz, self.KzzInvmeanCavity)
        # In other layers we have to compute the expected variance
        self.B2 = T.outer(T.dot(self.KzzInv, self.meanCavity),
                          T.dot(self.KzzInv, self.meanCavity))
        exact_output_vars = True
        if exact_output_vars:
            # We compute the exact output variance
            self.psi2 = compute_psi2(self.lls, self.lsf, self.z,
                                     self.input_means, self.input_vars)
            ll = T.transpose(self.EKxz[:, None, :] * self.EKxz[:, :, None], [1, 2, 0])
            kk = T.transpose(self.Kxz[:, None, :] * self.Kxz[:, :, None], [1, 2, 0])
            v1 = T.transpose(T.sum(T.sum(
                T.shape_padaxis(self.B2, 2) * (self.psi2 - ll), 0), 0, keepdims=True))
            v2 = T.transpose(T.sum(T.sum(
                T.shape_padaxis(self.B, 2) * (self.psi2 - kk), 0), 0, keepdims=True))
        else:
            # We compute the approximate output variance using the unscented
            # Kalman filter
            v1 = 0
            v2 = 0
            n = self.input_d
            for j in range(1, n + 1):
                mask = T.zeros_like(self.input_vars)
                mask = T.set_subtensor(mask[:, j - 1], 1)
                inc = mask * T.sqrt(casting(n) * self.input_vars)
                self.kplus = T.sqrt(casting(1.0) / casting(2 * n)) * compute_kernel(
                    self.lls, self.lsf, self.input_means + inc, self.z)
                self.kminus = T.sqrt(casting(1.0) / casting(2 * n)) * compute_kernel(
                    self.lls, self.lsf, self.input_means - inc, self.z)
                v1 += T.dot(self.kplus * T.dot(self.kplus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v1 += T.dot(self.kminus * T.dot(self.kminus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kplus * T.dot(self.kplus, self.B),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kminus * T.dot(self.kminus, self.B),
                            T.ones_like(self.z[:, 0:1]))
            v1 -= T.dot(self.EKxz * T.dot(self.EKxz, self.B2),
                        T.ones_like(self.z[:, 0:1]))
            v2 -= T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                        T.ones_like(self.z[:, 0:1]))
        self.output_vars = abs(v_out) + abs(v2) + abs(v1)

    self.output_vars = self.output_vars + T.exp(self.lvar_noise)
    return
def main():
    # Load mastectomy dataset
    df = datasets.get_rdataset('mastectomy', 'HSAUR', cache=True).data
    # Change event to integer
    df.event = df.event.astype(np.int64)
    # Change metastized to integer (1 for yes, 0 for no)
    df.metastized = (df.metastized == 'yes').astype(np.int64)
    # Count the number of patients
    n_patients = df.shape[0]
    # Create array for each individual patient
    patients = np.arange(n_patients)

    # Censoring - we do not observe the death of every subject, and subjects
    # may still be alive at time t=0
    # 1 - observation is not censored (death was observed)
    # 0 - observation is censored (death was not observed)
    nonCensored = df.event.mean()

    # Create censoring plot
    fig, ax = plt.subplots(figsize=(8, 6))
    blue, _, red = sns.color_palette()[:3]
    # Create horizontal lines for censored observations
    ax.hlines(patients[df.event.values == 0], 0,
              df[df.event.values == 0].time,
              color=blue, label='Censored')
    # Create horizontal red lines for uncensored observations
    ax.hlines(patients[df.event.values == 1], 0,
              df[df.event.values == 1].time,
              color=red, label='Uncensored')
    # Create scatter points for metastized months
    ax.scatter(df[df.metastized.values == 1].time,
               patients[df.metastized.values == 1],
               color='k', zorder=10, label='Metastized')
    ax.set_xlim(left=0)
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([])
    ax.set_ylabel('Subject')
    ax.set_ylim(-0.25, n_patients + 0.25)
    ax.legend(loc='center right')

    # To understand the impact of metastization on survival time, we use a
    # risk regression model: the Cox proportional hazards model
    # Make intervals 3 months long
    interval_length = 3
    interval_bounds = np.arange(0, df.time.max() + interval_length + 1, interval_length)
    n_intervals = interval_bounds.size - 1
    intervals = np.arange(n_intervals)

    # Check how deaths and censored observations are distributed in intervals
    fig, ax = plt.subplots(figsize=(8, 6))
    # Plot histogram of uncensored events
    ax.hist(df[df.event == 1].time.values, bins=interval_bounds,
            color=red, alpha=0.5, lw=0, label='Uncensored')
    # Plot histogram of censored events
    ax.hist(df[df.event == 0].time.values, bins=interval_bounds,
            color=blue, alpha=0.5, lw=0, label='Censored')
    ax.set_xlim(0, interval_bounds[-1])
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([0, 1, 2, 3])
    ax.set_ylabel('Number of observations')
    ax.legend()

    # Calculate the last interval period when a subject was alive
    last_period = np.floor((df.time - 0.01) / interval_length).astype(int)
    # Create an empty matrix to store deaths
    death = np.zeros((n_patients, n_intervals))
    # For each patient (row), create an event where the last interval period
    # was observed (column)
    death[patients, last_period] = df.event
    # Create matrix of the amount of time a subject (row) was at risk in an
    # interval (column)
    exposure = np.greater_equal.outer(df.time, interval_bounds[:-1]) * interval_length
    exposure[patients, last_period] = df.time - interval_bounds[last_period]

    # Define parameters for PyMC
    SEED = 5078864
    n_samples = 1000
    n_tune = 1000

    # Create PyMC model -> lambda(t) = lambda0(t) * e^(X*beta)
    with pm.Model() as model:
        # Define prior distribution of hazards as vague Gamma distribution
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        # Define hazard regression coefficients (beta) for covariates X as a
        # normal distribution
        beta = pm.Normal('beta', 0, sd=1000)
        # Create equation for lambda(t) as a deterministic node - record
        # sampled values as part of output
        # T.outer = symbolic matrix, vector-vector outer product
        lambda_ = pm.Deterministic(
            'lambda_', T.outer(T.exp(beta * df.metastized), lambda0))
        # Mu is created from our lambda values (hazard) times patient
        # exposure per interval
        mu = pm.Deterministic('mu', exposure * lambda_)
        # We model the observed deaths as a Poisson distribution with mean mu
        obs = pm.Poisson('obs', mu, observed=death)

    with model:
        trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED)

    pm.traceplot(trace)
    # Calculate hazard rate for subjects with metastized cancer (based on
    # regression coefficients)
    hazardRate = np.exp(trace['beta'].mean())
    pm.plot_posterior(trace, varnames=['beta'], color='#87ceeb')
    pm.autocorrplot(trace, varnames=['beta'])

    # Store base hazard as well as metastized hazard for each sample per
    # interval (sample x number of intervals)
    base_hazard = trace['lambda0']
    met_hazard = trace['lambda0'] * np.exp(np.atleast_2d(trace['beta']).T)

    # Calculate cumulative hazard
    def cum_hazard(hazard):
        return (interval_length * hazard).cumsum(axis=-1)

    # Calculate survival as e^(-cumulative hazard)
    def survival(hazard):
        return np.exp(-cum_hazard(hazard))

    # Plot highest posterior density
    def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05):
        # Use function f on hazard mean
        mean = f(hazard.mean(axis=0))
        # Create confidence percentiles
        percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
        hpd = np.percentile(f(hazard), percentiles, axis=0)
        ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
        ax.step(x, mean, color=color, label=label)

    # Create figure
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True,
                                             sharey=False, figsize=(16, 6))
    # Plot hazard with HPD up until the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1], base_hazard, cum_hazard,
                  hazard_ax, color=blue, label='Had not metastized')
    # Plot hazard with HPD up until the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1], met_hazard, cum_hazard,
                  hazard_ax, color=red, label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    # Plot survival with HPD up until the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1], base_hazard, survival, surv_ax, color=blue)
    # Plot survival with HPD up until the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1], met_hazard, survival, surv_ax, color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model')

    # Consider time varying effects
    with pm.Model() as time_varying_model:
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        # Beta is now modeled as a normal random walk instead of a normal
        # distribution, since the regression coefficients can vary over time
        beta = GaussianRandomWalk('beta', tau=1., shape=n_intervals)
        lambda_ = pm.Deterministic(
            'h', lambda0 * T.exp(T.outer(T.constant(df.metastized), beta)))
        mu = pm.Deterministic('mu', exposure * lambda_)
        obs = pm.Poisson('obs', mu, observed=death)

    with time_varying_model:
        time_varying_trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED)

    pm.traceplot(time_varying_trace)
    pm.plot_posterior(time_varying_trace, varnames=['beta'], color='#87ceeb')
    pm.forestplot(time_varying_trace, varnames=['beta'])

    # Create plot to show the mean trace of beta
    fig, ax = plt.subplots(figsize=(8, 6))
    # Create percentiles of the new trace
    beta_hpd = np.percentile(time_varying_trace['beta'], [2.5, 97.5], axis=0)
    beta_low = beta_hpd[0]
    beta_high = beta_hpd[1]
    # Fill percentile interval
    ax.fill_between(interval_bounds[:-1], beta_low, beta_high,
                    color=blue, alpha=0.25)
    # Create the mean estimate for beta from trace samples
    beta_hat = time_varying_trace['beta'].mean(axis=0)
    # Plot a stepwise line for beta_hat per interval
    ax.step(interval_bounds[:-1], beta_hat, color=blue)
    # Plot points where cancer was metastized, differentiating between death
    # and censorship
    ax.scatter(interval_bounds[last_period[(df.event.values == 1) & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 1) & (df.metastized == 1)]],
               c=red, zorder=10, label='Died, cancer metastized')
    ax.scatter(interval_bounds[last_period[(df.event.values == 0) & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 0) & (df.metastized == 1)]],
               c=blue, zorder=10, label='Censored, cancer metastized')
    ax.set_xlim(0, df.time.max())
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylabel(r'$\beta_j$')
    ax.legend()

    # Store time-varying model hazards
    tv_base_hazard = time_varying_trace['lambda0']
    tv_met_hazard = time_varying_trace['lambda0'] * np.exp(
        np.atleast_2d(time_varying_trace['beta']))

    # Plot cumulative hazard functions with and without time-varying effect
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.step(interval_bounds[:-1], cum_hazard(base_hazard.mean(axis=0)),
            color=blue, label='Had not metastized')
    ax.step(interval_bounds[:-1], cum_hazard(met_hazard.mean(axis=0)),
            color=red, label='Metastized')
    ax.step(interval_bounds[:-1], cum_hazard(tv_base_hazard.mean(axis=0)),
            color=blue, linestyle='--',
            label='Had not metastized (time varying effect)')
    ax.step(interval_bounds[:-1], cum_hazard(tv_met_hazard.mean(axis=0)),
            color=red, linestyle='--',
            label='Metastized (time varying effect)')
    ax.set_xlim(0, df.time.max() - 4)
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylim(0, 2)
    ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    ax.legend(loc=2)

    # Plot cumulative hazard and survival models with HPD
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True,
                                             sharey=False, figsize=(16, 6))
    plot_with_hpd(interval_bounds[:-1], tv_base_hazard, cum_hazard,
                  hazard_ax, color=blue, label='Had not metastized')
    plot_with_hpd(interval_bounds[:-1], tv_met_hazard, cum_hazard,
                  hazard_ax, color=red, label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylim(0, 2)
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    plot_with_hpd(interval_bounds[:-1], tv_base_hazard, survival, surv_ax, color=blue)
    plot_with_hpd(interval_bounds[:-1], tv_met_hazard, survival, surv_ax, color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model with time varying effects')

    plt.show()
def test_A_plus_scaled_outer(self):
    f = self.function([self.A, self.x, self.y],
                      self.A + 0.1 * tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, ScipyGer(destructive=False))
    self.run_f(f)  # DebugMode tests correctness
def attributes_update(self, attributes, depth, graph, original_graph, bonds):
    '''Given the current attributes, the current depth, and the graph that
    the attributes are based on, this function will update the 2D attributes
    tensor.'''

    ############# GET NEW ATTRIBUTE MATRIX #########################
    # New pre-activated attribute matrix v = M_i,j,: x ones((N_atom, 1)) -> (N_atom, N_features)
    # as long as dimensions are appropriately shuffled
    shuffled_graph = graph.copy().dimshuffle((2, 0, 1))  # (N_feature x N_atom x N_atom)
    shuffled_graph.name = 'shuffled_graph'

    ones_vec = K.ones_like(attributes[:, 0])  # (N_atom x 1)
    ones_vec.name = 'ones_vec'
    (new_preactivated_attributes, updates) = theano.scan(
        lambda x: K.dot(x, ones_vec),
        sequences=shuffled_graph)  # (N_features x N_atom)

    # Need to pass through an activation function still
    # Final attribute = bond flag = is not part of W_inner or b_inner
    (new_attributes, updates) = theano.scan(
        lambda x: self.activation_inner(
            K.dot(x, self.W_inner[depth, :, :]) + self.b_inner[depth, 0, :]),
        sequences=new_preactivated_attributes[:-1, :].T)  # (N_atom x N_features - 1)

    # Append last feature (bond flag) after the loop
    new_attributes = K.concatenate((new_attributes, attributes[:, -1:]), axis=1)
    new_attributes.name = 'new_attributes'

    ############ UPDATE GRAPH TENSOR WITH NEW ATOM ATTRIBUTES ###################
    ### Node attribute contribution is located in every entry of graph[i,j,:]
    ### where there is a bond @ ij or when i = j (self)

    # Get atoms matrix (identity)
    atoms = T.identity_like(bonds)  # (N_atom x N_atom)
    atoms.name = 'atoms_identity'

    # Combine
    bonds_or_atoms = bonds + atoms  # (N_atom x N_atom)
    bonds_or_atoms.name = 'bonds_or_atoms'

    atom_indeces = T.arange(ones_vec.shape[0])  # 0 to N_atoms - 1 (indices)
    atom_indeces.name = 'atom_indeces vector'

    ### Subtract previous node attribute contribution
    # Multiply each entry in bonds_or_atoms by the previous atom features for that column
    (old_features_to_sub, updates) = theano.scan(
        lambda i: T.outer(bonds_or_atoms[:, i], attributes[i, :]),
        sequences=T.arange(ones_vec.shape[0]))
    old_features_to_sub.name = 'old_features_to_sub'

    ### Add new node attribute contribution
    # Multiply each entry in bonds_or_atoms by the new atom features for that column
    (new_features_to_add, updates) = theano.scan(
        lambda i: T.outer(bonds_or_atoms[:, i], new_attributes[i, :]),
        sequences=T.arange(ones_vec.shape[0]))
    new_features_to_add.name = 'new_features_to_add'

    # Update new graph
    new_graph = graph - old_features_to_sub + new_features_to_add
    new_graph.name = 'new_graph'

    return (new_attributes, new_graph)
def test_int_fails(self):
    self.manual_setup_method("int32")
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains0(f, CGer(destructive=True))
    self.assertFunctionContains0(f, CGer(destructive=False))
def flat_outer(a, b):
    return tt.outer(a, b).ravel()
def test_scaled_A_plus_scaled_outer(self):
    f = self.function([self.A, self.x, self.y],
                      0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, gemm_no_inplace)
    self.run_f(f)  # DebugMode tests correctness
def test_int_fails(self):
    self.setUp('int32')
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains0(f, CGer(destructive=True))
    self.assertFunctionContains0(f, CGer(destructive=False))
def force_outer(l, r):
    return tensor.outer(l, r) if r.ndim == 1 else l.dot(r.T)
def test_optimization_pipeline_float(self):
    skip_if_blas_ldflags_empty()
    self.setUp('float32')
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, CGer(destructive=True))
    f(self.xval, self.yval)  # DebugMode tests correctness
def test_A_plus_scaled_outer(self):
    skip_if_blas_ldflags_empty()
    f = self.function([self.A, self.x, self.y],
                      self.A + 0.1 * tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, CGer(destructive=False))
    self.run_f(f)  # DebugMode tests correctness
def test_profiling(self):
    config1 = theano.config.profile
    config2 = theano.config.profile_memory
    config3 = theano.config.profiling.min_peak_memory
    try:
        theano.config.profile = True
        theano.config.profile_memory = True
        theano.config.profiling.min_peak_memory = True

        x = [T.fvector("val%i" % i) for i in range(3)]
        z = []
        z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
        z += [x[i] + x[i + 1] for i in range(len(x) - 1)]

        p = theano.ProfileStats(False, gpu_checks=False)

        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
            m = "FAST_RUN"
        else:
            m = None

        f = theano.function(x, z, profile=p, name="test_profiling", mode=m)

        inp = [np.arange(1024, dtype="float32") + 1 for i in range(len(x))]
        f(*inp)

        buf = StringIO()
        f.profile.summary(buf)

        # regression testing for future algo speed up
        the_string = buf.getvalue()
        lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
        lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
        if theano.config.device == "cpu":
            assert "CPU: 4112KB (4104KB)" in the_string, (lines1, lines2)
            assert "CPU: 8204KB (8196KB)" in the_string, (lines1, lines2)
            assert "CPU: 8208KB" in the_string, (lines1, lines2)
            assert ("Minimum peak from all valid apply node order is 4104KB"
                    in the_string), (lines1, lines2)
        else:
            assert "CPU: 16KB (16KB)" in the_string, (lines1, lines2)
            assert "GPU: 8204KB (8204KB)" in the_string, (lines1, lines2)
            assert "GPU: 12300KB (12300KB)" in the_string, (lines1, lines2)
            assert "GPU: 8212KB" in the_string, (lines1, lines2)
            assert ("Minimum peak from all valid apply node order is 4116KB"
                    in the_string), (lines1, lines2)
    finally:
        theano.config.profile = config1
        theano.config.profile_memory = config2
        theano.config.profiling.min_peak_memory = config3
def cosine_similarity(x, y, eps=1e-6):
    z = T.dot(x, y.T)
    z /= T.sqrt(T.outer(T.sum(x * x, axis=1), T.sum(y * y, axis=1)) + eps)
    return z
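# A minimal usage sketch for cosine_similarity() (illustrative; assumes
# theano and numpy are available and the function above is in scope):
import numpy as np
import theano
import theano.tensor as T

xm = T.matrix('x')
ym = T.matrix('y')
sim = theano.function([xm, ym], cosine_similarity(xm, ym),
                      allow_input_downcast=True)

a = np.random.randn(3, 5)
b = np.random.randn(4, 5)
S = sim(a, b)  # (3, 4) matrix of pairwise cosine similarities
expected = a[0].dot(b[0]) / (np.linalg.norm(a[0]) * np.linalg.norm(b[0]))
assert abs(S[0, 0] - expected) < 1e-3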
def OptimalGaussian(x_train, y_train, Regression=True, Classification=False,
                    bias=False, n_iter=5, alpha=0.01, minibatch=False):
    '''
    inputs
        x_train: training features
        y_train: response variable
        n_iter: # of iterations for SGD
        alpha: strength of L2 penalty (default penalty for now)

    outputs
        Gaussian Node: dictionary with Node parameters and a predict method
    '''
    rng = numpy.random
    feats = len(x_train[0, :])
    N = len(x_train[:, 0])
    D = [x_train, y_train]
    training_steps = n_iter

    # Declare Theano symbolic variables
    x = T.matrix("x")
    y = T.vector("y")
    w = theano.shared(rng.uniform(low=-0.25, high=0.25, size=feats), name="w")
    b = theano.shared(abs(rng.randn(1)[0]), name="b")
    a = theano.shared(abs(rng.randn(1)[0]), name="a")
    rep = theano.shared(numpy.asarray([1] * N), name="rep")

    # Construct Theano expression graph
    W = T.outer(rep, w)
    if bias:
        p_1 = a * T.exp(-0.5 / (b**2) * T.dot((x - w).T, (x - w)))
    else:
        p_1 = a * T.exp(-0.5 / (1**2) * T.diagonal(T.dot((x - W), (x - W).T)))
    prediction = p_1 > 0.5

    if Regression:
        xent = 0.5 * (y - p_1)**2
    if alpha == 0:
        cost = xent.mean()  # the cost to minimize
    else:
        cost = xent.mean() + alpha * ((w**2).sum())

    # Compute the gradient of the cost
    if bias:
        gw, gb, ga = T.grad(cost, [w, b, a])
    else:
        gw, ga = T.grad(cost, [w, a])

    # Compile
    Node = {}
    Node['Path'] = {}
    NodePath = Node['Path']
    if bias:
        train = theano.function(inputs=[x, y],
                                outputs=[prediction, xent],
                                updates=((w, w - 0.1 * gw),
                                         (b, b - 0.1 * gb),
                                         (a, a - 0.1 * ga)))
    else:
        train = theano.function(inputs=[x, y],
                                outputs=[prediction, xent],
                                updates=((w, w - 0.1 * gw),
                                         (a, a - 0.1 * ga)))
    predict = theano.function(inputs=[x], outputs=p_1)

    # Train
    for i in range(training_steps):
        if minibatch:
            batch_split = train_test_split(x_train, y_train, test_size=0.2)
            _, D[0], _, D[1] = batch_split
            pred, err = train(D[0], D[1])
        elif not minibatch:
            pred, err = train(D[0], D[1])
        NodePath[str(i)] = {}
        NodePath[str(i)]['w'] = w.get_value()
        NodePath[str(i)]['b'] = b.get_value()
        NodePath[str(i)]['a'] = a.get_value()

    Node['w'] = w.get_value()
    Node['b'] = b.get_value()
    Node['a'] = a.get_value()
    Node['predict'] = predict
    return Node
def build_model(tparams, options):
    # for training
    # encoder input
    x_node = tensor.tensor4('x_node', dtype=config.floatX)
    x = tensor.tensor4('x', dtype='int64')
    x_mask_word = tensor.tensor4('x_mask_word', dtype=config.floatX)
    x_mask_sent = tensor.tensor3('x_mask_sent', dtype=config.floatX)
    x_mask_doc = tensor.matrix('x_mask_doc', dtype=config.floatX)
    # decoder input
    dec_inp = tensor.matrix('dec_inp', dtype='int64')
    dec_inp_mask = tensor.matrix('dec_inp_mask', dtype=config.floatX)
    # decoder output
    dec_out = tensor.matrix('dec_out', dtype='int64')
    dec_out_mask = tensor.matrix('dec_out_mask', dtype=config.floatX)

    # TODO: for generation
    hidi = tensor.matrix('hidi', dtype=config.floatX)
    celi = tensor.matrix('celi', dtype=config.floatX)
    hids = tensor.tensor4('hids', dtype=config.floatX)
    xi = tensor.vector('xi', dtype='int64')
    xi_mask = tensor.vector('xi_mask', dtype=config.floatX)

    preds, f_encode, f_decode, f_probi = ptr_network(
        tparams, x_node, x, x_mask_word, x_mask_sent, x_mask_doc,
        dec_inp, dec_inp_mask, xi, xi_mask, hidi, celi, hids, options)

    n_steps = preds.shape[0]
    n_sents = preds.shape[1]
    n_docs = preds.shape[2]
    n_clusters = preds.shape[3]
    # preds = preds.reshape([n_steps, n_sents * n_docs, n_clusters])
    preds_contiguous = preds.dimshuffle(0, 2, 1, 3).reshape(
        [n_steps, n_docs * n_sents, n_clusters])

    # pull out the probs of the correct ones
    n_steps = dec_inp.shape[0]
    n_samples = dec_inp.shape[1]
    idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                             tensor.ones((n_samples,), dtype='int64'))
    idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'),
                               tensor.arange(n_samples, dtype='int64'))
    # idx_steps, dec_out, idx_samples are all n_steps x n_samples,
    # so probs is also n_steps x n_samples
    probs = preds_contiguous[idx_steps, dec_out, idx_samples]

    # probs *= y_mask
    off = 1e-8
    if probs.dtype == 'float16':
        off = 1e-6
    # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
    probs += off

    cost = -tensor.log(probs)
    cost *= dec_out_mask
    # TODO: might cause NaN here!
    # This should be okay since in dec_out_mask we always have at least
    # one 1. for the terminate signal.
    cost = cost.sum(axis=0) / tensor.maximum(1.0, dec_out_mask.sum(axis=0))
    cost = cost.mean()

    return (x_node, x, x_mask_word, x_mask_sent, x_mask_doc, dec_inp,
            dec_inp_mask, dec_out, dec_out_mask, preds, cost,
            f_encode, f_decode, f_probi)
def free_energy(self, v_sample):
    D = T.sum(v_sample, axis=1)
    wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias)
    vbias_term = T.dot(v_sample, self.vbias)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    return -hidden_term - vbias_term
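# This is the replicated-softmax free energy, where the document length
# D = sum_i v_i scales the hidden bias (cf. the RBM/RSM difference noted in
# the docstring of the free_energy variant earlier in this collection):
#   F(v) = -v^T b_vis - sum_j log(1 + exp((v^T W)_j + D * b_hid_j))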
def prepare_model(x_train, y_train, batchsize, params=None):
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')
    same_cluster_indices_matrix = T.matrix('same_clusters')
    diff_cluster_indices_matrix = T.matrix('diff_clusters')

    # prepare network
    print '\nPreparing the model with primary hidden layer size %d...' % HOURGLASS_LAYER_SIZE
    print 'X-shape = %d, Num_classes = %d, num_samples = %d' % (
        x_train[0].shape[0], max(y_train), len(x_train))
    representation_layer, network = build_args_nn(x_train, y_train, batchsize, input_var)

    # loss stuff
    prediction = lasagne.layers.get_output(network)
    get_representations = lasagne.layers.get_output(representation_layer,
                                                    inputs=input_var,
                                                    deterministic=True)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    if LAMBDA1 == LAMBDA2 == 0.0:
        loss = loss.mean()
    else:
        representations = get_representations
        dot_prods = T.dot(representations, representations.T)  # X times X.T
        diag = T.sqrt(T.diagonal(dot_prods))  # sqrt(||ri||^2) = ||ri||
        norms = T.outer(diag, diag.T)
        # d(a,b) = 1/2 (1 - dot(a,b) / (||a|| * ||b||))
        distances = 0.5 * (1 - (dot_prods * (1. / norms)))
        # We want the first sum to be as close to zero as possible, so we add
        # it to the loss. We want the second sum to be as close to 1 as
        # possible, so we want LAMBDA2 * (1 - sum2) to be as close to zero as
        # possible, thus adding that difference to the overall loss.
        loss = loss.mean() \
            + (LAMBDA1 * T.sum(same_cluster_indices_matrix * distances)) \
            + (LAMBDA2 * (1.0 - T.sum(diff_cluster_indices_matrix * distances)))

    # for loading/building the parameters
    if not params:
        params = lasagne.layers.get_all_params(network, trainable=True)
    else:
        lasagne.layers.set_all_param_values(network, params)
        params = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adam(loss, params, learning_rate=LEARNING_RATE)

    # the final keys
    train_function = theano.function(
        [input_var, target_var, same_cluster_indices_matrix,
         diff_cluster_indices_matrix],
        loss, updates=updates, allow_input_downcast=True,
        on_unused_input='ignore')
    convert_to_numpy_function = theano.function([input_var],
                                                get_representations,
                                                allow_input_downcast=True)
    return network, train_function, convert_to_numpy_function
def sample_h_given_v(self, v, beta, D):
    pre_sigmoid_activation = beta * (T.dot(v, self.model.W) + T.outer(D, self.model.hbias))
    h_mean = T.nnet.sigmoid(pre_sigmoid_activation)
    h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean,
                                        dtype=theano.config.floatX)
    return h_sample
def __init__(self, X_train, y_train, X_test, y_test, num_class,
             batch_size=100, max_iter=1000, M=20, n_hidden=50,
             a0=1, b0=1, master_stepsize=5e-4, auto_corr=0.99):
    self.n_hidden = n_hidden
    self.d = X_train.shape[1]  # number of data, dimension
    self.M = M
    self.num_class = num_class
    self.batch_size = batch_size
    self.stepsize = master_stepsize
    self.epoch = int(max_iter * batch_size / 60000)  # for MNIST

    # w1: d*n_hidden; b1: n_hidden; w3 = n_hidden; b3 = 1; 2 variances
    num_vars = self.d * n_hidden + n_hidden + n_hidden * num_class + num_class + 2 \
        + n_hidden * (n_hidden + 1) + 4 * self.n_hidden + self.num_class + self.d
    self.theta = np.zeros([self.M, num_vars])  # particles, will be initialized later

    '''
    The data sets are normalized so that the input features and the targets
    have zero mean and unit variance
    '''
    self.std_X_train = np.std(X_train, 0)
    self.std_X_train[self.std_X_train == 0] = 1
    self.mean_X_train = np.mean(X_train, 0)
    self.mean_y_train = np.mean(y_train)
    self.std_y_train = np.std(y_train)
    self.history = np.zeros([self.epoch, 2])
    self.learningRateBlock = int(self.epoch * 0.2 * 60000 / self.batch_size)
    self.learningRateBlockDecay = 0.5

    '''
    Theano symbolic variables
    Define the neural network here
    '''
    X = T.matrix('X')  # feature matrix
    y = T.matrix('y')  # labels

    w_1 = T.matrix('w_1')   # weights between input layer and hidden layer
    v_11 = T.vector('v_11')
    v_12 = T.vector('v_12')  # transform vector between input layer and hidden layer
    b_1 = T.vector('b_1')   # bias vector of hidden layer

    w_2 = T.matrix('w_2')   # weights between hidden layer and hidden layer
    v_21 = T.vector('v_21')
    v_22 = T.vector('v_22')
    b_2 = T.vector('b_2')   # bias of output

    w_3 = T.matrix('w_3')   # weights between hidden layer and output layer
    v_31 = T.vector('v_31')
    v_32 = T.vector('v_32')  # transform vector between output layer and hidden layer
    b_3 = T.vector('b_3')   # bias of output

    N = T.scalar('N')  # number of observations

    p_1 = T.eye(self.d) - 2 * T.outer(v_11, v_11) / T.sum(v_11**2)
    q_1 = T.eye(self.n_hidden) - 2 * T.outer(v_12, v_12) / T.sum(v_12**2)
    p_2 = T.eye(self.n_hidden) - 2 * T.outer(v_21, v_21) / T.sum(v_21**2)
    q_2 = T.eye(self.n_hidden) - 2 * T.outer(v_22, v_22) / T.sum(v_22**2)
    p_3 = T.eye(self.n_hidden) - 2 * T.outer(v_31, v_31) / T.sum(v_31**2)
    q_3 = T.eye(self.num_class) - 2 * T.outer(v_32, v_32) / T.sum(v_32**2)

    wf_1 = T.dot(T.dot(p_1, w_1), q_1)
    wf_2 = T.dot(T.dot(p_2, w_2), q_2)
    wf_3 = T.dot(T.dot(p_3, w_3), q_3)

    log_gamma = T.scalar('log_gamma')  # variance-related parameters
    log_lambda = T.scalar('log_lambda')

    prediction = T.nnet.nnet.softmax(
        T.dot(
            T.nnet.relu(batchnorm(
                T.dot(T.nnet.relu(batchnorm(T.dot(X, wf_1) + b_1)), wf_2) + b_2)),
            wf_3) + b_3)

    '''
    define the log posterior distribution
    '''
    priorprec = T.log(b0 / a0)
    log_lik_data = T.sum(T.sum(y * T.log(prediction)))
    log_prior_w = -0.5 * (num_vars - 2) * (T.log(2 * np.pi) - priorprec) \
        - (T.exp(priorprec) / 2) * ((w_1**2).sum() + (w_2**2).sum() + (w_3**2).sum()
                                    + (b_1**2).sum() + (b_2**2).sum() + (b_3**2).sum()) \
        + 1e-9 * log_gamma + 1e-9 * log_lambda

    # sub-sampling mini-batches of data, where (X, y) is the batch data,
    # and N is the number of whole observations
    log_posterior = log_lik_data * N / X.shape[0] + log_prior_w
    dw_1, db_1, dw_2, db_2, dw_3, db_3, dv_11, dv_12, dv_21, dv_22, dv_31, \
        dv_32, d_log_gamma, d_log_lambda = T.grad(
            log_posterior,
            [w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22, v_31,
             v_32, log_gamma, log_lambda])  # automatic gradient

    logp_gradient = theano.function(
        inputs=[X, y, w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22,
                v_31, v_32, log_gamma, log_lambda, N],
        outputs=[dw_1, db_1, dw_2, db_2, dw_3, db_3, dv_11, dv_12, dv_21,
                 dv_22, dv_31, dv_32, d_log_gamma, d_log_lambda])

    # prediction function
    self.nn_predict = theano.function(
        inputs=[X, w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22,
                v_31, v_32],
        outputs=prediction)

    '''
    Training with SVGD
    '''
    # normalization
    X_train = self.normalization(X_train)
    N0 = X_train.shape[0]  # number of observations

    '''
    initializing all particles
    '''
    for i in range(self.M):
        w1, b1, w2, b2, w3, b3, v11, v12, v21, v22, v31, v32, loggamma, loglambda = \
            self.init_weights(a0, b0)
        # use better initialization for gamma
        ridx = np.random.choice(range(X_train.shape[0]),
                                np.min([X_train.shape[0], 1000]), replace=False)
        y_hat = self.nn_predict(X_train[ridx, :], w1, b1, w2, b2, w3, b3,
                                v11, v12, v21, v22, v31, v32)
        loggamma = -np.log(np.mean(np.power(y_hat - y_train[ridx], 2)))
        self.theta[i, :] = self.pack_weights(w1, b1, w2, b2, w3, b3, v11, v12,
                                             v21, v22, v31, v32, loggamma, loglambda)

    grad_theta = np.zeros([self.M, num_vars])  # gradient

    # adagrad with momentum
    fudge_factor = 1e-5
    historical_grad = 0
    for iter in range(max_iter):
        # sub-sampling
        batch = [i % N0 for i in range(iter * batch_size, (iter + 1) * batch_size)]
        for i in range(self.M):
            w1, b1, w2, b2, w3, b3, v11, v12, v21, v22, v31, v32, loggamma, loglambda = \
                self.unpack_weights(self.theta[i, :])
            dw1, db1, dw2, db2, dw3, db3, dv11, dv12, dv21, dv22, dv31, dv32, \
                dloggamma, dloglambda = logp_gradient(
                    X_train[batch, :], y_train[batch], w1, b1, w2, b2, w3, b3,
                    v11, v12, v21, v22, v31, v32, loggamma, loglambda, N0)
            grad_theta[i, :] = self.pack_weights(dw1, db1, dw2, db2, dw3, db3,
                                                 dv11, dv12, dv21, dv22, dv31,
                                                 dv32, dloggamma, dloglambda)

        # calculating the kernel matrix
        if self.M > 1:
            kxy, dxkxy = self.svgd_kernel(h=-1)
            grad_theta = (np.matmul(kxy, grad_theta) + dxkxy) / self.M  # \Phi(x)

        # adagrad
        if iter == 0:
            historical_grad = historical_grad + np.multiply(grad_theta, grad_theta)
        else:
            historical_grad = auto_corr * historical_grad + \
                (1 - auto_corr) * np.multiply(grad_theta, grad_theta)
        adj_grad = np.divide(grad_theta, fudge_factor + np.sqrt(historical_grad))
        if (iter + 1) % self.learningRateBlock == 0:
            master_stepsize = master_stepsize * self.learningRateBlockDecay
            print(master_stepsize)
        self.theta = self.theta + master_stepsize * adj_grad

        if iter * self.batch_size % X_train.shape[0] == 0:
            epoch_index = int(iter * self.batch_size / X_train.shape[0])
            pred = self.predict(X_test)
            self.history[epoch_index, 0] = self.evluation(X_train, y_train, iter)
            self.history[epoch_index, 1] = sum(pred == y_test) * 1.0 / X_test.shape[0]
            print('Epoch ', iter * self.batch_size / X_train.shape[0],
                  ' Iter:', iter, ' Cost: ', self.history[epoch_index, 0])
            print('Precision: ', self.history[epoch_index, 1])
            if epoch_index % 10 == 0:
                np.savez('structure' + np.str(epoch_index) + '.npz',
                         v11=v11, v12=v12, v21=v21, v22=v22, v31=v31, v32=v32)

    self.savemodel()
def test_optimization_pipeline(self):
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, CGer(destructive=True))
    f(self.xval, self.yval)  # DebugMode tests correctness
def __init__(self, glimpse_shape, glimpse_times, dim_hidden, dim_fc, dim_out,
             reward_base, rng_std=1.0, activation=T.tanh, bptt_truncate=-1,
             lmbd=0.1,  # gdupdate + lmbd*rlupdate
             DEBUG=False):
    # super(AttentionUnit, self).__init__()
    if reward_base is None:
        reward_base = np.zeros((glimpse_times,)).astype('float32')
        reward_base[-1] = 1.0

    x = T.ftensor3('x')   # N * W * H
    y = T.ivector('y')    # label
    lr = T.fscalar('lr')
    reward_base = theano.shared(name='reward_base',
                                value=np.array(reward_base).astype(theano.config.floatX),
                                borrow=True)  # Time (vector)
    reward_bias = T.fvector('reward_bias')
    # rng = T.shared_randomstreams.RandomStreams(123)
    rng = MRG_RandomStreams(np.random.randint(9999999))

    i = InputLayer(x)
    au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng,
                       rng_std, activation, bptt_truncate)
    # All hidden states are fed into the decoder:
    # layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))]
    # dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out]
    # Only the last hidden state:
    layers = [i, au, InputLayer(au.output[:, -1, :])]
    dim_fc = [dim_hidden] + dim_fc + [dim_out]
    for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
        fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
        layers.append(fc)
    sm = SoftmaxLayer(layers[-1].output)
    layers.append(sm)

    output = sm.output            # N * classes
    hidoutput = au.output         # N * dim_output
    location = au.location        # N * T * dim_hidden
    prediction = output.argmax(1) # N

    # calc
    equalvec = T.eq(prediction, y)  # [0, 1, 0, 0, 1 ...]
    correct = T.cast(T.sum(equalvec), 'float32')
    # noequalvec = T.neq(prediction, y)
    # nocorrect = T.cast(T.sum(noequalvec), 'float32')
    logLoss = T.log(output)[T.arange(y.shape[0]), y]
    # reward_biased = T.outer(equalvec, reward_base - reward_bias.dimshuffle('x', 0))
    reward_biased = T.outer(equalvec, reward_base) - reward_bias.dimshuffle('x', 0)
    # N * Time; (R_t - b_t), where b = E[R]

    # gradient descent
    gdobjective = logLoss.sum() / x.shape[0]
    # correct * dim_output (only has a value on the correctly predicted samples)
    gdparams = reduce(lambda x, y: x + y.params, layers, [])
    gdupdates = map(lambda x: (x, x + lr * T.grad(gdobjective, x)), gdparams)

    # reinforcement learning
    # without the maximum, -log(p) would decrease p
    rlobjective = (T.maximum(reward_biased.dimshuffle(0, 1, 'x'), 0)
                   * T.log(au.location_p)).sum() / correct
    # location_p: N * Time * 2
    # location_logp: N * Time
    # reward_biased: N * 2
    rlparams = au.reinforceParams
    rlupdates = map(lambda x: (x, x + lr * lmbd * T.grad(rlobjective, x)), rlparams)

    # Penalize changes of the mean hidden state over time
    deltas = T.stack(*[((au.output[:, i, :].mean(0)
                         - au.output[:, i + 1, :].mean(0))**2).sum()
                       for i in xrange(glimpse_times - 1)])  # N * Time * dim_hidden

    print 'compile step()'
    self.step = theano.function([x, y, lr, reward_bias],
                                [gdobjective, rlobjective, correct,
                                 T.outer(equalvec, reward_base)],
                                updates=gdupdates + rlupdates)
    # print 'compile gdstep()'
    # self.gdstep = theano.function([x, y, lr], [gdobjective, correct, location], updates=gdupdates)
    # print 'compile rlstep()'
    # self.rlstep = theano.function([x, y, lr], [rlobjective], updates=rlupdates)
    print 'compile predict()'
    self.predict = theano.function([x], prediction)
    if DEBUG:
        print 'compile glimpse()'
        self.glimpse = theano.function([x], au.glimpse)  # [layers[-3].output, fc.output])
        print 'compile innerstate()'
        self.getinnerstate = theano.function([x], au.innerstate)
        print 'compile forward()'
        self.forward = theano.function([x], map(lambda x: x.output, layers))  # [layers[-3].output, fc.output])
        print 'compile error()'
        self.error = theano.function([x, y, reward_bias],
                                     [gdobjective, rlobjective])
        print 'compile locate()'
        self.locate = theano.function([x], [au.location_mean, location])  # [layers[-3].output, fc.output])
        print 'compile debug()'
        self.debug = theano.function([x, y, lr, reward_bias],
                                     [deltas, au.location_p],
                                     on_unused_input='warn')

    # self.xxx
    self.layers = layers
    self.params = gdparams + rlparams
    self.glimpse_times = glimpse_times
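# The REINFORCE baseline above hinges on one broadcast: T.outer(equalvec,
# reward_base) hands each sample the full reward schedule only when its
# prediction is correct, and subtracting the running baseline yields
# (R_t - b_t) per sample and timestep. A stand-alone sketch of just that
# piece (names here are illustrative, not from the class above):
import numpy as np
import theano
import theano.tensor as T

equal = T.fvector('equal')  # N: 1.0 where prediction == label, else 0.0
base = T.fvector('base')    # Time: reward schedule (e.g. reward at last glimpse)
bias = T.fvector('bias')    # Time: running estimate of E[R]
biased = T.outer(equal, base) - bias.dimshuffle('x', 0)  # N * Time
f = theano.function([equal, base, bias], biased)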
def predict(self, mx, Sx, *args, **kwargs):
    if self.N < self.n_inducing:
        # stick with the full GP
        return GP_UI.predict(self, mx, Sx)

    idims = self.D
    odims = self.E

    # centralize inputs
    zeta = self.X_sp - mx

    # initialize some variables
    sf2 = self.hyp[:, idims]**2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE/lscales.dimshuffle(0, 1, 'x')

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    B = (iLdotSx[:, :, None, :]*iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2/tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l_ = tt.exp(-0.5*tt.sum(inp*t, 2))
    lb = l_*self.beta_sp
    M = tt.sum(lb, 1)*c

    # input-output covariance
    tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T*c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5*tt.sum(inp*inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx.T).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))
    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r, z_, Sx):
        # This comes from Deisenroth's thesis (Eqs. 2.51-2.54)
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5*solve(Rij, Sx))
        Q = tt.exp(n2)/tt.sqrt(det(Rij))
        # Eq. 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(
            tt.eq(i, j), m2 - tt.sum(iK[i]*Q) + sf2[i], m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        M2 = theano.ifelse.ifelse(
            tt.eq(i, j), M2 + 1e-6, tt.set_subtensor(M2[j, i], m2))
        return M2

    nseq = [self.beta_sp, (self.iKmm - self.iBmm), sf2, R,
            logk_c, logk_r, z_, Sx]
    M2_, updts = theano.scan(fn=second_moments,
                             sequences=indices,
                             outputs_info=[M2],
                             non_sequences=nseq,
                             allow_gc=False)
    M2 = M2_[-1]
    S = M2 - tt.outer(M, M)

    return M, S, V
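# The final step above is the standard identity S = E[y y^T] - E[y] E[y]^T:
# the scan fills the second-moment matrix M2, and tt.outer(M, M) removes the
# mean contribution. A numeric illustration with made-up moments (numpy
# only, values assumed):
import numpy as np
M = np.array([1.0, 2.0])                  # assumed predictive mean E[y]
M2 = np.array([[2.0, 2.5], [2.5, 5.0]])   # assumed second moments E[y y^T]
S = M2 - np.outer(M, M)                   # predictive covariance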
def prop_up(self, vis, D=None):
    if D is None:
        D = self.D
    pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias)
    return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]
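# Usage sketch (hypothetical shapes): T.outer(D, self.hbias) builds an
# (N x n_hidden) bias matrix, scaling the hidden bias per example (e.g. by
# document length in a replicated-softmax RBM). numpy equivalent for a
# batch of 3 examples and 2 hidden units:
import numpy as np
D = np.float32([1., 2., 3.])      # per-example scaling factors (assumed)
hbias = np.float32([0.1, -0.2])
bias_matrix = np.outer(D, hbias)  # shape (3, 2); row n is D[n] * hbias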
import numpy as np
import theano
import theano.tensor as T

# Responsibility-weighted M-step updates for a Gaussian mixture:
# gama holds responsibilities (N x K), z_var the data (N x D).
gama = theano.shared(
    np.float32([[0.2, 0.4, 0.4],
                [0.7, 0.2, 0.1],
                [0.2, 0.1, 0.7],
                [0.5, 0.3, 0.2]]))
z_var = theano.shared(
    np.float32([[1, 2, 3, 4, 9],
                [1, 2, 5, 6, 8],
                [1, 3, 6, 8, 6],
                [8, 2, 7, 4, 1]]))

mu_N_k_z, updates_dete = theano.scan(lambda z_s, gama_s: T.outer(z_s, gama_s),
                                     sequences=[z_var, gama])
phi_k = T.mean(gama, axis=0)
mu_z_k = T.sum(mu_N_k_z, axis=0) / T.sum(gama, axis=0)
mu_k_z = T.transpose(mu_z_k, (1, 0))
sigma_N_k_z_z, _ = theano.scan(
    lambda z_v, gama_v: theano.scan(
        lambda mu_v, gama_v_v: gama_v_v * T.outer(z_v - mu_v, z_v - mu_v),
        sequences=[mu_k_z, gama_v]),
    sequences=[z_var, gama])
sigma_k_z_z = T.sum(sigma_N_k_z_z, axis=0)
sigma_k_z_z = T.transpose(
    T.transpose(sigma_k_z_z, (1, 2, 0)) / T.sum(gama, axis=0), (2, 0, 1))
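# Cross-check of the mean update without scan (numpy only, same made-up
# data as above): the scan over T.outer(z_s, gama_s) accumulates
# sum_n gama[n, k] * z[n], and dividing by sum_n gama[n, k] gives the
# responsibility-weighted means:
import numpy as np
gama_np = np.float32([[0.2, 0.4, 0.4], [0.7, 0.2, 0.1],
                      [0.2, 0.1, 0.7], [0.5, 0.3, 0.2]])
z_np = np.float32([[1, 2, 3, 4, 9], [1, 2, 5, 6, 8],
                   [1, 3, 6, 8, 6], [8, 2, 7, 4, 1]])
mu_k_z_np = np.dot(gama_np.T, z_np) / gama_np.sum(0)[:, None]  # K x D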
def free_energy(self, v, beta, D):
    return -beta * T.dot(v, self.model.vbias.T) - T.sum(
        T.log(1 + T.exp(beta * (T.dot(v, self.model.W)
                                + T.outer(D, self.model.hbias)))),
        axis=1)
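# Restated as a formula (my reading of the code above, not from the source):
# this is the temperature-scaled RBM free energy with a per-example
# hidden-bias scaling D,
#   F(v) = -beta * v . b_v - sum_j log(1 + exp(beta * (v W + outer(D, b_h))_j)),
# which reduces to the usual RBM free energy when beta = 1 and D = 1.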
def attributes_update(self, attributes, depth, graph, original_graph, bonds):
    '''Given the current attributes, the current depth, and the graph that
    the attributes are based on, this function will update the 2D attributes
    tensor'''

    ############# GET NEW ATTRIBUTE MATRIX #########################
    # New pre-activated attribute matrix v = M_i,j,: x ones((N_atom, 1)) -> (N_atom, N_features)
    # as long as dimensions are appropriately shuffled
    shuffled_graph = graph.copy().dimshuffle(
        (2, 0, 1))  # (N_feature x N_atom x N_atom)
    shuffled_graph.name = 'shuffled_graph'
    ones_vec = K.ones_like(attributes[:, 0])  # (N_atom x 1)
    ones_vec.name = 'ones_vec'

    # Embed individually
    # (scan sequences iterates over the FIRST dimension)
    # (flatten(ndim) keeps the first ndim-1 dimensions the same, then expands the rest to fill)
    flattened_graph = shuffled_graph.flatten(
        ndim=2).T  # (N_atom^2 x N_feature)
    # Embed each possible atom-atom interaction
    (new_presummed_attributes_flat, updates) = theano.scan(
        lambda x: self.activation_inner(
            K.dot(x[:-1], self.W_inner[depth, :, :]) + self.b_inner[depth, 0, :]),
        sequences=flattened_graph)  # still (N_atom^2 x N_feature)

    # Reshape into (N_feature-1 x N_atom x N_atom)
    new_presummed_attributes = new_presummed_attributes_flat.T.reshape(
        shuffled_graph[:-1, :, :].shape)

    # Now sum activated self+neighbors
    (new_attributes, updates) = theano.scan(
        lambda x: K.dot(x, ones_vec),
        sequences=new_presummed_attributes)  # (N_features x N_atom)

    # Append last feature (bond flag) after the loop
    new_attributes = K.concatenate((new_attributes.T, attributes[:, -1:]),
                                   axis=1)
    new_attributes.name = 'new_attributes'

    ############ UPDATE GRAPH TENSOR WITH NEW ATOM ATTRIBUTES ###################
    ### Node attribute contribution is located in every entry of graph[i,j,:] where
    ### there is a bond @ ij or when i = j (self)

    # Get atoms matrix (identity)
    atoms = T.identity_like(bonds)  # (N_atom x N_atom)
    atoms.name = 'atoms_identity'
    # Combine
    bonds_or_atoms = bonds + atoms  # (N_atom x N_atom)
    bonds_or_atoms.name = 'bonds_or_atoms'

    atom_indices = T.arange(ones_vec.shape[0])  # 0 to N_atoms - 1 (indices)
    atom_indices.name = 'atom_indices vector'

    ### Subtract previous node attribute contribution
    # Multiply each entry in bonds_or_atoms by the previous atom features for that column
    (old_features_to_sub, updates) = theano.scan(
        lambda i: T.outer(bonds_or_atoms[:, i], attributes[i, :]),
        sequences=T.arange(ones_vec.shape[0]))
    old_features_to_sub.name = 'old_features_to_sub'

    ### Add new node attribute contribution
    # Multiply each entry in bonds_or_atoms by the new atom features for that column
    (new_features_to_add, updates) = theano.scan(
        lambda i: T.outer(bonds_or_atoms[:, i], new_attributes[i, :]),
        sequences=T.arange(ones_vec.shape[0]))
    new_features_to_add.name = 'new_features_to_add'

    # Update new graph
    new_graph = graph - old_features_to_sub + new_features_to_add
    new_graph.name = 'new_graph'

    return (new_attributes, new_graph)
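# Each scan above stacks rank-1 outer products: slice i is
# outer(bonds_or_atoms[:, i], attributes[i, :]). With bonds_or_atoms
# (N x N) and attributes (N x F), the same (N x N x F) stack can be built
# without scan via broadcasting. A scan-free sketch (assumed shapes, not
# from the original code):
import theano
import theano.tensor as T
B = T.matrix('bonds_or_atoms')  # N x N
A = T.matrix('attributes')      # N x F
stacked = B.T.dimshuffle(0, 1, 'x') * A.dimshuffle(0, 'x', 1)  # N x N x F
# stacked[i, j, f] == B[j, i] * A[i, f], matching the scan output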
def __init__(self, rng, size, N_word, max_length,
             Wf_values=None, Wp_values=None, L_values=None,
             activation=T.tanh):
    self.size = size
    self.max_length = max_length

    # initialize Wf, bf
    if Wf_values is None:
        Wf_values = np.asarray(rng.uniform(
            low=-np.sqrt(6. / (size + size * 2)),
            high=np.sqrt(6. / (size + size * 2)),
            size=(size, size * 2 + 1)), dtype=theano.config.floatX)
        if activation == T.nnet.sigmoid:
            Wf_values *= 4
    Wf = theano.shared(value=Wf_values, name='Wf', borrow=True)
    self.Wf = Wf

    # initialize Wp, bp
    if Wp_values is None:
        Wp_values = np.asarray(rng.uniform(low=-np.sqrt(6. / (size * 2)),
                                           high=np.sqrt(6. / (size * 2)),
                                           size=(2 * size + 1, )),
                               dtype=theano.config.floatX)
    Wp = theano.shared(value=Wp_values, name='Wp', borrow=True)
    self.Wp = Wp

    if L_values is None:
        L_values = np.asarray(rng.uniform(low=-np.sqrt(6. / (N_word)),
                                          high=np.sqrt(6. / (N_word)),
                                          size=(N_word, size)),
                              dtype=theano.config.floatX)
        self.L = theano.shared(value=L_values, name='L', borrow=True)
        self.params = [self.Wf, self.Wp, self.L]
    else:
        self.L = theano.shared(value=L_values, name='L', borrow=True)
        self.params = [
            self.Wf,
            self.Wp,
            # self.L
        ]

    self.L1 = (abs(self.Wf).sum() + abs(self.Wp).sum())
    self.L2_sqr = ((self.Wf**2).sum() + (self.Wp**2).sum())

    v1 = T.fvector('v1')
    v2 = T.fvector('v2')
    dv = T.fvector('dv')
    v = T.fvector('v')
    p = T.fscalar('p')
    p1 = T.fscalar('p1')
    p2 = T.fscalar('p2')
    dp = T.fscalar('dp')
    i = T.iscalar('i')

    f_function = self.f_function(v1, v2)
    p_function = self.g_function(v1, v2) * p1 * p2

    self.f = theano.function(inputs=[v1, v2], outputs=f_function)
    self.p = theano.function(inputs=[v1, v2, p1, p2], outputs=p_function)
    self.L_i = theano.function(inputs=[i], outputs=self.L[i])

    da = (1 - v**2) * dv
    dWf = T.outer(da, T.concatenate([v1, v2, [np.float32(1.0)]]))
    g_f = [
        dWf,
        T.dot(da, self.Wf[:, 0:self.size]),
        T.dot(da, self.Wf[:, self.size:self.size * 2])
    ]
    # g_p = [
    #     T.grad(p_function, element) * dp
    #     for element in [self.Wp, v1, v2, p1, p2]
    # ]
    b = p / p1 / p2
    db = b * (1 - b)
    temp = dp * p1 * p2 * db
    g_p = [
        temp * T.concatenate([v1, v2, [np.float32(1.0)]]),
        temp * self.Wp[0:self.size],
        temp * self.Wp[self.size:self.size * 2],
        dp * p / p1,
        dp * p / p2
    ]
    self.g_p = theano.function(inputs=[v1, v2, p1, p2, p, dp], outputs=g_p)
    self.g_f = theano.function(inputs=[v1, v2, v, dv], outputs=g_f)
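# The hand-derived update above uses the tanh chain rule:
#   da = (1 - v**2) * dv,  dWf = outer(da, [v1; v2; 1]).
# A self-contained check of the same pattern against T.grad, for a toy
# layer a = tanh(W u) with loss = sum(a) so that dL/da = 1 (all names and
# sizes here are assumptions for illustration):
import numpy as np
import theano
import theano.tensor as T
W = theano.shared(np.random.randn(3, 5).astype('float32'), name='W')
u = T.fvector('u')
a = T.tanh(T.dot(W, u))
gW_auto = T.grad(a.sum(), W)          # automatic gradient
gW_hand = T.outer((1 - a**2), u)      # manual chain-rule gradient
f = theano.function([u], [gW_auto, gW_hand])
ga, gh = f(np.random.randn(5).astype('float32'))
assert np.allclose(ga, gh, atol=1e-5)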
def test_outer(self):
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, ScipyGer(destructive=True))
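# Note (my reading of the flag, not stated in the test): destructive=True
# marks the Ger op as performing the rank-1 update in place on its matrix
# argument. Since T.outer allocates that matrix fresh, the optimizer can
# safely substitute the destructive variant, which is what the assertion
# checks for.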
import theano
import numpy
import theano.tensor as T

bn = numpy.array([5, 6, 100, 200])
bn = bn.reshape(2, 2)
print "Bn shape", bn.shape

x = T.matrix('x')
b = theano.shared(bn)
y = T.outer(x, b)
f = theano.function(inputs=[x], outputs=[y])

xn = numpy.array([1, 2, 3, 4])
xn = xn.reshape(2, 2)
print "Xn shape", xn.shape
print f(xn)
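# What the script above should print (based on T.outer's documented
# flattening of non-vector inputs): the two 2x2 arguments behave as
# length-4 vectors, so y comes back as a 4x4 matrix. numpy equivalent:
import numpy
print numpy.outer(numpy.array([1, 2, 3, 4]),
                  numpy.array([5, 6, 100, 200]))  # matches f(xn)[0]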
def __init__(self, obsfeat_space, action_space, enable_inputnorm,
             favor_zero_expert_reward, include_time, time_scale,
             exobs_Bex_Do, exa_Bex_Da, ext_Bex,
             kernel_bandwidth_params, kernel_batchsize, kernel_reg_weight,
             use_median_heuristic, use_logscale_reward, save_reward, epsilon):
    self.obsfeat_space, self.action_space = obsfeat_space, action_space
    self.favor_zero_expert_reward = favor_zero_expert_reward
    self.include_time = include_time
    self.time_scale = time_scale
    self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex = exobs_Bex_Do, exa_Bex_Da, ext_Bex
    self.use_logscale_reward = use_logscale_reward
    self.save_reward = save_reward
    self.epsilon = epsilon

    with nn.variable_scope('inputnorm'):
        # Standardize both observations and actions if actions are continuous;
        # otherwise standardize observations only.
        self.inputnorm = (nn.Standardizer if enable_inputnorm else nn.NoOpStandardizer)(
            (obsfeat_space.dim + action_space.dim)
            if isinstance(action_space, ContinuousSpace) else obsfeat_space.dim)
        self.inputnorm_updated = False
    self.update_inputnorm(self.exobs_Bex_Do, self.exa_Bex_Da)  # pre-standardize with expert data

    # Expert feature expectations
    # self.expert_feat_Df = self._compute_featexp(self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex)
    self.expert_feat_B_Df = self._featurize(self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex)

    # Arguments for the MMD reward
    self.kernel_bandwidth_params = kernel_bandwidth_params
    self.kernel_batchsize = kernel_batchsize
    self.kernel_reg_weight = kernel_reg_weight
    self.use_median_heuristic = use_median_heuristic
    self.mmd_square = 1.
    self.expert_sigmas = []
    self.iteration = 0
    self.YY = None
    self.min_param = 100.0
    self.max_param = 300.0

    # MMD reward function
    # - Radial basis function kernel, averaged over bandwidths:
    #     k(x, y) = mean_i exp(-sigma_i * ||x - y||^2)
    # - sigmas: bandwidth parameters
    x = T.matrix('x')
    y = T.matrix('y')
    sigmas = T.vector('sigmas')
    feat_dim = self.expert_feat_B_Df.shape[1]

    # - dist_B[i, j]: ||x[i] - y[j]||^2
    # Normalize x, y w.r.t. the feature dimension: in high dimensions, a
    # small per-coordinate difference between x and y makes a large
    # difference in the total kernel value.
    normalized_x = x / feat_dim
    normalized_y = y / feat_dim
    dist_B = ((normalized_x)**2).sum(1).reshape((normalized_x.shape[0], 1)) \
        + ((normalized_y)**2).sum(1).reshape((1, normalized_y.shape[0])) \
        - 2*(normalized_x).dot((normalized_y).T)
    rbf_kernel_sum, _ = theano.scan(fn=lambda sigma, distance: T.exp(-sigma*distance),
                                    outputs_info=None,
                                    sequences=sigmas,
                                    non_sequences=dist_B)
    rbf_kernel = rbf_kernel_sum.mean(axis=0)
    if self.kernel_reg_weight > 0.0:
        xynorm = T.outer(normalized_x.norm(2, axis=1), normalized_y.norm(2, axis=1))
        rbf_kernel += self.kernel_reg_weight*((normalized_x).dot(normalized_y.T)) / xynorm
    self.kernel_function = theano.function([x, y, sigmas], [rbf_kernel],
                                           allow_input_downcast=True)

    # Evaluate k(expert, expert)
    if not (self.use_median_heuristic > 0):
        self.kernel_exex_total = self.kernel_function(self.expert_feat_B_Df,
                                                      self.expert_feat_B_Df,
                                                      self.kernel_bandwidth_params)
        self.kernel_exex_total = np.mean(self.kernel_exex_total)
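# The kernel block above is one ingredient of the squared MMD between
# policy features X and expert features Y:
#   MMD^2 = mean k(X, X) - 2 * mean k(X, Y) + mean k(Y, Y).
# A numpy-only sketch with assumed bandwidths (this is not the class's
# API, just the estimator it builds toward):
import numpy as np
def rbf_sum(X, Y, sigmas):
    # pairwise squared distances, then average the RBF over bandwidths
    d = ((X**2).sum(1)[:, None] + (Y**2).sum(1)[None, :]
         - 2.0 * X.dot(Y.T))
    return np.mean([np.exp(-s * d) for s in sigmas], axis=0)
X = np.random.randn(5, 3)   # policy features (assumed)
Y = np.random.randn(7, 3)   # expert features (assumed)
sig = [0.5, 1.0]
mmd2 = (rbf_sum(X, X, sig).mean()
        - 2.0 * rbf_sum(X, Y, sig).mean()
        + rbf_sum(Y, Y, sig).mean())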