def shanon_Entropy_studentt(self, log_cov, freedom):
    """Return the summed Shannon entropy of a set of Student-t distributions.

    Args:
        log_cov: 2-D symbolic tensor; its shape is unpacked as
            ``(Nrff, dout)``.  Presumably the log of the diagonal scale /
            covariance entries -- TODO confirm against the caller.
        freedom: degrees of freedom.  The ``log((freedom - 2) * pi)`` term
            is only finite for ``freedom > 2``.

    Returns:
        Symbolic scalar ``0.5 * sum(log_cov) + Nrff * const`` where ``const``
        is the part of the entropy that depends only on ``freedom`` and
        ``dout``.
    """
    Nrff, dout = log_cov.shape
    half_dim = dout / 2
    half_total = (freedom + dout) / 2

    # log(((nu - 2) * pi) ** (d / 2)) is evaluated as (d / 2) * log((nu - 2) * pi).
    # Both are mathematically equal, but the original form materialised the
    # power first, which overflows to inf (float32 in particular) as soon as
    # dout gets moderately large.
    const = (half_dim * T.log((freedom - 2) * np.pi)
             + T.gammaln(freedom / 2)
             - T.gammaln(half_total)
             + (T.psi(half_total) - T.psi(freedom / 2)) * (freedom + dout) / 2)

    return 0.5 * T.sum(log_cov) + Nrff * const
def entropy_pi(self):
    """Return the total entropy of the Beta factors parameterised by ``self.tau_IBP``.

    Column 0 and column 1 of ``self.tau_IBP`` are used as the two Beta
    shape parameters ``a`` and ``b`` of each row.  For every row the entropy

        ln B(a, b) - (a - 1) psi(a) - (b - 1) psi(b) + (a + b - 2) psi(a + b)

    is formed, and the result is summed over all rows.
    """
    shape_a = self.tau_IBP[:, 0]
    shape_b = self.tau_IBP[:, 1]
    shape_sum = shape_a + shape_b

    # ln B(a, b), summed over rows.
    log_beta_fn = T.sum(
        T.gammaln(shape_a) + T.gammaln(shape_b) - T.gammaln(shape_sum))

    # Digamma-weighted part of the Beta entropy, summed over rows.
    psi_part = T.sum(
        (1.0 - shape_a) * T.psi(shape_a)
        + (1.0 - shape_b) * T.psi(shape_b)
        + (shape_sum - 2.0) * T.psi(shape_sum))

    return log_beta_fn + psi_part
def create_gradientfunctions(self, x_train):
    """Build the symbolic model and compile the Theano functions that drive it.

    Args:
        x_train: the whole training set.  It is sliced symbolically inside
            ``givens``, so it is presumably a Theano shared variable --
            TODO confirm against the caller.

    Returns:
        A 7-tuple of compiled functions, in this order:
        ``update(batch, epoch)``  -- one Adam step on mini-batch ``batch``,
        returns the objective;
        ``likelihood(x)``         -- objective for ``x``;
        ``encode(x)``             -- sampled latent ``z``;
        ``decode(z)``             -- reconstruction from ``z``;
        ``encode_alpha(x)``, ``encode_beta(x)`` -- encoder outputs;
        ``eval_rmse(x)``          -- ``rmse_score`` of the reconstruction.
    """
    x = T.matrix("x")
    epoch = T.iscalar("epoch")

    alpha, beta = self.encoder(x)
    z = self.sampler(alpha, beta)
    reconstructed_x, logpxz = self.decoder(x, z)

    # Both prior Beta shape parameters are the inverse of the prior noise level.
    alpha_prior = 1.0 / self.prior_noise_level
    beta_prior = 1.0 / self.prior_noise_level

    # Closed-form Beta/Beta divergence term.  Despite its name, ``KLD``
    # carries a leading minus sign, so it is *added* to the reconstruction
    # log-likelihood below.
    # NOTE(review): the digammas are evaluated at the prior parameters, which
    # corresponds to KL(prior || posterior) rather than the usual
    # KL(posterior || prior) -- confirm this direction is intended.
    KLD = -(betaln(alpha, beta) - betaln(alpha_prior, beta_prior)
            + (alpha_prior - alpha) * T.psi(alpha_prior)
            + (beta_prior - beta) * T.psi(beta_prior)
            + (alpha - alpha_prior + beta - beta_prior)
            * T.psi(alpha_prior + beta_prior))

    # Average over the batch dimension.
    logpx = T.mean(logpxz + KLD)
    rmse_val = rmse_score(x, reconstructed_x)

    # T.grad expects a variable or a list of variables; a Python 3 dict view
    # is neither, so materialise the parameters explicitly.
    gradients = T.grad(logpx, list(self.params.values()))

    # Adam implemented as updates.
    updates = self.get_adam_updates(gradients, epoch)

    batch = T.iscalar('batch')
    givens = {
        x: x_train[batch * self.batch_size:(batch + 1) * self.batch_size, :]
    }

    # Compiled convenience functions.
    update = theano.function([batch, epoch], logpx, updates=updates,
                             givens=givens)
    likelihood = theano.function([x], logpx)
    eval_rmse = theano.function([x], rmse_val)
    encode = theano.function([x], z)
    decode = theano.function([z], reconstructed_x)
    encode_alpha = theano.function([x], alpha)
    encode_beta = theano.function([x], beta)

    return (update, likelihood, encode, decode, encode_alpha, encode_beta,
            eval_rmse)
def log_p_z_IBP(self):
    """Return the expected log-prior term of the sampled IBP indicators.

    Side effects: stores the digamma values of ``self.tau_IBP`` and their
    cumulative sums on ``self`` (``digams``, ``digams_1p2``,
    ``digams_1_cumsum``, ``digams_2_cumsum``, ``digams_1p2_cumsum``).  They
    are assigned before ``self.lower_lower()`` is called, so that method may
    read them.

    Returns:
        Symbolic scalar: the part weighted by ``self.z_IBP_samp`` plus the
        part weighted by ``1 - self.z_IBP_samp``, which uses
        ``self.lower_lower()``.
    """
    tau = self.tau_IBP

    # Digamma of each parameter and of the per-row sum of the two columns.
    self.digams = T.psi(tau)
    self.digams_1p2 = T.psi(tau[:, 0] + tau[:, 1])

    # Exclusive cumulative sum of column 0: prepend a zero, drop the last entry.
    zero_padded = T.concatenate((T.zeros(1), self.digams[:, 0]))
    self.digams_1_cumsum = T.extra_ops.cumsum(zero_padded)[0:-1]
    self.digams_2_cumsum = T.extra_ops.cumsum(self.digams[:, 1])
    self.digams_1p2_cumsum = T.extra_ops.cumsum(self.digams_1p2)

    active_weights = self.digams_2_cumsum - self.digams_1p2_cumsum
    tractable_part = T.sum(T.dot(self.z_IBP_samp.T, active_weights))

    inactive = 1 - self.z_IBP_samp
    intractable_part = T.sum(T.dot(inactive, self.lower_lower()))

    return tractable_part + intractable_part
def psi_taylor_approx_at_zero(x):
    """Approximate the digamma function psi(x) for small positive ``x``.

    Uses the expansion around zero, truncated after the cubic term:

        psi(x) ~ -1/x - gamma + zeta(2) x - zeta(3) x^2 + zeta(4) x^3

    with ``zeta(2) = pi^2 / 6``, ``-zeta(3) = polygamma(2, 1) / 2`` and
    ``zeta(4) = pi^4 / 90``.

    Args:
        x: anything that supports elementwise arithmetic (Python float,
            NumPy array or Theano expression).

    Returns:
        The approximation, of the same kind as ``x``.
    """
    # Euler-Mascheroni constant.  The original obtained it by compiling and
    # evaluating ``-T.psi(1)`` on every call; the NumPy constant is the same
    # number without the graph compilation.
    euler = np.euler_gamma  # 0.57721...
    polygamma_2_1 = -2.4041138063191  # polygamma(2, 1) = -2 * zeta(3)

    # The x**4 term (polygamma(4, 1) / 24, with polygamma(4, 1) =
    # -24.886266123440) is left out, as in the original implementation.
    psi = (-1. / x
           - euler
           + (np.pi**2 / 6.) * x
           + (polygamma_2_1 / 2.) * x**2
           + (np.pi**4 / 90.) * x**3)
    return psi
def kl_div_ng_ng_with_real_psi(p_alpha, p_beta, p_nu, p_mu, q_alpha, q_beta,
                               q_nu, q_mu):
    """Return an elementwise KL-divergence term between two distributions
    given by ``(alpha, beta, nu, mu)`` parameter sets, using the exact
    digamma function ``T.psi``.

    Judging by the name, these are normal-gamma distributions ("ng");
    which argument plays the role of the first KL argument is not evident
    from this block alone -- TODO confirm against the caller.

    Args:
        p_alpha, p_beta, p_nu, p_mu: parameters of the first distribution.
        q_alpha, q_beta, q_nu, q_mu: parameters of the second distribution.

    Returns:
        Symbolic expression with the broadcast shape of the inputs.
    """
    kl_dist = 1.0 / 2.0 * p_alpha / p_beta * (q_mu - p_mu)**2.0 * q_nu
    kl_dist = kl_dist + 1.0 / 2.0 * q_nu / p_nu
    kl_dist = kl_dist - 1.0 / 2.0 * T.log(q_nu / p_nu)

    # log(Gamma(p_alpha) / Gamma(q_alpha)) is computed in log space.  The
    # original evaluated both Gamma functions first, which overflows to inf
    # (and then yields nan/inf) for large shape parameters; for the positive
    # shape parameters used here the two forms are mathematically equal.
    log_gamma_ratio = T.gammaln(p_alpha) - T.gammaln(q_alpha)
    kl_dist = (kl_dist - 1.0 / 2.0
               + q_alpha * T.log(p_beta / q_beta)
               - log_gamma_ratio)

    kl_dist = (kl_dist
               + (p_alpha - q_alpha) * T.psi(p_alpha)
               - (p_beta - q_beta) * p_alpha / p_beta)
    return kl_dist
def kl_recog_prior(self, stt):
    """Return the elementwise KL term computed from the packed parameters ``stt``.

    The last axis of ``stt`` is split in half: the first half is used as
    ``a`` and the second half as ``b``.  The expression

        log(a b) - ((a - 1) / a) (gamma + psi(b) + 1 / b) - (b - 1) / b

    appears to be the closed-form KL between a Kumaraswamy(a, b) posterior
    and a uniform prior -- TODO confirm.

    Args:
        stt: 2-D tensor, or 3-D tensor that is flattened to 2-D with
            ``wild_reshape`` (keeping the last axis) and restored with
            ``recover_time`` using ``stt.shape[0]``.

    Returns:
        The KL term, one value per (row, latent dimension).
    """
    has_time_axis = stt.ndim == 3
    stt_flat = wild_reshape(stt, (-1, stt.shape[2])) if has_time_axis else stt

    # First half of the columns -> a, second half -> b.
    half = stt_flat.shape[1] // 2
    latent_a = stt_flat[:, :half]
    latent_b = stt_flat[:, half:]

    log_norm = T.log(latent_a * latent_b)
    a_term = ((latent_a - 1) / latent_a) * (
        self.euler_gamma + T.psi(latent_b) + 1 / latent_b)
    b_term = (latent_b - 1) / latent_b
    kl = log_norm - a_term - b_term

    if has_time_axis:
        kl = recover_time(kl, stt.shape[0])
    return kl
def calc_kl_divergence(posterior_a, posterior_b, alpha, beta):
    """Return the per-row KL term between a posterior with parameters
    ``(posterior_a, posterior_b)`` and a Beta(``alpha``, ``beta``) prior.

    The E[log(1 - v)] contribution is approximated by the first ten terms
    of its Taylor series, and the digamma function by a piecewise Taylor
    approximation (around zero for ``posterior_b < 0.53``, around infinity
    otherwise).

    Args:
        posterior_a: tensor of shape (batch_size, sequence_length).
        posterior_b: tensor of shape (batch_size, sequence_length).
        alpha: first shape parameter of the Beta prior.
        beta: second shape parameter of the Beta prior.

    Returns:
        The KL term summed over axis 1, i.e. one value per batch row.
    """
    a_times_b = posterior_a * posterior_b

    # Taylor expansion for the E[log(1 - v)] term.  A plain Python loop
    # unrolls the ten terms into the graph, so no Scan() is needed.
    kl = 1. / (1 + a_times_b) * Beta_fn(1. / posterior_a, posterior_b)
    for m in range(2, 11):
        kl += 1. / (m + a_times_b) * Beta_fn(float(m) / posterior_a,
                                              posterior_b)
    kl *= (beta - 1) * posterior_b

    # Euler-Mascheroni constant.  The original compiled and evaluated
    # ``-T.psi(1)`` on every call; the NumPy constant is the same number.
    euler = np.euler_gamma  # 0.57721...

    # Piecewise Taylor approximation of the digamma function.
    psi_b_taylor_approx = T.switch(
        posterior_b < 0.53,
        psi_taylor_approx_at_zero(posterior_b),
        psi_taylor_approx_at_infinity(posterior_b))
    kl += (posterior_a - alpha) / posterior_a * (
        -euler - psi_b_taylor_approx - 1 / posterior_b)

    # Normalisation constants.
    kl += T.log(posterior_a * posterior_b) + T.log(Beta_fn(alpha, beta))

    # Final term.
    kl += -(posterior_b - 1) / posterior_b

    return kl.sum(axis=1)
def main():
    """Run the variational E-step updates for ``phi`` and ``gamma`` until
    both change by less than ``EPS`` between two consecutive iterations.

    ``phi`` and ``gamma`` are Theano shared variables initialised with
    ``init_phi`` / ``init_gamma``; one call of the compiled ``e_step``
    overwrites both.  Progress is printed every 1000 iterations and the
    final ``phi`` is printed at the end.
    """
    print("hi")
    ALPHA = 0.01
    N = 100
    K = 10
    EPS = 0.01

    phi = theano.shared(init_phi(K, N), "phi")
    gamma = theano.shared(init_gamma(ALPHA, K, N), "gamma")
    beta_prime = tt.matrix("beta_prime")
    alpha = tt.vector("alpha")

    phi_update = (beta_prime *
                  tt.exp(tt.psi(gamma)).dimshuffle(0, "x")).T  # N x K
    updates = OrderedDict()
    # Normalise each row of phi so that it sums to one.
    new_phi = phi_update / tt.sum(phi_update, axis=1).dimshuffle(0, "x")
    updates[phi] = new_phi
    updates[gamma] = alpha + tt.sum(new_phi, axis=0)
    e_step = theano.function([beta_prime, alpha], [], updates=updates)

    alpha_value = np.repeat(ALPHA, K)
    beta_prime_value = np.arange(0.0, N * K, dtype="float32").reshape(K, N)

    phi_prev = None
    gamma_prev = None
    i = 0
    while True:
        i += 1
        if i % 1000 == 0:
            print(i)
        e_step(beta_prime=beta_prime_value, alpha=alpha_value)
        phi_cur = phi.get_value()
        gamma_cur = gamma.get_value()
        # Stop once both parameters moved less than EPS since the previous
        # iteration.  The original loop captured the "previous" values
        # *after* the update (difference always zero) and kept looping
        # while converged, so it never terminated.  The first iteration
        # only primes the previous values, as before.
        if (phi_prev is not None
                and (np.abs(phi_cur - phi_prev) < EPS).all()
                and (np.abs(gamma_cur - gamma_prev) < EPS).all()):
            break
        phi_prev = phi_cur
        gamma_prev = gamma_cur

    # The original printed an undefined name ``res`` here (NameError); the
    # iteration count is reported in its place.
    print("Result %s phi %s" % (i, phi.get_value()))
def buildExtGradFn(self):
    """Compile the Theano functions that return the external gradient terms.

    For both the theta and the phi parameters the term is

        psi(norm + EPS) - psi(norm + n + EPS)
        + psi(tilde + n_w + EPS) - psi(tilde + EPS)

    where the counts (``_nd`` / ``_ndz`` for theta, ``_nz`` / ``_nzw`` for
    phi) are cast to float32.

    Side effects: sets ``self.calcExternalGrad_phi``,
    ``self.calcExternalGrad_theta`` and ``self.calcExternalGrad`` (which
    returns ``[phi term, theta term]``).  All three take no inputs.
    """
    theta_norm, theta_tilde = self.__thetaNorm, self.__thetaTilde
    phi_norm, phi_tilde = self.__phiNorm, self.__phiTilde

    def _digamma_term(norm, tilde, norm_counts, tilde_counts):
        # One gradient term for a (norm, tilde) parameter pair and its counts.
        psi_norm = T.psi(norm + EPS)
        psi_norm_counts = T.psi(norm + T.cast(norm_counts, 'float32') + EPS)
        psi_tilde_counts = T.psi(tilde + T.cast(tilde_counts, 'float32') + EPS)
        psi_tilde = T.psi(tilde + EPS)
        return psi_norm - psi_norm_counts + psi_tilde_counts - psi_tilde

    gradTerm_theta = _digamma_term(theta_norm, theta_tilde,
                                   self._nd, self._ndz)
    gradTerm_phi = _digamma_term(phi_norm, phi_tilde, self._nz, self._nzw)

    self.calcExternalGrad_phi = theano.function(
        inputs=[], outputs=[gradTerm_phi])
    self.calcExternalGrad_theta = theano.function(
        inputs=[], outputs=[gradTerm_theta])
    self.calcExternalGrad = theano.function(
        inputs=[], outputs=[gradTerm_phi, gradTerm_theta])
def shanon_Entropy_studentt(self, log_cov, freedom):
    """Return the summed Shannon entropy of a set of Student-t distributions.

    Args:
        log_cov: 2-D symbolic tensor; its shape is unpacked as
            ``(Nrff, dout)``.  Presumably the log of the diagonal scale /
            covariance entries -- TODO confirm against the caller.
        freedom: degrees of freedom.  The ``log((freedom - 2) * pi)`` term
            is only finite for ``freedom > 2``.

    Returns:
        Symbolic scalar ``0.5 * sum(log_cov) + Nrff * const`` where ``const``
        collects the terms that depend only on ``freedom`` and ``dout``.
    """
    Nrff, dout = log_cov.shape
    half_dim = dout / 2
    half_total = (freedom + dout) / 2

    # (d / 2) * log((nu - 2) * pi) replaces log(((nu - 2) * pi) ** (d / 2)):
    # identical mathematically, but the power is never materialised, so it
    # cannot overflow to inf for larger output dimensions.
    const = (half_dim * T.log((freedom - 2) * np.pi)
             + T.gammaln(freedom / 2)
             - T.gammaln(half_total)
             + (T.psi(half_total) - T.psi(freedom / 2)) * (freedom + dout) / 2)

    return 0.5 * T.sum(log_cov) + Nrff * const
def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None):
    r"""
    Create a Dirichlet layer, according to the following paper:
    Malmir M, Sikka K, Forster D, Fasel I, Movellan JR, Cottrell GW.
    Deep Active Object Recognition by Joint Label and Action Prediction.
    arXiv preprint arXiv:1512.05484. 2015 Dec 17.

    Each unit in this layer encodes a Dirichlet distribution over its input.
    The input is assumed to be a belief vector,
    i.e. \sum_i input[i] = 1, 0 <= input_i <= 1 for all i

    :type layer_def: Element, xml element holding the layer configuration
                     (its ``numActions`` and ``numObjects`` children are read)
    :type inputs: a list of [belief_in, actions, objects, previous_belief]
    :param inputs[0], belief_in, is a theano.matrix which contains belief vectors in its columns
    :param inputs[1], actions, theano.ivector, list of actions for each column of belief_in
    :param inputs[2], objects, theano.ivector, list of objects for each column of belief_in
    :param inputs[3], previous_belief, theano.matrix, used to accumulate beliefs over time
    :type inputs_shape: list of sizes of inputs
    :type rs: a random number generator intended for weight initialisation
              (this constructor draws from ``np.random`` and does not use it)
    :param clone_from: optional layer whose ``alphas`` shared variable is
                       reused instead of creating a new one
    """
    # beliefs: dim x batch_sz, actions: 1 x batch_sz, objects: 1 x batch_sz,
    # accbeliefs: (numActs * numObjs) x batch_sz
    assert (len(inputs) == 4)
    beliefs, actions, objects, accbeliefs = inputs
    self.inputs = inputs

    dim = inputs_shape[0][0]
    assert (inputs_shape[0][1] == inputs_shape[1][1])  # batch_size
    assert (inputs_shape[0][1] == inputs_shape[2][1])  # batch_size
    assert (inputs_shape[0][1] == inputs_shape[3][1])  # batch_size
    assert (inputs_shape[1][0] == 1)  # action is a single integer
    assert (inputs_shape[2][0] == 1)  # object label is a single integer
    batch_size = inputs_shape[0][1]

    self.numActions = int(layer_def.find("numActions").text)
    self.numObjects = int(layer_def.find("numObjects").text)
    assert (self.numObjects * self.numActions == inputs_shape[3][0])
    assert (self.numObjects == dim)

    # Total number of Dirichlet units = numActions x numObjects.
    num_dirichlets = self.numObjects * self.numActions

    # Identity check: ``== None`` would dispatch to a user-defined __eq__.
    if clone_from is None:
        self.alphas = theano.shared(
            np.random.randint(5, 30, [dim, num_dirichlets]).astype(
                theano.config.floatX) / 25.,
            borrow=True)  # dim x num_dirichlets
    else:
        self.alphas = clone_from.alphas

    # Remove 0 from the input belief, then renormalise every column.
    normalized_beliefs = beliefs + 1.e-6
    normalized_beliefs = normalized_beliefs / T.reshape(
        T.sum(normalized_beliefs, axis=0), [1, batch_size])
    log_normed_beliefs = T.log(normalized_beliefs)  # dim x batch_size
    self.log_normed = log_normed_beliefs

    # Dirichlet log-probability pieces for the current normalised beliefs.
    self.term1 = T.reshape(T.gammaln(T.sum(self.alphas, axis=0)),
                           [num_dirichlets, 1])
    self.term2 = T.reshape(T.sum(T.gammaln(self.alphas), axis=0),
                           [num_dirichlets, 1])
    self.term3 = T.dot(T.transpose(self.alphas - 1.),
                       log_normed_beliefs)  # num_dirichlets x batch_size

    # Mask selecting, per column, the Dirichlet units of the taken action.
    dirichlet_actions = np.tile(
        np.arange(self.numActions).reshape([-1, 1]), [self.numObjects, 1])
    dirichlet_actions = np.tile(dirichlet_actions, [1, batch_size])
    dirichlet_actions = theano.shared(
        dirichlet_actions.astype(theano.config.floatX), borrow=True)
    in_actions = T.tile(T.reshape(actions, [1, batch_size]),
                        [num_dirichlets, 1])
    self.eq_actions = T.eq(dirichlet_actions, in_actions)

    # Log belief; it is normalised per column below.
    log_cur_belief = self.term1 - self.term2 + self.eq_actions * self.term3
    # Subtract the column maximum before exponentiating.
    log_cur_belief_normd = log_cur_belief - T.reshape(
        T.max(log_cur_belief, axis=0), [1, batch_size])
    cur_blf = self.eq_actions * T.exp(log_cur_belief_normd)
    self.current_belief = cur_blf / T.sum(cur_blf, axis=0)

    # Accumulate: zero entries of the accumulated belief are treated as 1
    # in the product for the units of the taken action.
    acc_is_zero = T.eq(accbeliefs, 0.)
    accbeliefs_no_0 = acc_is_zero + (1. - acc_is_zero) * accbeliefs
    updated_belief = (
        self.eq_actions * self.current_belief * accbeliefs_no_0
        + (1. - self.eq_actions) * accbeliefs)  # num_dirichlets x batch_size
    sum_up_blf = T.reshape(T.sum(updated_belief, axis=0), [1, batch_size])
    # NOTE(review): no guard against an all-zero column here; a guarded
    # variant existed only as commented-out code in the original.
    self.updated_belief = updated_belief / sum_up_blf
    self.output = self.updated_belief

    # Weight updates: mask of units matching both the action and the object.
    objects_idx = np.tile(
        np.arange(self.numObjects).reshape([-1, 1]),
        [1, self.numActions]).reshape([1, -1])
    objects_idx = np.tile(objects_idx.reshape([-1, 1]),
                          [1, batch_size])  # num_dirichlets x batch_size
    objects_idx = theano.shared(objects_idx.astype(theano.config.floatX),
                                borrow=True)
    in_objects = T.tile(T.reshape(objects, [1, batch_size]),
                        [num_dirichlets, 1])  # num_dirichlets x batch_size
    self.idx = self.eq_actions * T.eq(
        objects_idx, in_objects)  # num_dirichlets x batch_size
    self.idx = self.idx.astype(theano.config.floatX)
    # Number of batch columns assigned to each Dirichlet unit.
    self.N = T.reshape(T.sum(self.idx, axis=1), [1, num_dirichlets])

    # Sum of log beliefs per Dirichlet unit (zeros were removed above to
    # avoid nan in the log).
    term5 = T.dot(log_normed_beliefs,
                  T.transpose(self.idx))  # dim x num_dirichlets
    self.update = (
        self.N * T.reshape(T.psi(T.sum(self.alphas, axis=0)),
                           [1, num_dirichlets])
        - self.N * T.psi(self.alphas)
        + term5)

    # Log-probability of the data under the Dirichlet units.
    dir_l_p = (
        self.N * T.gammaln(T.sum(self.alphas, axis=0))
        - self.N * T.sum(T.gammaln(self.alphas), axis=0)
        + T.sum(term5 * (self.alphas - 1.), axis=0))
    self.log_p_ao = T.mean(dir_l_p)

    self.params = [self.alphas]
    self.inputs_shape = inputs_shape
    self.output_shape = [num_dirichlets, batch_size]
def log_p_v(self):
    """Return the expected log-prior of the stick variables ``v``.

    For every row of ``self.tau_IBP`` (Beta parameters ``tau1``, ``tau2``)
    the term

        log(alpha) + (alpha - 1) * (psi(tau1) - psi(tau1 + tau2))

    is formed with ``alpha = self.alpha_IBP`` and summed over all rows.
    ``psi(tau1) - psi(tau1 + tau2)`` is E[log v] under Beta(tau1, tau2).
    """
    tau_1 = self.tau_IBP[:, 0]
    tau_2 = self.tau_IBP[:, 1]

    # Both digammas must use the same variational parameters.  The original
    # read the first one from ``self.tau`` while every other term (and the
    # sibling methods) use ``self.tau_IBP``.
    expected_log_v = T.psi(tau_1) - T.psi(tau_1 + tau_2)

    term = T.sum(T.log(self.alpha_IBP)
                 + (self.alpha_IBP - 1) * expected_log_v)
    return term