Пример #1
0
    def shanon_Entropy_studentt(self, log_cov, freedom):
        """Return the summed Shannon entropy of Student-t distributions.

        Args:
            log_cov: 2-D array/tensor whose shape is unpacked as
                ``(Nrff, dout)``; all of its entries are summed.
                Presumably the log-variances of a diagonal covariance --
                confirm against the caller.
            freedom: degrees of freedom.  Must be greater than 2, otherwise
                the logarithm of ``(freedom - 2) * pi`` is undefined.

        Returns:
            ``0.5 * sum(log_cov) + Nrff * const`` where ``const`` is the part
            of the entropy that depends only on ``freedom`` and ``dout``.
        """
        Nrff, dout = log_cov.shape
        # p * log(b) instead of log(b ** p): mathematically identical for the
        # positive base required here, but b ** p overflows to inf once dout
        # gets large, which would poison the whole entropy.
        log_scale = (dout / 2) * T.log((freedom - 2) * np.pi)
        const = log_scale + T.gammaln(freedom / 2) - T.gammaln(
            (freedom + dout) / 2) + (T.psi((freedom + dout) / 2) -
                                     T.psi(freedom / 2)) * (freedom + dout) / 2

        return 0.5 * T.sum(log_cov) + Nrff * const
Пример #2
0
    def entropy_pi(self):
        """Return the summed entropy term built from ``self.tau_IBP``.

        Column 0 and column 1 of ``self.tau_IBP`` are used as the two shape
        parameters ``a`` and ``b``.  For every row the method adds

            gammaln(a) + gammaln(b) - gammaln(a + b)
            + (1 - a) * psi(a) + (1 - b) * psi(b) + (a + b - 2) * psi(a + b)

        (the closed-form entropy of a Beta(a, b) distribution) and returns
        the total over all rows.
        """
        a = self.tau_IBP[:, 0]
        b = self.tau_IBP[:, 1]
        a_plus_b = a + b

        # log of the Beta function B(a, b), row by row
        log_beta_fn = T.gammaln(a) + T.gammaln(b) - T.gammaln(a_plus_b)

        # digamma-weighted part of the entropy, row by row
        psi_terms = ((1.0 - a) * T.psi(a)
                     + (1.0 - b) * T.psi(b)
                     + (a_plus_b - 2.0) * T.psi(a_plus_b))

        return T.sum(log_beta_fn) + T.sum(psi_terms)
    def create_gradientfunctions(self, x_train):
        """Build the symbolic model and compile the Theano functions around it.

        Args:
            x_train: the whole dataset.  It is sliced by row into mini-batches
                of ``self.batch_size`` inside the compiled ``update`` function
                (presumably a Theano shared matrix -- confirm against caller).

        Returns:
            A tuple of compiled functions, in this order:

            * ``update(batch, epoch)`` -- evaluates the objective on
              mini-batch number ``batch`` of ``x_train`` and applies the
              updates returned by ``self.get_adam_updates``.
            * ``likelihood(x)`` -- evaluates the objective on ``x``.
            * ``encode(x)`` -- returns the sampled latent ``z``.
            * ``decode(z)`` -- returns the reconstruction for a given ``z``.
            * ``encode_alpha(x)`` / ``encode_beta(x)`` -- return the two
              encoder outputs.
            * ``eval_rmse(x)`` -- returns ``rmse_score(x, reconstruction)``.
        """
        x = T.matrix("x")

        epoch = T.iscalar("epoch")

        alpha, beta = self.encoder(x)
        z = self.sampler(alpha, beta)
        reconstructed_x, logpxz = self.decoder(x, z)

        # Both prior parameters are set to 1 / prior_noise_level.
        alpha_prior = 1.0 / self.prior_noise_level
        beta_prior = 1.0 / self.prior_noise_level

        # Regulariser: negated closed-form divergence between two Beta
        # distributions (assuming `betaln` is the log Beta function).
        # NOTE(review): the digamma terms are evaluated at the *prior*
        # parameters, i.e. this is -KL(prior || Beta(alpha, beta)), not
        # -KL(Beta(alpha, beta) || prior) -- confirm the direction is intended.
        # NOTE(review): unlike the earlier Gaussian variants of this term, no
        # sum over the latent axis is taken before the mean below -- confirm.
        KLD = -(betaln(alpha, beta) - betaln(alpha_prior, beta_prior)
                + (alpha_prior - alpha) * T.psi(alpha_prior)
                + (beta_prior - beta) * T.psi(beta_prior)
                + (alpha - alpha_prior + beta - beta_prior)
                * T.psi(alpha_prior + beta_prior))

        # Average over batch dimension
        logpx = T.mean(logpxz + KLD)

        rmse_val = rmse_score(x, reconstructed_x)

        # Compute all the gradients.  T.grad needs a real list for `wrt`;
        # on Python 3 dict.values() is only a view, so materialise it.
        gradients = T.grad(logpx, list(self.params.values()))

        # Adam implemented as updates
        updates = self.get_adam_updates(gradients, epoch)

        batch = T.iscalar('batch')

        # `x` is fed from the requested mini-batch slice of the dataset.
        givens = {
            x: x_train[batch * self.batch_size:(batch + 1) * self.batch_size, :]
        }

        # Define a bunch of functions for convenience
        update = theano.function([batch, epoch], logpx, updates=updates, givens=givens)
        likelihood = theano.function([x], logpx)
        eval_rmse = theano.function([x], rmse_val)
        encode = theano.function([x], z)
        decode = theano.function([z], reconstructed_x)
        encode_alpha = theano.function([x], alpha)
        encode_beta = theano.function([x], beta)

        return update, likelihood, encode, decode, encode_alpha, encode_beta, eval_rmse
Пример #4
0
    def log_p_z_IBP(self):
        """Return the log-probability term for ``self.z_IBP_samp``.

        As a side effect the digamma values of ``self.tau_IBP`` and their
        cumulative sums are stored on the instance (``digams``,
        ``digams_1p2``, ``digams_1_cumsum``, ``digams_2_cumsum``,
        ``digams_1p2_cumsum``).  They are assigned before
        ``self.lower_lower()`` is called; presumably that helper reads
        them -- confirm.

        Returns:
            Sum of a closed-form ("tractable") part and a part that relies on
            ``self.lower_lower()`` ("intractable" part).
        """
        psi_tau = T.psi(self.tau_IBP)
        psi_tau_total = T.psi(self.tau_IBP[:, 0] + self.tau_IBP[:, 1])

        self.digams = psi_tau
        self.digams_1p2 = psi_tau_total

        # Exclusive cumulative sum of the first column: a zero is prepended
        # and the last entry dropped, so entry k only covers rows before k.
        shifted_first_column = T.concatenate((T.zeros(1), psi_tau[:, 0]))
        self.digams_1_cumsum = T.extra_ops.cumsum(shifted_first_column)[0:-1]
        self.digams_2_cumsum = T.extra_ops.cumsum(psi_tau[:, 1])
        self.digams_1p2_cumsum = T.extra_ops.cumsum(psi_tau_total)

        active_weights = self.digams_2_cumsum - self.digams_1p2_cumsum
        tractable_part = T.sum(T.dot(self.z_IBP_samp.T, active_weights))

        inactive = 1 - self.z_IBP_samp
        intractable_part = T.sum(T.dot(inactive, self.lower_lower()))

        return tractable_part + intractable_part
Пример #5
0
def psi_taylor_approx_at_zero(x):
    """Approximate the digamma function psi(x) for small positive ``x``.

    Uses the series expansion around zero, truncated after the cubic term:

        psi(x) ~ -1/x - gamma + (pi^2 / 6) x + (polygamma(2, 1) / 2) x^2
                 + (pi^4 / 90) x^3

    The quartic term (polygamma(4, 1) / 24, with polygamma(4, 1) =
    -24.886266123440) is deliberately left out.

    Args:
        x: scalar, NumPy array or symbolic tensor; only arithmetic operators
            are applied to it.  Must be non-zero because of the ``1 / x``
            term.

    Returns:
        The approximation, with the same shape as ``x``.
    """
    # Euler-Mascheroni constant (== -psi(1)).  Taken from NumPy instead of
    # compiling and evaluating a Theano graph on every call.
    euler = np.euler_gamma  # 0.57721
    polygamma_2_1 = -2.4041138063191  # polygamma(2, 1)
    psi = (-1. / x - euler + (np.pi**2 / 6.) * x
           + (polygamma_2_1 / 2.) * x**2
           + (np.pi**4 / 90.) * x**3)
    return psi
Пример #6
0
def kl_div_ng_ng_with_real_psi(p_alpha, p_beta, p_nu, p_mu, q_alpha, q_beta,
                               q_nu, q_mu):
    """Return KL(p || q) between two Normal-Gamma style distributions.

    The expression is the closed form for a Normal-Gamma with mean ``mu``,
    precision scaling ``nu``, Gamma shape ``alpha`` and Gamma rate ``beta``
    (parameterisation inferred from the formula -- confirm against caller).
    The digamma function is evaluated exactly via ``T.psi``.

    Args:
        p_alpha, p_beta, p_nu, p_mu: parameters of the first distribution.
        q_alpha, q_beta, q_nu, q_mu: parameters of the second distribution.
            The shape parameters must be positive.

    Returns:
        Element-wise divergence, broadcast over the inputs.
    """
    kl_dist = 1.0 / 2.0 * p_alpha / p_beta * (q_mu - p_mu)**2.0 * q_nu
    kl_dist = kl_dist + 1.0 / 2.0 * q_nu / p_nu
    kl_dist = kl_dist - 1.0 / 2.0 * T.log(q_nu / p_nu)
    # log(Gamma(a) / Gamma(b)) is computed as gammaln(a) - gammaln(b):
    # Gamma itself overflows for moderately large shape parameters, which
    # would turn the ratio into inf/inf = nan.
    kl_dist = kl_dist - 1.0 / 2.0 + q_alpha * T.log(p_beta / q_beta) - (
        T.gammaln(p_alpha) - T.gammaln(q_alpha))
    kl_dist = kl_dist + (p_alpha - q_alpha) * T.psi(p_alpha) - (
        p_beta - q_beta) * p_alpha / p_beta
    return kl_dist
Пример #7
0
    def kl_recog_prior(self, stt):
        """Return the element-wise KL term for the recognition parameters.

        Args:
            stt: 2-D or 3-D tensor.  The last axis is split in the middle:
                the first half is used as ``a`` and the second half as ``b``.
                A 3-D input is flattened to 2-D with ``wild_reshape`` first
                and restored with ``recover_time`` at the end.

        Returns:
            ``log(a * b) - ((a - 1) / a) * (euler_gamma + psi(b) + 1 / b)
            - (b - 1) / b`` for every ``(a, b)`` pair.
        """
        is_sequence = stt.ndim == 3
        stt_flat = wild_reshape(stt, (-1, stt.shape[2])) if is_sequence else stt

        latent_a = stt_flat[:, :stt_flat.shape[1] // 2]
        latent_b = stt_flat[:, stt_flat.shape[1] // 2:]

        log_ab = T.log(latent_a * latent_b)
        a_term = ((latent_a - 1) / latent_a) * (
            self.euler_gamma + T.psi(latent_b) + 1 / latent_b)
        b_term = (latent_b - 1) / latent_b
        kl = log_ab - a_term - b_term

        if is_sequence:
            # Undo the flattening using the original leading dimension.
            kl = recover_time(kl, stt.shape[0])

        return kl
Пример #8
0
def calc_kl_divergence(posterior_a, posterior_b, alpha, beta):
    """Approximate a KL divergence from ``(posterior_a, posterior_b)`` to a
    Beta(``alpha``, ``beta``) prior.

    The infinite series in the closed form is truncated after ten terms and
    the digamma function of ``posterior_b`` is replaced by a piecewise series
    approximation (expansion at zero below 0.53, expansion at infinity
    otherwise).

    Args:
        posterior_a: tensor of shape (batch_size, sequence_length).
        posterior_b: tensor of shape (batch_size, sequence_length).
        alpha: first prior parameter.
        beta: second prior parameter.

    Returns:
        The divergence summed over axis 1.
    """
    # Taylor expansion for the E[log (1-v)] term.  The terms are unrolled
    # with a plain Python loop at graph-construction time, so no Scan() is
    # needed.
    num_series_terms = 10
    ab = posterior_a * posterior_b
    kl = 1. / (1 + ab) * Beta_fn(1. / posterior_a, posterior_b)
    for m in range(2, num_series_terms + 1):
        kl += 1. / (m + ab) * Beta_fn(float(m) / posterior_a, posterior_b)
    kl *= (beta - 1) * posterior_b

    # Euler-Mascheroni constant (== -psi(1)).  Hard-coded so that no Theano
    # graph has to be compiled and evaluated on every call.
    euler = 0.5772156649015329
    # Use another Taylor approximation for the digamma function, picking the
    # expansion that suits the magnitude of posterior_b.
    psi_b_taylor_approx = T.switch(posterior_b < 0.53,
                                   psi_taylor_approx_at_zero(posterior_b),
                                   psi_taylor_approx_at_infinity(posterior_b))
    kl += (posterior_a - alpha) / posterior_a * (-euler - psi_b_taylor_approx -
                                                 1 / posterior_b)

    # add normalization constants
    kl += T.log(posterior_a * posterior_b) + T.log(Beta_fn(alpha, beta))

    # final term
    kl += -(posterior_b - 1) / posterior_b
    return kl.sum(axis=1)
Пример #9
0
def main():
    """Iterate the phi/gamma update step until both stop changing.

    ``phi`` and ``gamma`` are Theano shared variables initialised by
    ``init_phi`` and ``init_gamma``.  One compiled step recomputes ``phi``
    from ``beta_prime`` and ``exp(psi(gamma))``, normalises it per row and
    then sets ``gamma = alpha + sum(phi, axis=0)``.  The step is repeated
    until two consecutive iterations differ by less than ``EPS`` in every
    entry of both variables; the final values are printed.
    """
    print("hi")
    ALPHA = 0.01
    N = 100
    K = 10
    EPS = 0.01

    phi = theano.shared(init_phi(K, N), "phi")
    gamma = theano.shared(init_gamma(ALPHA, K, N), "gamma")
    beta_prime = tt.matrix("beta_prime")
    alpha = tt.vector("alpha")

    phi_update = (beta_prime * tt.exp(tt.psi(gamma)).dimshuffle(0, "x")).T # N x K
    updates = OrderedDict()
    new_phi = phi_update / tt.sum(phi_update, axis=1).dimshuffle(0, "x")
    updates[phi] = new_phi

    updates[gamma] = alpha + tt.sum(new_phi, axis=0)

    e_step = theano.function([beta_prime, alpha], [], updates=updates)

    # TODO: the M-step is not implemented yet.
    alpha_value = np.repeat(ALPHA, K)
    beta_prime_value = np.arange(0.0, N * K, dtype="float32").reshape(K, N)

    # Values after the previous step; None until the first step has run, so
    # that only post-update values (which share a shape) are ever compared.
    phi_prev = None
    gamma_prev = None
    i = 0
    while True:
        i += 1
        if i % 1000 == 0:
            print(i)
        e_step(beta_prime=beta_prime_value, alpha=alpha_value)
        phi_now = phi.get_value()
        gamma_now = gamma.get_value()
        # Stop once a step changes neither variable by EPS or more.
        if (phi_prev is not None
                and (np.abs(phi_now - phi_prev) < EPS).all()
                and (np.abs(gamma_now - gamma_prev) < EPS).all()):
            break
        phi_prev = phi_now
        gamma_prev = gamma_now

    print("Result after %d iterations gamma %s phi %s"
          % (i, gamma.get_value(), phi.get_value()))
Пример #10
0
    def buildExtGradFn(self):
        """Compile the functions that evaluate the external gradient terms.

        For theta and for phi the same digamma expression is built:

            psi(norm + EPS) - psi(norm + total + EPS)
            + psi(tilde + part + EPS) - psi(tilde + EPS)

        where the count variables are cast to float32.  Three zero-argument
        Theano functions are stored on the instance:
        ``calcExternalGrad_phi``, ``calcExternalGrad_theta`` and
        ``calcExternalGrad`` (phi term first, then theta term).
        """
        tn, tt, pn, pt = self.__thetaNorm, self.__thetaTilde, self.__phiNorm, self.__phiTilde

        def digamma_term(norm, tilde, total_count, part_count):
            # Same four digamma values, combined in the same order, for
            # either parameter family.
            dg1 = T.psi(norm + EPS)
            dg2 = T.psi(norm + T.cast(total_count, 'float32') + EPS)
            dgW1 = T.psi(tilde + T.cast(part_count, 'float32') + EPS)
            dgW2 = T.psi(tilde + EPS)
            return dg1 - dg2 + dgW1 - dgW2

        gradTerm_theta = digamma_term(tn, tt, self._nd, self._ndz)
        gradTerm_phi = digamma_term(pn, pt, self._nz, self._nzw)

        self.calcExternalGrad_phi = theano.function(
            inputs=[], outputs=[gradTerm_phi])
        self.calcExternalGrad_theta = theano.function(
            inputs=[], outputs=[gradTerm_theta])
        self.calcExternalGrad = theano.function(
            inputs=[], outputs=[gradTerm_phi, gradTerm_theta])
Пример #11
0
 def shanon_Entropy_studentt(self, log_cov, freedom):
     """Return the summed Shannon entropy of Student-t distributions.

     Args:
         log_cov: 2-D array/tensor whose shape is unpacked as
             ``(Nrff, dout)``; all of its entries are summed.
             Presumably the log-variances of a diagonal covariance --
             confirm against the caller.
         freedom: degrees of freedom.  Must be greater than 2, otherwise
             the logarithm of ``(freedom - 2) * pi`` is undefined.

     Returns:
         ``0.5 * sum(log_cov) + Nrff * const`` where ``const`` is the part
         of the entropy that depends only on ``freedom`` and ``dout``.
     """
     Nrff, dout = log_cov.shape
     # p * log(b) instead of log(b ** p): mathematically identical for the
     # positive base required here, but b ** p overflows to inf once dout
     # gets large, which would poison the whole entropy.
     log_scale = (dout / 2) * T.log((freedom - 2) * np.pi)
     const = log_scale + T.gammaln(freedom / 2) - T.gammaln(
         (freedom + dout) / 2) + (T.psi((freedom + dout) / 2) -
                                  T.psi(freedom / 2)) * (freedom + dout) / 2

     return 0.5 * T.sum(log_cov) + Nrff * const
Пример #12
0
    def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None):
        """
            Create a Dirichlet layer, according to the following paper:
                Malmir M, Sikka K, Forster D, Fasel I, Movellan JR, Cottrell GW.
                Deep Active Object Recognition by Joint Label and Action Prediction.
                arXiv preprint arXiv:1512.05484. 2015 Dec 17.
            Each unit in this layer encodes a Dirichlet distribution over its input.
            The input is assumed to be a belief vector, i.e. sum_i input[i] = 1, 0 <= input_i <= 1 for all i

            :type layer_def: Element, xml containing the configuration of this layer;
                its ``numActions`` and ``numObjects`` children are read as integers

            :type inputs: a list of [belief_in, actions, objects, previous_belief]
            :param inputs[0], belief_in, is a theano.matrix which contains belief vectors in its columns
            :param inputs[1], actions, theano.ivector, list of actions for each column of belief_in
            :param inputs[2], objects, theano.ivector, list of objects for each column of belief_in
            :param inputs[3], previous_belief, theano.matrix, used to accumulate beliefs over time

            :type inputs_shape: list of sizes of inputs

            :type rs: a random number generator used to initialize weights
                (not referenced in this constructor; the initial alphas are
                drawn with ``np.random.randint``)

            :type clone_from: optional layer whose ``alphas`` shared variable
                is reused instead of creating a new one
        """
        # beliefs: dim x batch_sz, actions: 1 x batch_sz, objects: 1 x batch_sz,
        # accbeliefs: (numActs*numObjs) x batch_sz
        assert (len(inputs) == 4)
        beliefs, actions, objects, accbeliefs = inputs
        self.inputs = inputs  # beliefs, actions, objects
        dim = inputs_shape[0][0]
        assert (inputs_shape[0][1] == inputs_shape[1][1])  #batch_size
        assert (inputs_shape[0][1] == inputs_shape[2][1])  #batch_size
        assert (inputs_shape[0][1] == inputs_shape[3][1])  #batch_size
        assert (inputs_shape[1][0] == 1)  #action is a single integer
        assert (inputs_shape[2][0] == 1)  #object label is a single integer
        batch_size = inputs_shape[0][1]
        self.numActions = int(layer_def.find("numActions").text)
        self.numObjects = int(layer_def.find("numObjects").text)
        assert (self.numObjects * self.numActions == inputs_shape[3][0])
        assert (self.numObjects == dim)
        #total number of dirichlet units = numActions x numObjects
        num_dirichlets = self.numObjects * self.numActions
        # Identity check: `== None` would invoke a custom __eq__ if the
        # cloned layer defined one.
        if clone_from is None:
            # random initial concentrations in [5/25, 30/25)
            self.alphas = theano.shared(np.random.randint(
                5, 30, [dim, num_dirichlets]).astype(theano.config.floatX) /
                                        25.,
                                        borrow=True)  # dim x num_dirichlets
        else:
            # share the parameters with the layer being cloned
            self.alphas = clone_from.alphas
        #remove 0 from the input belief, then renormalize each column
        normalized_beliefs = beliefs + 1.e-6
        normalized_beliefs = normalized_beliefs / T.reshape(
            T.sum(normalized_beliefs, axis=0), [1, batch_size])
        log_normed_beliefs = T.log(normalized_beliefs)  # dim x batch_size
        self.log_normed = log_normed_beliefs

        #calculate Dirichlet log-probs for the current normalized beliefs
        self.term1 = T.reshape(T.gammaln(T.sum(self.alphas, axis=0)),
                               [num_dirichlets, 1])
        self.term2 = T.reshape(T.sum(T.gammaln(self.alphas), axis=0),
                               [num_dirichlets, 1])
        self.term3 = T.dot(T.transpose(self.alphas - 1.),
                           log_normed_beliefs)  # num_dirichlets x batch_size
        #find a mask based on the actions: unit k belongs to action k % numActions
        dirichlet_actions = np.tile(
            np.arange(self.numActions).reshape([-1, 1]), [self.numObjects, 1])
        dirichlet_actions = np.tile(dirichlet_actions, [1, batch_size])
        dirichlet_actions = theano.shared(dirichlet_actions.astype(
            theano.config.floatX),
                                          borrow=True)
        in_actions = T.tile(T.reshape(actions, [1, batch_size]),
                            [num_dirichlets, 1])
        self.eq_actions = T.eq(dirichlet_actions, in_actions)
        log_cur_belief = self.term1 - self.term2 + self.eq_actions * self.term3  #this should be normalized for each column
        # subtract the column-wise maximum before exponentiating
        log_cur_belief_normd = log_cur_belief - T.reshape(
            T.max(log_cur_belief, axis=0), [1, batch_size])
        cur_blf = self.eq_actions * T.exp(log_cur_belief_normd)
        self.current_belief = cur_blf / T.sum(cur_blf, axis=0)

        # accumulated beliefs that are exactly 0 are replaced by 1 so the
        # product below keeps the current belief for those entries
        acc_is_zero = T.eq(accbeliefs, 0.)
        accbeliefs_no_0 = acc_is_zero + (1. - acc_is_zero) * accbeliefs
        updated_belief = self.eq_actions * self.current_belief * accbeliefs_no_0 + (
            1. - self.eq_actions) * accbeliefs  # num_dirichlet x batch_size
        sum_up_blf = T.reshape(T.sum(updated_belief, axis=0), [1, batch_size])
        # NOTE(review): a column whose updated beliefs sum to 0 divides by
        # zero here; an earlier, disabled variant replaced such sums by 1.
        self.updated_belief = updated_belief / sum_up_blf
        self.output = self.updated_belief

        #calculating weight updates
        # unit k belongs to object k // numActions
        objects_idx = np.tile(
            np.arange(self.numObjects).reshape([-1, 1]),
            [1, self.numActions]).reshape([1, -1])
        objects_idx = np.tile(objects_idx.reshape([-1, 1]),
                              [1, batch_size])  # num_dirichlets x batch_size
        objects_idx = theano.shared(objects_idx.astype(theano.config.floatX),
                                    borrow=True)
        in_objects = T.tile(T.reshape(objects, [1, batch_size]),
                            [num_dirichlets, 1])  # num_dirichlets x batch_size
        # 1 where both the action and the object of a sample match the unit
        self.idx = self.eq_actions * T.eq(
            objects_idx, in_objects)  # num_dirichlets x batch_size
        self.idx = self.idx.astype(theano.config.floatX)
        # number of samples in the batch assigned to each unit
        self.N = T.reshape(T.sum(self.idx, axis=1), [1, num_dirichlets])
        # sum of log-beliefs of the samples assigned to each unit
        term5 = T.dot(log_normed_beliefs,
                      T.transpose(self.idx))  #dim x num_dirichlets
        self.update = self.N * T.reshape(T.psi(T.sum(self.alphas, axis=0)),
                                         [1, num_dirichlets]) - self.N * T.psi(
                                             self.alphas) + term5

        #calculate log-prob of data
        dir_l_p = self.N * T.gammaln(T.sum(
            self.alphas, axis=0)) - self.N * T.sum(
                T.gammaln(self.alphas), axis=0) + T.sum(
                    term5 * (self.alphas - 1.), axis=0)
        self.log_p_ao = T.mean(dir_l_p)

        self.params = [self.alphas]
        self.inputs_shape = inputs_shape
        self.output_shape = [num_dirichlets, batch_size]
Пример #13
0
    def log_p_v(self):
        """Return the summed ``log(alpha) + (alpha - 1) * E[log v]`` term.

        ``E[log v]`` is computed as ``psi(a) - psi(a + b)`` with ``a`` and
        ``b`` taken from column 0 and column 1 of ``self.tau_IBP``, and
        ``alpha`` is ``self.alpha_IBP``.

        NOTE(review): the first digamma previously read ``self.tau[:, 0]``
        while the second one read ``self.tau_IBP``; the expectation needs the
        same first shape parameter in both terms, so ``self.tau_IBP`` is used
        throughout -- confirm that ``self.tau`` was not intended.
        """
        tau_a = self.tau_IBP[:, 0]
        tau_b = self.tau_IBP[:, 1]
        expected_log_v = T.psi(tau_a) - T.psi(tau_a + tau_b)

        term = T.sum(T.log(self.alpha_IBP)
                     + (self.alpha_IBP - 1) * expected_log_v)

        return term