Exemplo n.º 1
0
 def estep():
     """
     E-step of the EM iteration for the enclosing mixture model.

     Operates on names from the enclosing scope: ``self`` (the mixture),
     ``dat`` (the data), ``K`` (number of components) and the
     pre-allocated arrays ``LP`` and ``T``, which are filled in place.

     After the call ``LP[k,:]`` holds the component log-likelihood plus
     the log mixing weight, and ``T[k,:]`` holds ``LP[k,:]`` normalised
     over the components (the responsibilities).

     :returns: sum over data points of the log-sum-exp of LP across components
     """
     # Clamp vanishing mixing weights so log(alpha) stays finite, then
     # renormalise the weights to sum to one.
     too_small = self.param['alpha'] < 1e-20
     if any(too_small):
         self.param['alpha'][too_small] = 1e-20
         self.param['alpha'] /= sum(self.param['alpha'])
     for k in xrange(K):
         LP[k,:] = self.param['P'][k].loglik(dat) + log(self.param['alpha'][k])
     # The normaliser is the same for every component and for the return
     # value, so it is computed once instead of K+1 times.
     lognorm = logsumexp(LP,axis=0)
     for k in xrange(K):
         T[k,:] = exp(LP[k,:] - lognorm)
     return sum(lognorm)
Exemplo n.º 2
0
def check_loglik_importance(dic):
    """
    Verify a log-likelihood implementation by importance sampling.

    dic['nsamples'] points are drawn from dic['proposal_high'], whose
    log-likelihood is taken to be correct. These samples are used to
    estimate the partition function of the distribution under test
    (dic['dist']). If the log-likelihood of dic['dist'] is correct, the
    estimate should be close to 1; an absolute deviation below
    dic['tolerance'] passes.

    Argument:
    :param dic: dictionary filled with (at least) proposal_high, nsamples, tolerance
    :type dic : dictionary
    :raises AssertionError: if the estimate is off by dic['tolerance'] or more

    """
    dic = fill_dict_with_defaults(dic)
    proposal, nsamples = dic['proposal_high'], dic['nsamples']
    draws = proposal.sample(nsamples)
    log_q = proposal.loglik(draws)
    log_p = dic['dist'].loglik(draws)
    # log of the mean importance weight p/q
    log_z = logsumexp(log_p - log_q) - np.log(nsamples)
    diff = np.abs(np.exp(log_z) - 1)
    assert diff < dic['tolerance'], (
        "Testing loglik failed for %s, difference is %g, which is bigger than %g, number of samples are: %d "
        % (dic['dist'].name, diff, dic['tolerance'], nsamples))
Exemplo n.º 3
0
def check_sample(dic):
    """
    Verify a sampling routine by importance sampling.

    dic['nsamples'] points are drawn from the distribution under test
    (dic['dist']). Using these samples and the log-likelihood of
    dic['dist'], the partition function of dic['proposal_low'] is
    estimated. If sampling and both log-likelihoods are correct the
    estimate should be 1; an absolute deviation below dic['tolerance']
    passes.

    Argument:
    :param dic: dictionary filled with (at least) proposal_low, nsamples, tolerance
    :type dic : dictionary
    :raises AssertionError: if the estimate is off by dic['tolerance'] or more

    """
    dic = fill_dict_with_defaults(dic)
    tested, nsamples = dic['dist'], dic['nsamples']
    draws = tested.sample(nsamples)
    log_p = dic['proposal_low'].loglik(draws)
    log_q = tested.loglik(draws)
    # log of the mean importance weight p/q
    log_z = logsumexp(log_p - log_q) - np.log(nsamples)
    diff = np.abs(np.exp(log_z) - 1)
    assert diff < dic['tolerance'], (
        "Testing sampling failed for %s, difference is %g, which is bigger than %g, number of smaples are: %d"
        % (dic['dist'].name, diff, dic['tolerance'], nsamples))
 def test_loglik(self):
     p1 = Distributions.TruncatedExponentialPower({'a':-1.0,'b':2.0,'p':1.0,'s':2.0})
     p2 = Distributions.TruncatedExponentialPower({'a':-1.0,'b':2.0,'p':1.5,'s':2.0})
     nsamples = 1000000
     data = p2.sample(nsamples)
     logZ = logsumexp(p1.loglik(data) -p2.loglik(data) - np.log(nsamples))
     print "Estimated partition function: ", np.exp(logZ)
     self.assertTrue(np.abs(np.exp(logZ)-1.0) < 0.1*self.TolParam,'Difference in estimated partition function (1.0) greater than' + str(0.1*self.TolParam))
 def test_loglik(self):
     nsamples = 1000000
     Gauss = Gaussian(n=1,mu=array([0]),sigma=array([[4]]))
     dat = Gauss.sample(nsamples)
     logWeights = self.mog.loglik(dat) - Gauss.loglik(dat)
     Z = logsumexp(logWeights)-log(nsamples)
     print "test_loglik: z: " ,exp(Z)
     self.assertTrue(abs(exp(Z)-1)<1e-01)
Exemplo n.º 6
0
 def test_loglik(self):
     nsamples = 1000000
     Gauss = Gaussian(n=1, mu=array([0]), sigma=array([[4]]))
     dat = Gauss.sample(nsamples)
     logWeights = self.mog.loglik(dat) - Gauss.loglik(dat)
     Z = logsumexp(logWeights) - log(nsamples)
     print "test_loglik: z: ", exp(Z)
     self.assertTrue(abs(exp(Z) - 1) < 1e-01)
Exemplo n.º 7
0
 def test_loglik(self):
     p1 = Distributions.Kumaraswamy({'a': 2.0, 'b': 3.0})
     p2 = Distributions.Kumaraswamy({'a': 1.0, 'b': 1.0})
     nsamples = 1000000
     data = p2.sample(nsamples)
     logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
     print "Estimated partition function: ", np.exp(logZ)
     self.assertTrue(
         np.abs(np.exp(logZ) - 1.0) < 0.1 * self.TolParam,
         'Difference in estimated partition function (1.0) greater than' +
         str(0.1 * self.TolParam))
Exemplo n.º 8
0
 def test_loglik(self):
     """
     Importance-sampling check of self.P.loglik.

     The proposal is a copy of self.q whose 's' parameter is doubled.
     The estimated partition function of self.P must lie within 0.1
     of 1.
     """
     nsamples = 100000
     proposal = self.q.copy()
     proposal['s'] = 2.0*self.q['s']
     draws = proposal.sample(nsamples)

     log_w = self.P.loglik(draws) - proposal.loglik(draws)
     # mean importance weight, evaluated in the log domain
     z_hat = exp(logsumexp(log_w) - log(nsamples))
     err = abs(z_hat - 1)
     self.assertTrue(err < 1e-01,
                     'Estimated partition function deviates from 1.0 by %.4g' % (err,))
Exemplo n.º 9
0
    def test_loglik(self):
        p1 = self.p
        p2 = self.p.copy()
        p2['mu'] *= 1.1

        nsamples = 1000000
        data = p2.sample(nsamples)
        logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
        print np.exp(logZ)
        print "Estimated partition function: ", np.exp(logZ)
        self.assertTrue(
            np.abs(np.exp(logZ) - 1.0) < 0.1 * self.TolParam,
            'Difference in estimated partition function (1.0) greater than' +
            str(0.1 * self.TolParam))
Exemplo n.º 10
0
    def test_loglik(self):
        p1 = Distributions.Gamma({'u': 2.0, 's': 3.0})
        p2 = Distributions.Gamma({'u': 1.0, 's': 1.0})
        nsamples = 1000000
        data = p2.sample(nsamples)
        logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
        print "Estimated partition function: ", np.exp(logZ)

        print "Testing log-likelihood of Gamma distribution ... "
        sys.stdout.flush()
        p = Distributions.Gamma({'u': 2.0, 's': 3.0})
        l = p.loglik(self.X)
        for k in range(len(self.LL)):
            self.assertFalse(np.abs(l[k] - self.LL[k]) > self.Tol,\
               'Difference in log-likelihood for Gamma greater than ' + str(self.Tol))
Exemplo n.º 11
0
    def loglik(self,dat):
        '''

        Computes the loglikelihood of the data points in dat.

        For each of the self.param['K'] components, the log mixing weight
        self.param['pi'][k] is added to the per-component log-likelihood.
        The components are then combined with a log-sum-exp.

        :param dat: Data points for which the loglikelihood will be computed.
        :type dat: natter.DataModule.Data
        :returns:  An array containing the loglikelihoods.
        :rtype:    numpy.array

        '''
        ncomp = self.param['K']
        weighted = zeros((ncomp, dat.size(1)))
        for k in range(ncomp):
            component_ll = squeeze(self.__kloglik(dat,k))
            weighted[k] = log(self.param['pi'][k]) + component_ll
        # marginalise over the component axis in the log domain
        return squeeze(logsumexp(weighted, axis=0))
Exemplo n.º 12
0
 def test_loglik(self):
     p1 = Distributions.Gamma({'u':2.0,'s':3.0})
     p2 = Distributions.Gamma({'u':1.0,'s':1.0})
     nsamples = 1000000
     data = p2.sample(nsamples)
     logZ = logsumexp(p1.loglik(data) -p2.loglik(data) - np.log(nsamples))
     print "Estimated partition function: ", np.exp(logZ)
     
     
     print "Testing log-likelihood of Gamma distribution ... "
     sys.stdout.flush()
     p = Distributions.Gamma({'u':2.0,'s':3.0})
     l = p.loglik(self.X)
     for k in range(len(self.LL)):
         self.assertFalse(np.abs(l[k] - self.LL[k]) > self.Tol,\
            'Difference in log-likelihood for Gamma greater than ' + str(self.Tol))
Exemplo n.º 13
0
    def test_loglik(self):
        """
        Check self.P.loglik by importance sampling.

        Samples are drawn from a copy of self.q with its 's' parameter
        doubled. The test passes when the estimated partition function
        of self.P deviates from 1 by less than 0.1.
        """
        nsamples = 100000
        wide_q = self.q.copy()
        wide_q['s'] = 2.0 * self.q['s']
        points = wide_q.sample(nsamples)

        log_weights = self.P.loglik(points) - wide_q.loglik(points)
        log_z = logsumexp(log_weights) - log(nsamples)
        err = abs(exp(log_z) - 1)
        failure_text = 'Estimated partition function deviates from 1.0 by %.4g' % (err, )
        self.assertTrue(err < 1e-01, failure_text)
Exemplo n.º 14
0
    def loglik(self, dat):
        '''

        Computes the loglikelihood of the data points in dat.

        Each row of an intermediate (K, dat.size(1)) array holds one
        component's log-likelihood plus its log mixing weight
        self.param['pi'][k]. The rows are combined by a log-sum-exp over
        the component axis.

        :param dat: Data points for which the loglikelihood will be computed.
        :type dat: natter.DataModule.Data
        :returns:  An array containing the loglikelihoods.
        :rtype:    numpy.array

        '''
        n_components = self.param['K']
        log_joint = zeros((n_components, dat.size(1)))
        for k in xrange(n_components):
            log_weight = log(self.param['pi'][k])
            log_joint[k] = log_weight + squeeze(self.__kloglik(dat, k))
        mixture_ll = logsumexp(log_joint, axis=0)
        return squeeze(mixture_ll)
Exemplo n.º 15
0
    def loglik(self,dat):
        '''

        Computes the loglikelihood of the data points in dat.

        Calls self._checkAlpha() first. Each column of an intermediate
        array then holds the log-likelihood of one component in
        self.param['P'] plus the log of its weight self.param['alpha'][k].
        The columns are combined by a log-sum-exp.

        :param dat: Data points for which the loglikelihood will be computed.
        :type dat: natter.DataModule.Data
        :returns:  An array containing the loglikelihoods.
        :rtype:    numpy.array

        '''
        self._checkAlpha()
        _, npoints = dat.size()
        components = self.param['P']
        log_joint = zeros((npoints, len(components)))
        for col, component in enumerate(components):
            log_weight = log(self.param['alpha'][col])
            log_joint[:, col] = component.loglik(dat) + log_weight
        # marginalise over the component axis in the log domain
        return logsumexp(log_joint, axis=1)
Exemplo n.º 16
0
    def mixturePosterior(self,dat):
        """
        Returns the posterior p(k|x) over the indicator variable for
        the mixture components given the data points in dat.

        Row k of the result is
        alpha(k)*p_k(x|theta) / (sum_j alpha(j)*p_j(x|theta)),
        evaluated in the log domain for numerical stability.

        :param dat: data points at which the posterior is computed; must
                    provide ``size()`` returning a pair whose second entry
                    is the number of data points, and be accepted by the
                    components' ``loglik``
        :type dat: presumably natter.DataModule.Data -- verify against callers
        :returns: posterior over the mixture components, shape (K, m)
        :rtype: numpy.ndarray
        """
        m = dat.size()[1]
        K = len(self.param['P'])

        # log( alpha(k) * p_k(x|theta) ) for every component and data point
        LP = zeros((K,m))
        for k,p in enumerate(self.param['P']):
            LP[k,:] = p.loglik(dat)  + log(self.param['alpha'][k])

        # The normaliser is shared by all components: compute it once and
        # broadcast it over the rows instead of recomputing it K times.
        return exp(LP - logsumexp(LP,axis=0))
Exemplo n.º 17
0
def check_sample(dic):
    """
    Checks a sample function via importance sampling.

    The distribution under test (dic['dist']) provides dic['nsamples']
    samples. With these samples and the log-likelihood of dic['dist'],
    the partition function of dic['proposal_low'] is estimated. When
    the sampler and both log-likelihoods are correct, the estimate
    should be 1. The check passes if the absolute error is below
    dic['tolerance'].

    Argument:
    :param dic: dictionary filled with (at least) proposal_low, nsamples, tolerance
    :type dic : dictionary
    :raises AssertionError: if the absolute error reaches dic['tolerance']

    """
    dic = fill_dict_with_defaults(dic)
    count = dic['nsamples']
    points = dic['dist'].sample(count)
    log_target = dic['proposal_low'].loglik(points)
    log_sampler = dic['dist'].loglik(points)
    log_weights = log_target - log_sampler
    # log-mean of the importance weights
    log_z = logsumexp(log_weights) - np.log(count)
    diff = np.abs(np.exp(log_z) - 1)
    assert diff < dic['tolerance'], \
        "Testing sampling failed for %s, difference is %g, which is bigger than %g, number of smaples are: %d" % (
            dic['dist'].name, diff, dic['tolerance'], dic['nsamples'])
Exemplo n.º 18
0
def check_loglik_importance(dic):
    """
    Checks a log-likelihood function via importance sampling.

    dic['proposal_high'] provides dic['nsamples'] samples, and its
    log-likelihood is assumed to be correct. The samples give an
    importance-sampling estimate of the partition function of the
    distribution under test (dic['dist']). That estimate should be
    close to 1 when the log-likelihood of dic['dist'] is correct. The
    check passes if the absolute error is below dic['tolerance'].

    Argument:
    :param dic: dictionary filled with (at least) proposal_high, nsamples, tolerance
    :type dic : dictionary
    :raises AssertionError: if the absolute error reaches dic['tolerance']

    """
    dic = fill_dict_with_defaults(dic)
    count = dic['nsamples']
    points = dic['proposal_high'].sample(count)
    log_sampler = dic['proposal_high'].loglik(points)
    log_target = dic['dist'].loglik(points)
    log_weights = log_target - log_sampler
    # log-mean of the importance weights
    log_z = logsumexp(log_weights) - np.log(count)
    diff = np.abs(np.exp(log_z) - 1)
    assert diff < dic['tolerance'], \
        "Testing loglik failed for %s, difference is %g, which is bigger than %g, number of samples are: %d " % (
            dic['dist'].name, diff, dic['tolerance'], dic['nsamples'])
Exemplo n.º 19
0
    def estimate(self, dat, errTol=1e-4, maxiter=1000):
        '''

        Estimates the parameters from the data in dat. It is possible to only selectively fit parameters of the distribution by setting the primary array accordingly (see :doc:`Tutorial on the Distributions module <tutorial_Distributions>`).

        The estimation method uses EM to fit the mixture distribution.
        
        :param dat: Data points on which the Mixture of Dirichlet distributions will be estimated.
        :type dat: natter.DataModule.Data
        :param errTol: Stopping criterion for the iteration
        :type errTol: float
        :param maxiter: maximal number of EM iterations
        :param maxiter: int
        
        '''
        if len(dat.X.shape) == 1:
            print "\tReshaping data to right shape"
            dat.X = reshape(dat.X, (1, dat.X.shape[0]))
        print "\tEstimating Mixture of Gaussians with EM ..."
        errTol = 1e-5

        K = self.param['K']
        mu = self.param['mu'].copy()
        s = self.param['s'].copy()
        m = dat.size(1)
        p = self.param['pi'].copy()
        X = dat.X

        H = zeros((K, m))
        ALLold = ALL = Inf

        nr = floor(m / K)
        for k in range(K):
            mu[k] = mean(X[0, k * nr:(k + 1) * nr + 1])

        for i in range(maxiter):
            ALLold = ALL
            sumH = zeros((1, m))
            for j in range(K):
                if p[j] < 1e-3:
                    p[j] = 1e-3
                if s[j] < 1e-3:
                    s[j] = 1e-3
            # E-Step
            # the next few lines have been transferred to the log-domain for numerical stability
            for k in range(K):
                H[k, :] = log(p[k]) + squeeze(-.5 * log(pi * 2.0) - log(s[k]) -
                                              (dat.X - mu[k])**2 /
                                              (2.0 * s[k]**2.0))

            sumH = logsumexp(H, 0)
            for k in range(K):
                H[k, :] = H[k, :] - sumH

            H = exp(H)  # leave log-domain here
            sumHk = sum(H, 1)

            if 'mu' in self.primary:
                mu = squeeze(dot(H, X.T)) / sumHk
                self.param['mu'] = mu
            if 'pi' in self.primary:
                p = squeeze(mean(H, 1))
                self.param['pi'] = p
            if 's' in self.primary:
                for k in range(K):
                    s[k] = sqrt(sum(H[k, :] * (X - mu[k])**2) / sumHk[k])
                self.param['s'] = s
            if i >= 2:
                ALL = self.all(dat)
                print "\r\t Mixture Of Gaussians ALL: %.8f [Bits]" % ALL,
                sys.stdout.flush()
                if abs(ALLold - ALL) < errTol:
                    break
        print "\t[EM finished]"
Exemplo n.º 20
0
    def estimate(self,dat, errTol=1e-4,maxiter=1000):
        '''

        Estimates the parameters from the data in dat. It is possible to only selectively fit parameters of the distribution by setting the primary array accordingly (see :doc:`Tutorial on the Distributions module <tutorial_Distributions>`).

        The estimation method uses EM to fit the mixture distribution.
        
        :param dat: Data points on which the Mixture of Dirichlet distributions will be estimated.
        :type dat: natter.DataModule.Data
        :param errTol: Stopping criterion for the iteration
        :type errTol: float
        :param maxiter: maximal number of EM iterations
        :param maxiter: int
        
        '''
        if len(dat.X.shape) == 1:
            print "\tReshaping data to right shape"
            dat.X = reshape(dat.X,(1,dat.X.shape[0]))
        print "\tEstimating Mixture of Gaussians with EM ..."
        errTol=1e-5




        K=self.param['K']
        mu = self.param['mu'].copy()
        s = self.param['s'].copy()
        m = dat.size(1)
        p = self.param['pi'].copy()
        X = dat.X
        
        H = zeros((K,m))
        ALLold = ALL = Inf

        nr = floor(m/K)
        for k in range(K):
            mu[k] = mean(X[0,k*nr:(k+1)*nr+1])

        for i in range(maxiter):
            ALLold = ALL
            sumH = zeros((1,m))
            for j in range(K):
                if p[j] < 1e-3:
                    p[j] = 1e-3
                if s[j] < 1e-3:
                    s[j] = 1e-3
            # E-Step
            # the next few lines have been transferred to the log-domain for numerical stability
            for k in range(K):
                H[k,:] = log(p[k]) + squeeze(-.5*log(pi*2.0) - log(s[k]) - (dat.X-mu[k])**2 / (2.0*s[k]**2.0)) 

            sumH = logsumexp(H,0)
            for k in range(K):
                H[k,:] = H[k,:] - sumH

            H = exp(H) # leave log-domain here
            sumHk = sum(H,1)


            if 'mu' in self.primary:
                mu = squeeze(dot(H,X.T))/sumHk
                self.param['mu'] = mu
            if 'pi' in self.primary:
                p = squeeze(mean(H,1))
                self.param['pi'] = p
            if 's' in self.primary:
                for k in range(K):
                    s[k] = sqrt(sum(H[k,:]*(X-mu[k])**2)/sumHk[k])
                self.param['s'] = s
            if i >= 2:
                ALL = self.all(dat)
                print "\r\t Mixture Of Gaussians ALL: %.8f [Bits]" % ALL,
                sys.stdout.flush()
                if abs(ALLold-ALL)<errTol:
                    break
        print "\t[EM finished]"
Exemplo n.º 21
0
 def test_sample(self):
     """
     Importance-sampling check of self.ECG.sample.

     Samples drawn from self.ECG are used to estimate the partition
     function of self.Gaussian; the estimate must lie within 0.1 of 1.
     """
     nsamples = 10000
     draws = self.ECG.sample(nsamples)
     log_w = self.Gaussian.loglik(draws) - self.ECG.loglik(draws)
     # mean importance weight, evaluated in the log domain
     z_hat = exp(logsumexp(log_w) - log(nsamples))
     self.assertTrue(abs(z_hat - 1) < 1e-01)
Exemplo n.º 22
0
 def test_loglik(self):
     """
     Importance-sampling check of self.ECG.loglik.

     Samples drawn from self.Gaussian are used to estimate the
     partition function of self.ECG; the estimate must lie within 0.1
     of 1.
     """
     nsamples = 100000
     draws = self.Gaussian.sample(nsamples)
     log_w = self.ECG.loglik(draws) - self.Gaussian.loglik(draws)
     # mean importance weight, evaluated in the log domain
     z_hat = exp(logsumexp(log_w) - log(nsamples))
     self.assertTrue(abs(z_hat - 1) < 1e-01)