Example #1
  def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
    ''' Calculate the sufficient statistics for global parameter updates
        Only adds stats relevant for this allocModel. 
        Other stats are added by the obsModel.
        
        Args
        -------
        Data : bnpy data object
        LP : local param dict with fields
              resp : Data.nObs x K array,
                       where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      used for memoized learning algorithms (moVB)

        Returns
        -------
        SS : SuffStats for K components, with field
              N : vector of length-K,
                   effective number of observations assigned to each comp
    '''
    Nvec = np.sum( LP['resp'], axis=0 )
    SS = SuffStatBag(K=Nvec.size, D=Data.dim)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy is not None:
      ElogqZ_vec = self.E_logqZ(LP)
      SS.setELBOTerm('ElogqZ', ElogqZ_vec, dims=('K'))
    return SS
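
A minimal sketch of the statistic this method computes, using plain NumPy and toy sizes (illustrative scaffolding, not bnpy code): N[k] is the expected number of items assigned to component k, obtained by summing the responsibility matrix over the observation axis.

import numpy as np

# Toy responsibilities: nObs=3 items, K=2 components; each row sums to 1.
resp = np.array([[0.9, 0.1],
                 [0.2, 0.8],
                 [0.5, 0.5]])
Nvec = resp.sum(axis=0)   # shape (K,), the 'N' field set above
# Total effective mass equals the number of observations.
assert np.isclose(Nvec.sum(), resp.shape[0])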
Example #2
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
        """ Calculate the sufficient statistics for global parameter updates
        Only adds stats relevant for this allocModel. 
        Other stats are added by the obsModel.
        
        Args
        -------
        Data : bnpy data object
        LP : local param dict with fields
              resp : Data.nObs x K array,
                       where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      used for memoized learning algorithms (moVB)

        Returns
        -------
        SS : SuffStats for K components, with field
              N : vector of length-K,
                   effective number of observations assigned to each comp
    """
        Nvec = np.sum(LP["resp"], axis=0)
        SS = SuffStatBag(K=Nvec.size, D=Data.dim)
        SS.setField("N", Nvec, dims=("K"))
        if doPrecompEntropy is not None:
            ElogqZ_vec = self.E_logqZ(LP)
            SS.setELBOTerm("ElogqZ", ElogqZ_vec, dims=("K"))
        return SS
Example #3
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    ''' Calculate summary statistics for given data slice and local params.

    Returns
    -------
    SS : SuffStatBag
    '''
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)

    resp = LP['resp']
    K = resp.shape[1]
    startLocIDs = Data.doc_range[:-1]
    StartStateCount = np.sum(resp[startLocIDs], axis=0)
    N = np.sum(resp, axis=0)

    if 'TransCount' in LP:
        TransStateCount = np.sum(LP['TransCount'], axis=0)
    else:
        respPair = LP['respPair']
        TransStateCount = np.sum(respPair, axis=0)

    SS = SuffStatBag(K=K, D=Data.dim, M=M)
    SS.setField('StartStateCount', StartStateCount, dims=('K'))
    SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
    SS.setField('N', N, dims=('K'))
    SS.setField('nDoc', Data.nDoc, dims=None)

    if doPrecompEntropy or 'Htable' in LP:
        # Compute entropy terms!
        # 'Htable', 'Hstart' will both be in Mdict
        Mdict = calcELBO_NonlinearTerms(Data=Data,
                                        LP=LP, returnMemoizedDict=1)
        SS.setELBOTerm('Htable', Mdict['Htable'], dims=('K', 'K'))
        SS.setELBOTerm('Hstart', Mdict['Hstart'], dims=('K'))

    if doPrecompMergeEntropy:
        subHstart, subHtable = HMMUtil.PrecompMergeEntropy_SpecificPairs(
            LP, Data, mPairIDs)
        SS.setMergeTerm('Hstart', subHstart, dims=('M'))
        SS.setMergeTerm('Htable', subHtable, dims=('M', 2, 'K'))
        SS.mPairIDs = np.asarray(mPairIDs)

    if trackDocUsage:
        # Track how often topic appears in a seq. with mass > thresh.
        DocUsage = np.zeros(K)
        for n in range(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            DocUsage += np.sum(LP['resp'][start:stop], axis=0) > 0.01
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
    return SS
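
A self-contained NumPy sketch (toy sizes, random responsibilities) of the three count statistics gathered above; doc_range marks sequence boundaries, so doc_range[:-1] indexes the first timestep of every sequence:

import numpy as np

K = 2
doc_range = np.array([0, 3, 5])   # two sequences covering timesteps [0:3) and [3:5)
rng = np.random.default_rng(0)
resp = rng.dirichlet(np.ones(K), size=5)                       # 5 timesteps total
respPair = rng.dirichlet(np.ones(K * K), size=5).reshape(5, K, K)

StartStateCount = resp[doc_range[:-1]].sum(axis=0)   # (K,)   first timesteps only
TransStateCount = respPair.sum(axis=0)               # (K, K) expected transition counts
N = resp.sum(axis=0)                                 # (K,)   per-state usage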
Example #4
    def get_global_suff_stats(self,
                              Data,
                              LP,
                              doPrecompEntropy=False,
                              doPrecompMergeEntropy=False,
                              mPairIDs=None):
        ''' Calculate the sufficient statistics for global parameter updates
        Only adds stats relevant for this allocModel. 
        Other stats are added by the obsModel.
        
        Args
        -------
        Data : bnpy data object
        LP : local param dict with fields
              resp : Data.nObs x K array,
                       where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      used for memoized learning algorithms (moVB)
        doPrecompMergeEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      for all possible merges of pairs of components
                      used for optional merge moves

        Returns
        -------
        SS : SuffStats for K components, with field
              N : vector of length-K,
                   effective number of observations assigned to each comp
        '''
        Nvec = np.sum(LP['resp'], axis=0)
        SS = SuffStatBag(K=Nvec.size, D=Data.dim)
        SS.setField('N', Nvec, dims=('K'))
        if doPrecompEntropy:
            ElogqZ_vec = self.E_logqZ(LP)
            SS.setELBOTerm('ElogqZ', ElogqZ_vec, dims=('K'))
        if doPrecompMergeEntropy:
            # Hmerge : KxK matrix of entropies for all possible pair-wise merges
            # for example, if we had only 3 components {0,1,2}
            # Hmerge = [ 0 H(0,1) H(0,2)
            #            0   0    H(1,2)
            #            0   0      0 ]
            #  where H(i,j) is entropy if components i and j merged.
            Hmerge = np.zeros((self.K, self.K))
            for jj in range(self.K):
                compIDs = np.arange(jj + 1, self.K)
                Rcombo = LP['resp'][:, jj][:, np.newaxis] + LP['resp'][:, compIDs]
                Hmerge[jj, compIDs] = np.sum(
                    Rcombo * np.log(Rcombo + EPS), axis=0)
            SS.setMergeTerm('ElogqZ', Hmerge, dims=('K', 'K'))
        return SS
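
A NumPy-only sketch of one cell of the Hmerge matrix above: merging components j and k pools their responsibilities, and the cached term is sum_n Rcombo[n] * log(Rcombo[n]), i.e. E[log q(z)] under the merged component (EPS is an assumed small constant, as in the code above):

import numpy as np

EPS = 1e-8
rng = np.random.default_rng(0)
resp = rng.dirichlet(np.ones(3), size=100)   # nObs=100, K=3
j, k = 0, 2
Rcombo = resp[:, j] + resp[:, k]             # pooled responsibility per item
H_jk = np.sum(Rcombo * np.log(Rcombo + EPS))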
Example #5
 def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
     ''' Calculate sufficient statistics.
         Admixture models have no suff stats for allocation   
     '''
     wv = LP['word_variational']
     _, K = wv.shape
     SS = SuffStatBag(K=K, D=Data.vocab_size)
     SS.setField('nDoc', Data.nDoc, dims=None)
     if doPrecompEntropy:
         SS.setELBOTerm('ElogpZ', self.E_log_pZ(Data, LP), dims='K')
         SS.setELBOTerm('ElogqZ', self.E_log_qZ(Data, LP), dims='K')
         SS.setELBOTerm('ElogpPi', self.E_log_pPI(Data, LP), dims=None)
         SS.setELBOTerm('ElogqPi', self.E_log_qPI(Data, LP), dims=None)
     return SS
Example #6
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
        ''' Calculate sufficient statistics.
        '''
        resp = LP['resp']
        _, K = resp.shape
        SS = SuffStatBag(K=K, D=Data.get_dim())
        SS.setField('nDoc', Data.nDoc, dims=None)
        SS.setField('sumLogVd', np.sum(LP['ElogV'], axis=0), dims='K')
        SS.setField('sumLog1mVd', np.sum(LP['Elog1mV'], axis=0), dims='K')

        if doPrecompEntropy:
            ElogqZ = self.E_logqZ(Data, LP)
            VZlocal = self.E_logpVZ_logqV(Data, LP)
            SS.setELBOTerm('ElogqZ', ElogqZ, dims='K')
            SS.setELBOTerm('VZlocal', VZlocal, dims=None)
        return SS
Example #7
  def get_global_suff_stats(self, Data, LP,
                             doPrecompEntropy=False, 
                             doPrecompMergeEntropy=False, mPairIDs=None):
    ''' Calculate the sufficient statistics for global parameter updates
        Only adds stats relevant for this allocModel. 
        Other stats are added by the obsModel.
        
        Args
        -------
        Data : bnpy data object
        LP : local param dict with fields
              resp : Data.nObs x K array,
                       where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      used for memoized learning algorithms (moVB)
        doPrecompMergeEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      for all possible merges of pairs of components
                      used for optional merge moves

        Returns
        -------
        SS : SuffStats for K components, with field
              N : vector of length-K,
                   effective number of observations assigned to each comp
    '''
    Nvec = np.sum(LP['resp'], axis=0)
    SS = SuffStatBag(K=Nvec.size, D=Data.dim)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
      ElogqZ_vec = self.E_logqZ(LP)
      SS.setELBOTerm('ElogqZ', ElogqZ_vec, dims=('K'))
    if doPrecompMergeEntropy:
      # Hmerge : KxK matrix of entropies for all possible pair-wise merges
      # for example, if we had only 3 components {0,1,2}
      # Hmerge = [ 0 H(0,1) H(0,2)
      #            0   0    H(1,2)
      #            0   0      0 ]      
      #  where H(i,j) is entropy if components i and j merged.
      Hmerge = np.zeros((self.K, self.K))
      for jj in range(self.K):
        compIDs = np.arange(jj+1, self.K)
        Rcombo = LP['resp'][:,jj][:,np.newaxis] + LP['resp'][:,compIDs]
        Hmerge[jj,compIDs] = np.sum(Rcombo*np.log(Rcombo+EPS), axis=0)
      SS.setMergeTerm('ElogqZ', Hmerge, dims=('K','K'))
    return SS
Example #8
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
        ''' Compute sufficient stats for provided dataset and local params

        Returns
        -------
        SS : SuffStatBag
            Updated fields
            * NodeStateCount : 2D array, nNodes x K
            * N : 2D array, size K x K
        '''
        K = LP['resp'].shape[-1]

        V = Data.nNodes
        SS = SuffStatBag(K=K, D=Data.dim, V=V)

        # NodeStateCount_src[i,k]
        #   Num edges assigned to topic k associated with node i as source
        srcResp = LP['resp'].sum(axis=2)
        NodeStateCount_src = Data.getSparseSrcNodeMat() * srcResp
        # Equivalent but slower: for loop
        # NodeStateCount_src = np.zeros((Data.nNodes, K))
        # for i in range(Data.nNodes):
        #     mask_i = Data.edges[:,0] == i
        #     NodeStateCount_src[i,:] = srcResp[mask_i].sum(axis=0)

        # NodeStateCount_rcv[i,k]
        #   Num edges assigned to topic k associated with node i as receiver
        rcvResp = LP['resp'].sum(axis=1)
        NodeStateCount_rcv = Data.getSparseRcvNodeMat() * rcvResp

        # Summing src counts and rcv counts gives the total
        SS.setField('NodeStateCount',
                    NodeStateCount_src + NodeStateCount_rcv,
                    dims=('V', 'K'))
        # Compute total atoms assigned to each cluster pair
        Nresp = np.sum(LP['resp'], axis=0)
        SS.setField('N', Nresp, dims=('K', 'K'))

        if doPrecompEntropy:
            # Remember, resp has shape nEdges x K x K
            # So, need to sum so we track scalar entropy, not K x K
            Hresp = calcLentropyAsScalar(LP)
            SS.setELBOTerm('Hresp', Hresp, dims=None)
        return SS
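
A sketch of the sparse-matrix trick used above, with a scipy.sparse incidence matrix standing in for Data.getSparseSrcNodeMat() (the bnpy accessor is assumed, so the result is checked against the explicit per-node loop from the comment):

import numpy as np
import scipy.sparse as sp

nNodes, K = 4, 3
edges = np.array([[0, 1], [0, 2], [3, 1]])   # (src, rcv) node pairs
nEdges = edges.shape[0]
rng = np.random.default_rng(0)
srcResp = rng.dirichlet(np.ones(K), size=nEdges)   # per-edge resp, summed over rcv axis

# Binary (nNodes x nEdges) matrix with a 1 where node i is the edge's source.
srcMat = sp.csr_matrix((np.ones(nEdges), (edges[:, 0], np.arange(nEdges))),
                       shape=(nNodes, nEdges))
NodeStateCount_src = srcMat @ srcResp   # (nNodes, K) in one product

# Same result via the explicit loop shown in the comment above.
ref = np.zeros((nNodes, K))
for i in range(nNodes):
    ref[i] = srcResp[edges[:, 0] == i].sum(axis=0)
assert np.allclose(NodeStateCount_src, ref)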
Example #9
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
        ''' Create sufficient stats needed for global param updates

        Args
        -------
        Data : bnpy data object
        LP : Dictionary containing the local parameters. Expected to contain:
            resp : Data.nObs x K array
            respPair : Data.nObs x K x K array (from the def. of respPair, note
                       respPair[0,:,:] is undefined)

        Returns
        -------
        SS : SuffStatBag with fields
            StartStateCount : A vector of length K with entry k being
                resp(z_{1k}) summed over the first timestep of each sequence,
                i.e. resp[Data.doc_range[:-1]].sum(axis=0)
            TransStateCount : A K x K matrix where TransStateCount[j,k] =
                sum_{n=2}^{Data.nObs} respPair(z_{n-1,j}, z_{n,k})
            N : A vector of length K with entry k being
                sum_{n=1}^{Data.nObs} resp(z_{nk})

            The first two of these are used by FiniteHMM.update_global_params,
            and the third is used by ObsModel.update_global_params.

        (see the documentation for information about resp and respPair)
        '''
        resp = LP['resp']
        respPair = LP['respPair']
        K = resp.shape[1]
        startLocIDs = Data.doc_range[:-1]

        StartStateCount = np.sum(resp[startLocIDs], axis=0)
        N = np.sum(resp, axis=0)
        TransStateCount = np.sum(respPair, axis=0)

        SS = SuffStatBag(K=K, D=Data.dim)
        SS.setField('StartStateCount', StartStateCount, dims=('K'))
        SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
        SS.setField('N', N, dims=('K'))

        if doPrecompEntropy is not None:
            entropy = self.elbo_entropy(Data, LP)
            SS.setELBOTerm('Elogqz', entropy, dims=None)
        return SS
Example #10
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False, doPrecompMergeEntropy=False, mPairIDs=None):
        """ Count expected number of times each topic is used across all docs    
    """
        wv = LP["word_variational"]
        _, K = wv.shape
        # Turn dim checking off, since some stats have dim K+1 instead of K
        SS = SuffStatBag(K=K, D=Data.vocab_size)
        SS.setField("nDoc", Data.nDoc, dims=None)
        sumLogPi = np.sum(LP["E_logPi"], axis=0)
        SS.setField("sumLogPiActive", sumLogPi[:K], dims="K")
        SS.setField("sumLogPiUnused", sumLogPi[-1], dims=None)

        if "DocTopicFrac" in LP:
            Nmajor = LP["DocTopicFrac"]
            Nmajor[Nmajor < 0.05] = 0
            SS.setField("Nmajor", np.sum(Nmajor, axis=0), dims="K")
        if doPrecompEntropy:
            # ---------------- Z terms
            SS.setELBOTerm("ElogpZ", self.E_logpZ(Data, LP), dims="K")
            # ---------------- Pi terms
            # Note: no terms needed for ElogpPI
            # SS already has field sumLogPi, which is sufficient for this term
            ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
            SS.setELBOTerm("ElogqPiConst", ElogqPiC, dims=None)
            SS.setELBOTerm("ElogqPiActive", ElogqPiA, dims="K")
            SS.setELBOTerm("ElogqPiUnused", ElogqPiU, dims=None)

        if doPrecompMergeEntropy:
            ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
            SS.setMergeTerm("ElogpZ", ElogpZMat, dims=("K", "K"))
            SS.setMergeTerm("ElogqPiActive", ElogqPiMat, dims=("K", "K"))
            SS.setMergeTerm("sumLogPiActive", sLgPiMat, dims=("K", "K"))
        return SS
Example #11
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
        ''' Compute sufficient stats for provided dataset and local params

        Returns
        -------
        SS : SuffStatBag with K components and fields
            * sumSource : nNodes x K
            * sumReceiver : nNodes x K
        '''
        V = Data.nNodes
        K = LP['resp'].shape[-1]
        SS = SuffStatBag(K=K, D=Data.dim, V=V)
        if 'NodeStateCount' not in LP:
            assert 'resp' in LP
            LP = self.initLPFromResp(Data, LP)
        SS.setField('NodeStateCount', LP['NodeStateCount'], dims=('V', 'K'))
        if np.allclose(LP['resp'].sum(axis=1).min(), 1.0):
            # If the LP fully represents all present edges,
            # then the NodeStateCount should as well.
            assert np.allclose(SS.NodeStateCount.sum(), Data.nEdges * 2)
        SS.setField('N', LP['N_fg'], dims=('K', ))
        SS.setField('scaleFactor', Data.nEdges, dims=None)

        if 'Ldata_bg' in LP:
            SS.setELBOTerm('Ldata_bg', LP['Ldata_bg'], dims=None)

        if doPrecompEntropy:
            Hresp_fg = LP['Lentropy_fg']  # = -1 * calcRlogR(LP['resp'])
            Hresp_bg = LP['Lentropy_bg']

            SS.setELBOTerm('Hresp', Hresp_fg, dims='K')
            SS.setELBOTerm('Hresp_bg', Hresp_bg, dims=None)

        return SS
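
A quick NumPy check of the invariant asserted above: when every edge's responsibility row sums to 1, each edge contributes one unit of mass at its source node and one at its receiver, so NodeStateCount summed over all nodes and states equals 2 * nEdges:

import numpy as np

nNodes, K = 4, 3
edges = np.array([[0, 1], [1, 2], [3, 0], [2, 3]])
rng = np.random.default_rng(0)
resp = rng.dirichlet(np.ones(K), size=len(edges))   # rows sum to 1

NodeStateCount = np.zeros((nNodes, K))
for e, (s, r) in enumerate(edges):
    NodeStateCount[s] += resp[e]   # counted once at the source...
    NodeStateCount[r] += resp[e]   # ...and once at the receiver
assert np.isclose(NodeStateCount.sum(), 2 * len(edges))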
Example #12
  def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False, 
                                              doPrecompMergeEntropy=False,
                                              mPairIDs=None):
    ''' Count expected number of times each topic is used across all docs    
    '''
    wv = LP['word_variational']
    _, K = wv.shape
    # Turn dim checking off, since some stats have dim K+1 instead of K
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)
    sumLogPi = np.sum(LP['E_logPi'], axis=0)
    SS.setField('sumLogPiActive', sumLogPi[:K], dims='K')
    SS.setField('sumLogPiUnused', sumLogPi[-1], dims=None)

    if 'DocTopicFrac' in LP:
      Nmajor = LP['DocTopicFrac']
      Nmajor[Nmajor < 0.05] = 0
      SS.setField('Nmajor', np.sum(Nmajor, axis=0), dims='K')
    if doPrecompEntropy:
      # ---------------- Z terms
      SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
      # ---------------- Pi terms
      # Note: no terms needed for ElogpPI
      # SS already has field sumLogPi, which is sufficient for this term
      ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
      SS.setELBOTerm('ElogqPiConst', ElogqPiC, dims=None)
      SS.setELBOTerm('ElogqPiActive', ElogqPiA, dims='K')
      SS.setELBOTerm('ElogqPiUnused', ElogqPiU, dims=None)

    if doPrecompMergeEntropy:
      ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
      SS.setMergeTerm('ElogpZ', ElogpZMat, dims=('K','K'))
      SS.setMergeTerm('ElogqPiActive', ElogqPiMat, dims=('K','K'))
      SS.setMergeTerm('sumLogPiActive', sLgPiMat, dims=('K','K'))
    return SS
Example #13
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False, 
                                              doPrecompMergeEntropy=False,
                                              mPairIDs=None):
        ''' Count expected number of times each topic is used across all docs    
        '''
        wv = LP['word_variational']
        _, K = wv.shape
        # Turn dim checking off, since some stats have dim K+1 instead of K
        SS = SuffStatBag(K=K, D=Data.vocab_size)
        SS.setField('nDoc', Data.nDoc, dims=None)
        sumLogPi = np.sum(LP['E_logPi'], axis=0)
        SS.setField('sumLogPiActive', sumLogPi[:K], dims='K')
        SS.setField('sumLogPiUnused', sumLogPi[-1], dims=None)

        if 'DocTopicFrac' in LP:
            Nmajor = LP['DocTopicFrac']
            Nmajor[Nmajor < 0.05] = 0
            SS.setField('Nmajor', np.sum(Nmajor, axis=0), dims='K')
        if doPrecompEntropy:
            # Z terms
            SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
            # Pi terms
            # Note: no terms needed for ElogpPI
            # SS already has field sumLogPi, which is sufficient for this term
            ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
            SS.setELBOTerm('ElogqPiConst', ElogqPiC, dims=None)
            SS.setELBOTerm('ElogqPiActive', ElogqPiA, dims='K')
            SS.setELBOTerm('ElogqPiUnused', ElogqPiU, dims=None)

        if doPrecompMergeEntropy:
            ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
            SS.setMergeTerm('ElogpZ', ElogpZMat, dims=('K','K'))
            SS.setMergeTerm('ElogqPiActive', ElogqPiMat, dims=('K','K'))
            SS.setMergeTerm('sumLogPiActive', sLgPiMat, dims=('K','K'))
        return SS
Example #14
  def get_global_suff_stats(self, Data, LP, doPrecompEntropy=False, 
                                              doPrecompMergeEntropy=False,
                                              mPairIDs=None):
    ''' Count expected number of times each topic is used across all docs    
    '''
    K = LP['DocTopicCount'].shape[1]
    SS = SuffStatBag(K=K, D=Data.vocab_size)
    SS.setField('nDoc', Data.nDoc, dims=None)
    sumLogPi = np.sum(LP['E_logPi'], axis=0)
    SS.setField('sumLogPiActive', sumLogPi[:K], dims='K')
    SS.setField('sumLogPiUnused', sumLogPi[-1], dims=None)

    if doPrecompEntropy:
      # ---------------- Z terms
      SS.setELBOTerm('ElogpZ', self.E_logpZ(Data, LP), dims='K')
      logFactData, logFactZ = self.E_logfactorialZ(Data, LP)
      SS.setELBOTerm('logFactData', logFactData, dims=None)
      SS.setELBOTerm('logFactZ', logFactZ, dims='K')

      # ---------------- Pi terms
      # Note: no terms needed for ElogpPI
      # SS already has field sumLogPi, which is sufficient for this term
      ElogqPiC, ElogqPiA, ElogqPiU = self.E_logqPi_Memoized_from_LP(LP)
      SS.setELBOTerm('ElogqPiConst', ElogqPiC, dims=None)
      SS.setELBOTerm('ElogqPiActive', ElogqPiA, dims='K')
      SS.setELBOTerm('ElogqPiUnused', ElogqPiU, dims=None)

    if doPrecompMergeEntropy:
      ElogpZMat, sLgPiMat, ElogqPiMat = self.memo_elbo_terms_for_merge(LP)
      SS.setMergeTerm('ElogpZ', ElogpZMat, dims=('K','K'))
      SS.setMergeTerm('ElogqPiActive', ElogqPiMat, dims=('K','K'))
      SS.setMergeTerm('sumLogPiActive', sLgPiMat, dims=('K','K'))

      SS.setMergeTerm('logFactZ', 
                     self.memo_factorial_term_for_merge(LP, mPairIDs),
                     dims=('K', 'K'))
    return SS
Example #15
def calcSummaryStats(Dslice,
                     LP=None,
                     alpha=None,
                     doPrecompEntropy=False,
                     cslice=(0, None),
                     **kwargs):
    """ Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)

    Returns
    -------
    SS : SuffStatBag with K components
        * nDoc : scalar float
            Counts total documents available in provided data.

        Also has optional ELBO field when precompELBO is True
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
    """
    K = LP['DocTopicCount'].shape[1]
    SS = SuffStatBag(K=K, D=Dslice.dim)

    if cslice[1] is None:
        SS.setField('nDoc', Dslice.nDoc, dims=None)
    else:
        SS.setField('nDoc', cslice[1] - cslice[0], dims=None)

    if doPrecompEntropy:
        assert 'theta' in LP
        Lalloc = L_alloc(Dslice, LP, alpha=alpha)
        SS.setELBOTerm('L_alloc', Lalloc, dims=None)

        if 'nnzPerRow' in LP and LP['nnzPerRow'] == 1:
            SS.setELBOTerm('Hvec', 0.0, dims=None)
        else:
            Hvec = L_entropy(Dslice, LP, returnVector=1)
            SS.setELBOTerm('Hvec', Hvec, dims='K')
    return SS
Example #16
def calcSummaryStats(Dslice,
                     LP=None,
                     alpha=None,
                     alphaEbeta=None,
                     doTrackTruncationGrowth=0,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mergePairSelection=None,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    """ Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)

    Returns
    -------
    SS : SuffStatBag with K components
        Relevant fields
        * nDoc : scalar float
            Counts total documents available in provided data.
        * sumLogPi : 1D array, size K
            Entry k equals \sum_{d in docs} E[ \log \pi_{dk} ]
        * sumLogPiRem : scalar float
            Equals sum over docs of probability of inactive topics.

        Also has optional ELBO field when precompELBO is True
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
    """
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)
    K = LP['DocTopicCount'].shape[1]
    if 'digammaSumTheta' not in LP:
        digammaSumTheta = digamma(LP['theta'].sum(axis=1) + LP['thetaRem'])
        LP['digammaSumTheta'] = digammaSumTheta  # Used for merges

    if 'ElogPi' not in LP:
        LP['ElogPiRem'] = digamma(LP['thetaRem']) - LP['digammaSumTheta']
        LP['ElogPi'] = digamma(LP['theta']) - \
            LP['digammaSumTheta'][:, np.newaxis]

    SS = SuffStatBag(K=K, D=Dslice.dim, M=M)
    SS.setField('nDoc', Dslice.nDoc, dims=None)
    SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
    if 'ElogPiEmptyComp' in LP:
        sumLogPiEmptyComp = np.sum(LP['ElogPiEmptyComp']) - \
            np.sum(LP['ElogPiOrigComp'])
        SS.setField('sumLogPiEmptyComp', sumLogPiEmptyComp, dims=None)
    if doTrackTruncationGrowth:
        remvec = np.zeros(K)
        remvec[K - 1] = np.sum(LP['ElogPiRem'])
        SS.setField('sumLogPiRemVec', remvec, dims='K')
    else:
        SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(Data=Dslice,
                                        LP=LP,
                                        returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))
        SS.setELBOTerm('slackTheta', Mdict['slackTheta'], dims='K')
        SS.setELBOTerm('gammalnTheta', Mdict['gammalnTheta'], dims='K')
        if 'ElogPiEmptyComp' in LP:
            SS.setELBOTerm('slackThetaEmptyComp', Mdict['slackThetaEmptyComp'])
            SS.setELBOTerm('gammalnThetaEmptyComp',
                           Mdict['gammalnThetaEmptyComp'])
            SS.setELBOTerm('HrespEmptyComp', Mdict['HrespEmptyComp'])

        else:
            SS.setELBOTerm('gammalnSumTheta',
                           Mdict['gammalnSumTheta'],
                           dims=None)
            SS.setELBOTerm('slackThetaRem', Mdict['slackThetaRem'], dims=None)
            SS.setELBOTerm('gammalnThetaRem',
                           Mdict['gammalnThetaRem'].sum(),
                           dims=None)

    if doPrecompMergeEntropy:
        if mPairIDs is None:
            raise NotImplementedError("TODO: all pairs for merges")
        m_Hresp = calcHrespForSpecificMergePairs(LP, Dslice, mPairIDs)
        if m_Hresp is not None:
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))

        m_sumLogPi = np.zeros(M)
        m_gammalnTheta = np.zeros(M)
        m_slackTheta = np.zeros(M)
        for m, (kA, kB) in enumerate(mPairIDs):
            theta_vec = LP['theta'][:, kA] + LP['theta'][:, kB]
            ElogPi_vec = digamma(theta_vec) - LP['digammaSumTheta']
            m_gammalnTheta[m] = np.sum(gammaln(theta_vec))
            m_sumLogPi[m] = np.sum(ElogPi_vec)
            # slack = (Ndm - theta_dm) * E[log pi_dm]
            slack_vec = ElogPi_vec
            slack_vec *= -1 * (alphaEbeta[kA] + alphaEbeta[kB])
            m_slackTheta[m] = np.sum(slack_vec)
        SS.setMergeTerm('gammalnTheta', m_gammalnTheta, dims=('M'))
        SS.setMergeTerm('sumLogPi', m_sumLogPi, dims=('M'))
        SS.setMergeTerm('slackTheta', m_slackTheta, dims=('M'))

        # Uncomment this for verification of merge calculations.
        # for (kA, kB) in mPairIDs:
        #      self.verifySSForMergePair(Data, SS, LP, kA, kB)
        # .... end merge computations

    # Selection terms (computes doc-topic correlation)
    if mergePairSelection is not None:
        if mergePairSelection.count('corr') > 0:
            Tmat = LP['DocTopicCount']
            SS.setSelectionTerm('DocTopicPairMat',
                                np.dot(Tmat.T, Tmat),
                                dims=('K', 'K'))
            SS.setSelectionTerm('DocTopicSum', np.sum(Tmat, axis=0), dims='K')

    if trackDocUsage:
        # Track num of times a topic appears nontrivially in a doc
        DocUsage = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
        Pi = LP['theta'] / LP['theta'].sum(axis=1)[:, np.newaxis]
        SumPi = np.sum(Pi, axis=0)
        SS.setSelectionTerm('SumPi', SumPi, dims='K')
    return SS
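
A SciPy sketch of the E[log pi] quantities cached into LP above: under a per-document Dirichlet posterior with parameters theta (plus thetaRem for the remaining, inactive mass), E[log pi_dk] = digamma(theta_dk) - digamma(sum of all theta_d):

import numpy as np
from scipy.special import digamma

nDoc, K = 2, 3
rng = np.random.default_rng(0)
theta = rng.gamma(1.0, 1.0, size=(nDoc, K)) + 0.1   # toy posterior parameters
thetaRem = 0.5

digammaSumTheta = digamma(theta.sum(axis=1) + thetaRem)
ElogPi = digamma(theta) - digammaSumTheta[:, np.newaxis]   # (nDoc, K)
ElogPiRem = digamma(thetaRem) - digammaSumTheta            # (nDoc,)

sumLogPi = ElogPi.sum(axis=0)   # the 'sumLogPi' field set above, shape (K,)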
Example #17
def calcSummaryStats(Data,
                     LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False,
                     mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    """ Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when precompELBO is True
        * ElogqZ : 1D array, size K
            Vector of entropy contributions from each comp.
            ElogqZ[k] = \sum_{n=1}^N resp[n,k] log resp[n,k]

        Also has optional Merge field when precompMergeELBO is True
        * ElogqZ : 2D array, size K x K
            Each term is scalar entropy of merge candidate
    """
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0
    if 'resp' in LP:
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]

    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(spR_csr=LP['spR'],
                                               nnzPerRow=LP['nnzPerRow'],
                                               mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))
    if trackDocUsage:
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')

    return SS
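
A sketch of the sparse branch above: when LP stores a CSR matrix 'spR' of hard (or top-L) responsibilities instead of a dense 'resp', the same N[k] statistic is just a column sum; np.asarray(...).ravel() plays the role of bnpy's toCArray/as1D helpers here:

import numpy as np
import scipy.sparse as sp

spR = sp.csr_matrix(np.array([[1.0, 0.0],
                              [0.0, 1.0],
                              [1.0, 0.0]]))   # nObs=3, K=2, one nonzero per row
Nvec = np.asarray(spR.sum(axis=0)).ravel()    # shape (K,)
assert np.allclose(Nvec, [2.0, 1.0])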
Example #18
    def get_global_suff_stats(self,
                              Data,
                              LP,
                              doPrecompEntropy=None,
                              doPrecompMergeEntropy=None,
                              mPairIDs=None,
                              trackDocUsage=0,
                              preselectroutine=None,
                              **kwargs):
        ''' Calculate sufficient statistics.
        '''
        resp = LP['resp']
        _, K = resp.shape
        SS = SuffStatBag(K=K, D=Data.get_dim())
        SS.setField('nDoc', Data.nDoc, dims=None)
        SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
        SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

        if doPrecompEntropy:
            ElogqZ = self.E_logqZ(Data, LP)
            SS.setELBOTerm('ElogqZ', ElogqZ, dims='K')

            slack_NmT, slack_NmT_Rem = self.slack_NminusTheta(LP)
            SS.setELBOTerm('slackNminusTheta', slack_NmT, dims='K')
            SS.setELBOTerm('slackNminusTheta_Rem', slack_NmT_Rem, dims=None)

            glnSumTheta, glnTheta, glnThetaRem = self.c_Dir_theta__parts(LP)
            SS.setELBOTerm('gammalnSumTheta', glnSumTheta, dims=None)
            SS.setELBOTerm('gammalnTheta', glnTheta, dims='K')
            SS.setELBOTerm('gammalnTheta_Rem', glnThetaRem, dims=None)

        ## Merge Term caching
        if doPrecompMergeEntropy:
            if mPairIDs is None:
                raise NotImplementedError("TODO: all pairs for merges")

            ElogqZMat = self.calcElogqZForMergePairs(LP['resp'], Data,
                                                     mPairIDs)
            SS.setMergeTerm('ElogqZ', ElogqZMat, dims=('K', 'K'))

            alphaEbeta = self.alpha_E_beta()

            sumLogPi = np.zeros((SS.K, SS.K))
            gammalnTheta = np.zeros((SS.K, SS.K))
            slack_NmT = np.zeros((SS.K, SS.K))
            for (kA, kB) in mPairIDs:
                theta_vec = LP['theta'][:, kA] + LP['theta'][:, kB]
                ElogPi_vec = digamma(theta_vec) - LP['digammaSumTheta']
                gammalnTheta[kA, kB] = np.sum(gammaln(theta_vec))
                sumLogPi[kA, kB] = np.sum(ElogPi_vec)
                ElogPi_vec *= alphaEbeta[kA] + alphaEbeta[kB]
                slack_NmT[kA, kB] = -1 * np.sum(ElogPi_vec)
            SS.setMergeTerm('gammalnTheta', gammalnTheta, dims=('K', 'K'))
            SS.setMergeTerm('sumLogPi', sumLogPi, dims=('K', 'K'))
            SS.setMergeTerm('slackNminusTheta', slack_NmT, dims=('K', 'K'))

            #for (kA, kB) in mPairIDs:
            #  self.verifySSForMergePair(Data, SS, LP, kA, kB)

        ## Selection terms (computes doc-topic correlation)
        if preselectroutine is not None:
            if preselectroutine.count('corr') > 0:
                Tmat = LP['DocTopicCount']
                SS.setSelectionTerm('DocTopicPairMat',
                                    np.dot(Tmat.T, Tmat),
                                    dims=('K', 'K'))
                SS.setSelectionTerm('DocTopicSum',
                                    np.sum(Tmat, axis=0),
                                    dims='K')

        if trackDocUsage:
            ## Track the number of times a topic appears with "significant mass" in a document
            DocUsage = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
            SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
        return SS