Example #1
0
def calcHrespForSpecificMergePairs(LP, Data, mPairIDs):
    ''' Compute the resp-entropy ELBO term for each candidate merge pair.

    Handles both dense local params (LP['resp']) and sparse ones
    (LP['spR'] + LP['nnzPerRow']). When the dataset carries per-token
    word counts and the resp rows align with them, each row is weighted
    by its word count.

    Returns
    ---------
    Hresp : 1D array, size M, one entry per pair in mPairIDs,
        or None for sparse hard assignments (nnzPerRow == 1),
        where the entropy is exactly zero.
    '''
    assert mPairIDs is not None
    if 'resp' in LP:
        # Dense responsibilities. Weight rows by word counts when aligned.
        nRows = LP['resp'].shape[0]
        hasWordWeights = (
            hasattr(Data, 'word_count') and nRows == Data.word_count.size)
        if hasWordWeights:
            m_Hresp = -1 * calcRlogRdotv_specificpairs(
                LP['resp'], Data.word_count, mPairIDs)
        else:
            m_Hresp = -1 * calcRlogR_specificpairs(LP['resp'], mPairIDs)
    else:
        # Sparse responsibilities. Hard assignments carry zero entropy.
        if LP['nnzPerRow'] == 1:
            return None
        nRows = LP['spR'].shape[0]
        hasWordWeights = (
            hasattr(Data, 'word_count') and nRows == Data.word_count.size)
        if hasWordWeights:
            m_Hresp = calcSparseMergeRlogRdotv(
                spR_csr=LP['spR'],
                nnzPerRow=LP['nnzPerRow'],
                v=Data.word_count,
                mPairIDs=mPairIDs)
        else:
            m_Hresp = calcSparseMergeRlogR(
                spR_csr=LP['spR'],
                nnzPerRow=LP['nnzPerRow'],
                mPairIDs=mPairIDs)
    assert m_Hresp.size == len(mPairIDs)
    return m_Hresp
Example #2
0
def calcSummaryStats(Data,
                     LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False,
                     mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    """ Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
        (or, for sparse assignments, fields spR and nnzPerRow)
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.
    mPairIDs : list of candidate merge pairs, or None
    trackDocUsage : boolean flag
        if True, also record per-component usage counts.

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when doPrecompEntropy is True
        * Hresp : scalar or 1D array, size K
            Entropy contribution(s) of the resp assignments.

        Also has optional Merge field when doPrecompMergeEntropy is True
        * Hresp : 1D array, size M
            Each term is scalar entropy of one merge candidate.
    """
    # Number of candidate merge pairs tracked by the stats bag.
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0
    if 'resp' in LP:
        # Dense responsibilities: expected counts are column sums.
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case: spR is a scipy CSR matrix.
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]

    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        # Bag-of-words data exposes vocab_size instead of dim.
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        # isinstance (not exact-type check) so float subclasses such as
        # numpy.float64 are also treated as the scalar hard-assignment case.
        if isinstance(Mdict['Hresp'], float):
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            # Hard sparse assignments (nnzPerRow == 1) have zero entropy,
            # so no merge term is stored in that case.
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(spR_csr=LP['spR'],
                                               nnzPerRow=LP['nnzPerRow'],
                                               mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))
    if trackDocUsage:
        # NOTE(review): assumes dense LP['resp'] is present; in the
        # sparse-only case this raises KeyError — confirm callers never
        # pass trackDocUsage=True with sparse local params.
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')

    return SS