def makeExpansionSSFromZ_DPMixtureModel(
        Dslice=None, curModel=None, curLPslice=None, **kwargs):
    ''' Create expanded sufficient stats from Z assignments on target subset.

    Returns
    -------
    xSSslice : SuffStatBag
        Accounts for all data atoms in Dslice assigned to ktarget.
    Info : dict
    '''
    xLPslice = makeExpansionLPFromZ_DPMixtureModel(
        Dslice=Dslice,
        curModel=curModel,
        curLPslice=curLPslice,
        **kwargs)
    xSSslice = curModel.get_global_suff_stats(
        Dslice, xLPslice,
        doPrecompEntropy=1,
        trackDocUsage=1,
        doTrackTruncationGrowth=1)
    if 'resp' in curLPslice:
        HrespOrigComp = -1 * NumericUtil.calcRlogR(
            curLPslice['resp'][:, kwargs['ktarget']])
    else:
        target_resp = curLPslice['spR'][:, kwargs['ktarget']].toarray()
        np.maximum(target_resp, 1e-100, out=target_resp)
        HrespOrigComp = -1 * NumericUtil.calcRlogR(target_resp)[0]
    xSSslice.setELBOTerm('HrespEmptyComp', -1 * HrespOrigComp, dims=None)
    xSSslice.setUIDs(kwargs['xInitSS'].uids.copy())

    Info = dict()
    Info['xLPslice'] = xLPslice
    return xSSslice, Info
def calcLocalParams(Data, LP, Elogbeta=None, nnzPerRowLP=None, **kwargs):
    ''' Compute local parameters for each data item.

    Parameters
    -------
    Data : bnpy.data.DataObj subclass
    LP : dict
        Local parameters as key-value string/array pairs
        * E_log_soft_ev : 2D array, N x K
            E_log_soft_ev[n,k] = log p(data obs n | comp k)

    Returns
    -------
    LP : dict
        Local parameters, with updated fields
        * resp : 2D array, size N x K
            Posterior responsibility each comp has for each item
            resp[n, k] = p(z[n] = k | x[n])
    '''
    lpr = LP['E_log_soft_ev']
    lpr += Elogbeta
    K = LP['E_log_soft_ev'].shape[1]
    if nnzPerRowLP and (nnzPerRowLP > 0 and nnzPerRowLP < K):
        # SPARSE Assignments
        LP['spR'] = sparsifyLogResp(lpr, nnzPerRow=nnzPerRowLP)
        assert np.all(np.isfinite(LP['spR'].data))
        LP['nnzPerRow'] = nnzPerRowLP
    else:
        # DENSE Assignments
        # Calculate exp in numerically stable manner (first subtract the max)
        # perform this in-place so no new allocations occur
        NumericUtil.inplaceExpAndNormalizeRows(lpr)
        LP['resp'] = lpr
    return LP
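
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): the dense vs. sparse branches
# above boil down to a row-wise softmax of log probabilities, optionally
# truncated to the top-L entries per row. NumericUtil.inplaceExpAndNormalizeRows
# and sparsifyLogResp are assumed to behave roughly like this NumPy version.
import numpy as np
import scipy.sparse


def _demo_softmax_rows(logR):
    # Subtract each row's max before exponentiating, for numerical stability
    logR = logR - logR.max(axis=1, keepdims=True)
    R = np.exp(logR)
    R /= R.sum(axis=1, keepdims=True)
    return R


def _demo_sparsify_top_L(logR, L):
    # Keep only the L largest responsibilities per row, renormalized,
    # stored as a CSR matrix (mimicking the role of LP['spR']).
    N, K = logR.shape
    colids = np.argsort(-logR, axis=1)[:, :L]   # top-L comps per row
    rows = np.repeat(np.arange(N), L)
    vals = _demo_softmax_rows(
        np.take_along_axis(logR, colids, axis=1)).ravel()
    return scipy.sparse.csr_matrix(
        (vals, (rows, colids.ravel())), shape=(N, K))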
def calc_local_params(self, Data, LP, **kwargs):
    ''' Calculate local parameters for each data item and each component.

    This is part of the E-step.

    Args
    -------
    Data : bnpy data object with Data.nObs observations
    LP : local param dict with fields
        E_log_soft_ev : Data.nObs x K array
            E_log_soft_ev[n,k] = log p(data obs n | comp k)

    Returns
    -------
    LP : local param dict with fields
        resp : Data.nObs x K array whose rows sum to one
            resp[n,k] = posterior responsibility that comp. k has for data n
    '''
    lpr = LP['E_log_soft_ev']
    lpr += self.Elogw
    # Calculate exp in numerically stable manner (first subtract the max)
    # perform this in-place so no new allocations occur
    NumericUtil.inplaceExpAndNormalizeRows(lpr)
    LP['resp'] = lpr
    assert np.allclose(lpr.sum(axis=1), 1)
    return LP
def E_logqZ(self, Data, LP):
    ''' Calculate E[ log q(z) ] for each active topic.

    Returns
    -------
    ElogqZ : 1D array, size K
    '''
    if hasattr(Data, 'word_count'):
        return NumericUtil.calcRlogRdotv(LP['resp'], Data.word_count)
    else:
        return NumericUtil.calcRlogR(LP['resp'])
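
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): calcRlogR and calcRlogRdotv are
# assumed to compute the column sums of R * log(R), optionally weighted by
# a per-row vector v (e.g. word counts). A plain-NumPy equivalent:
import numpy as np


def _demo_calcRlogR(R, v=None):
    # Returns 1D array of size K: sum_n v[n] * R[n,k] * log(R[n,k]).
    # The entropy of the soft assignments is -1 times this quantity.
    RlogR = R * np.log(np.maximum(R, 1e-100))  # guard against log(0)
    if v is not None:
        return np.dot(v, RlogR)
    return RlogR.sum(axis=0)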
def calc_local_params(self, Data, LP, nnzPerRowLP=0, **kwargs):
    ''' Compute local parameters for each data item and component.

    Parameters
    -------
    Data : bnpy.data.DataObj subclass
    LP : dict
        Local parameters as key-value string/array pairs
        * E_log_soft_ev : 2D array, N x K
            E_log_soft_ev[n,k] = log p(data obs n | comp k)

    Returns
    -------
    LP : dict
        Local parameters, with updated fields
        * resp : 2D array, size N x K
            Posterior responsibility each comp has for each item
            resp[n, k] = p(z[n] = k | x[n])
    '''
    lpr = LP['E_log_soft_ev']
    K = lpr.shape[1]
    if self.inferType.count('EM') > 0:
        # Using point estimates, for EM algorithm
        lpr += np.log(self.w + 1e-100)
        if nnzPerRowLP and (nnzPerRowLP > 0 and nnzPerRowLP < K):
            # SPARSE Assignments
            LP['nnzPerRow'] = nnzPerRowLP
            LP['spR'] = sparsifyLogResp(lpr, nnzPerRow=nnzPerRowLP)
            assert np.all(np.isfinite(LP['spR'].data))
        else:
            lprPerItem = logsumexp(lpr, axis=1)
            lpr -= lprPerItem[:, np.newaxis]
            np.exp(lpr, out=lpr)
            LP['resp'] = lpr
            LP['evidence'] = lprPerItem.sum()
    else:
        # Full Bayesian approach, for VB or GS algorithms
        lpr += self.Elogw
        if nnzPerRowLP and (nnzPerRowLP > 0 and nnzPerRowLP < K):
            # SPARSE Assignments
            LP['nnzPerRow'] = nnzPerRowLP
            LP['spR'] = sparsifyLogResp(lpr, nnzPerRow=nnzPerRowLP)
            assert np.all(np.isfinite(LP['spR'].data))
        else:
            # DENSE Assignments
            # Calculate exp in numerically safe way,
            # in-place so no new allocations occur
            NumericUtil.inplaceExpAndNormalizeRows(lpr)
            LP['resp'] = lpr
            assert np.allclose(lpr.sum(axis=1), 1)
    return LP
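
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): in the EM branch above, the
# per-item log evidence is the log-sum-exp of each row, and the
# responsibilities are the exponentiated, shifted rows. A minimal NumPy
# version of that normalization:
import numpy as np
from scipy.special import logsumexp


def _demo_normalize_with_evidence(lpr):
    lprPerItem = logsumexp(lpr, axis=1)  # per-item log normalizer
    resp = np.exp(lpr - lprPerItem[:, np.newaxis])
    evidence = lprPerItem.sum()          # sum of per-item log evidence
    return resp, evidence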
def L_entropy(Data=None, LP=None, resp=None, returnVector=0):
    """ Calculate entropy of soft assignments term in ELBO objective.

    Returns
    -------
    L_entropy : scalar float
    """
    spR = None
    if LP is not None:
        if 'resp' in LP:
            resp = LP['resp']
        elif 'spR' in LP:
            spR = LP['spR']
            N, K = LP['spR'].shape
        else:
            raise ValueError("LP dict missing resp or spR")
    if resp is not None:
        N, K = resp.shape
    if hasattr(Data, 'word_count') and N == Data.word_count.size:
        if resp is not None:
            Hvec = -1 * NumericUtil.calcRlogRdotv(resp, Data.word_count)
        elif spR is not None:
            Hvec = calcSparseRlogRdotv(v=Data.word_count, **LP)
        else:
            raise ValueError("Missing resp assignments!")
    else:
        if resp is not None:
            Hvec = -1 * NumericUtil.calcRlogR(resp)
        elif 'spR' in LP:
            assert 'nnzPerRow' in LP
            Hvec = calcSparseRlogR(**LP)
        else:
            raise ValueError("Missing resp assignments!")
    assert Hvec.size == K
    assert Hvec.min() >= -1e-6
    if returnVector:
        return Hvec
    return Hvec.sum()
def calcHrespFromLP(LP=None, resp=None):
    ''' Calculate entropy of soft assignments from local params.

    Returns
    -------
    Hresp : 1D array, size K,
        or scalar 0.0 when assignments are sparse and hard (nnzPerRow == 1)
    '''
    if LP is not None and 'spR' in LP:
        nnzPerRow = LP['nnzPerRow']
        if nnzPerRow > 1:
            # Handles multiply by -1 already
            Hresp = calcSparseRlogR(**LP)
            assert np.all(np.isfinite(Hresp))
        else:
            Hresp = 0.0
    else:
        if LP is not None and 'resp' in LP:
            resp = LP['resp']
        Hresp = -1 * NumericUtil.calcRlogR(resp)
    return Hresp
def calcMergeTermsFromSeparateLP(
        self, Data=None,
        LPa=None, SSa=None,
        LPb=None, SSb=None,
        mUIDPairs=None):
    ''' Compute merge terms for case of expansion LP proposals.

    Returns
    -------
    Mdict : dict, with fields
        * Hresp
    '''
    M = len(mUIDPairs)
    m_Hresp = np.zeros(M)
    for m, (uidA, uidB) in enumerate(mUIDPairs):
        kA = SSa.uid2k(uidA)
        kB = SSb.uid2k(uidB)
        respAB = LPa['resp'][:, kA] + LPb['resp'][:, kB]
        m_Hresp[m] = -1 * NumericUtil.calcRlogR(respAB)
    assert m_Hresp.min() > -1e-9
    return dict(Hresp=m_Hresp)
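
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): merging two components means
# adding their responsibility columns, so the merged entropy term is
# -sum_n r[n] log r[n] evaluated on the summed column. A direct NumPy check:
import numpy as np


def _demo_merge_entropy(respA_col, respB_col):
    r = respA_col + respB_col
    r = np.maximum(r, 1e-100)  # guard against log(0)
    return -np.sum(r * np.log(r))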
def restrictedLocalStep_HDPTopicModel(
        Dslice=None,
        curLPslice=None,
        ktarget=0,
        xObsModel=None,
        xalphaPi=None,
        thetaEmptyComp=0.0,
        xInitLPslice=None,
        b_localStepSingleDoc='fast',
        **kwargs):
    '''
    Returns
    -------
    xLPslice : dict with updated fields
        Fields with learned values
        * resp : N x Kfresh
        * DocTopicCount : nDoc x Kfresh
        * theta : nDoc x Kfresh
        * ElogPi : nDoc x Kfresh

        Fields copied directly from curLPslice
        * digammaSumTheta : 1D array, size nDoc
        * thetaRem : scalar
        * ElogPiRem : scalar
        * thetaEmptyComp
        * ElogPiEmptyComp
    '''
    Kfresh = xObsModel.K
    assert Kfresh == xalphaPi.size
    # Compute conditional likelihoods for every data atom
    xLPslice = xObsModel.calc_local_params(Dslice)
    assert 'E_log_soft_ev' in xLPslice
    # Initialize DocTopicCount and theta
    xLPslice['resp'] = xLPslice['E_log_soft_ev']
    xLPslice['DocTopicCount'] = np.zeros((Dslice.nDoc, Kfresh))
    xLPslice['theta'] = np.zeros((Dslice.nDoc, Kfresh))
    xLPslice['_nIters'] = -1 * np.ones(Dslice.nDoc)
    xLPslice['_maxDiff'] = -1 * np.ones(Dslice.nDoc)
    if b_localStepSingleDoc == 'fast':
        restrictedLocalStepForSingleDoc_Func = \
            restrictedLocalStepForSingleDoc_HDPTopicModel
    else:
        print('SLOW<<<!!')
        restrictedLocalStepForSingleDoc_Func = \
            restrictedLocalStepForSingleDoc_HDPTopicModel_SlowerButStable
    # Fill in these fields, one doc at a time
    for d in range(Dslice.nDoc):
        xLPslice = restrictedLocalStepForSingleDoc_Func(
            d=d,
            Dslice=Dslice,
            curLPslice=curLPslice,
            xLPslice=xLPslice,
            xInitLPslice=xInitLPslice,
            ktarget=ktarget,
            Kfresh=Kfresh,
            xalphaPi=xalphaPi,
            obsModelName=xObsModel.__class__.__name__,
            **kwargs)
    # Compute other LP quantities related to log prob (topic | doc)
    # and fill these into the expanded LP dict
    digammaSumTheta = curLPslice['digammaSumTheta'].copy()
    xLPslice['digammaSumTheta'] = digammaSumTheta
    xLPslice['ElogPi'] = \
        digamma(xLPslice['theta']) - digammaSumTheta[:, np.newaxis]
    xLPslice['thetaRem'] = curLPslice['thetaRem'].copy()
    xLPslice['ElogPiRem'] = curLPslice['ElogPiRem'].copy()
    # Compute quantities related to leaving ktarget almost empty,
    # as we expand and transfer mass to other comps
    if thetaEmptyComp > 0:
        ElogPiEmptyComp = digamma(thetaEmptyComp) - digammaSumTheta
        xLPslice['thetaEmptyComp'] = thetaEmptyComp
        xLPslice['ElogPiEmptyComp'] = ElogPiEmptyComp

        # Compute quantities related to OrigComp, the original target cluster.
        # These need to be tracked and turned into relevant summaries
        # so that they can be used to create a valid proposal state "propSS"
        xLPslice['ElogPiOrigComp'] = curLPslice['ElogPi'][:, ktarget]
        xLPslice['gammalnThetaOrigComp'] = \
            np.sum(gammaln(curLPslice['theta'][:, ktarget]))
        slack = curLPslice['DocTopicCount'][:, ktarget] - \
            curLPslice['theta'][:, ktarget]
        xLPslice['slackThetaOrigComp'] = np.sum(
            slack * curLPslice['ElogPi'][:, ktarget])
        if hasattr(Dslice, 'word_count') and \
                xLPslice['resp'].shape[0] == Dslice.word_count.size:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogRdotv(
                curLPslice['resp'][:, ktarget], Dslice.word_count)
        else:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogR(
                curLPslice['resp'][:, ktarget])
    return xLPslice
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False,
                     mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    """ Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when doPrecompEntropy is True
        * ElogqZ : 1D array, size K
            Vector of entropy contributions from each comp.
            ElogqZ[k] = \sum_{n=1}^N resp[n,k] log resp[n,k]

        Also has optional Merge field when doPrecompMergeEntropy is True
        * ElogqZ : 2D array, size K x K
            Each term is scalar entropy of merge candidate
    """
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0
    if 'resp' in LP:
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]
    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K',))
    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(
                    spR_csr=LP['spR'],
                    nnzPerRow=LP['nnzPerRow'],
                    mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))
    if trackDocUsage:
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')
    return SS
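
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): the core sufficient statistic of
# a mixture model is the expected count per component, i.e. the column sums
# of resp (dense) or spR (sparse), as computed at the top of calcSummaryStats:
import numpy as np
import scipy.sparse


def _demo_count_suff_stats(resp_or_spR):
    if scipy.sparse.issparse(resp_or_spR):
        # .sum(axis=0) returns a 1 x K matrix; flatten to a 1D array
        return np.asarray(resp_or_spR.sum(axis=0)).ravel()
    return resp_or_spR.sum(axis=0)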
def restrictedLocalStep_HDPTopicModel(
        Dslice=None,
        curLPslice=None,
        ktarget=0,
        kabsorbList=None,
        xObsModel=None,
        xalphaPi=None,
        xInitSS=None,
        nUpdateSteps=3,
        doBuildOnInit=False,
        convThr=0.5,
        thetaEmptyComp=0.0,
        **kwargs):
    ''' Compute local parameters for HDPTopicModel via restricted local step.

    Returns
    -------
    xLPslice : dict with updated fields
        Fields with learned values
        * resp : N x Kfresh
        * DocTopicCount : nDoc x Kfresh
        * theta : nDoc x Kfresh
        * ElogPi : nDoc x Kfresh

        Fields copied directly from curLPslice
        * digammaSumTheta : 1D array, size nDoc
        * thetaRem : scalar
        * ElogPiRem : scalar
    '''
    if doBuildOnInit:
        xWholeSS = xInitSS.copy()
    Kfresh = xObsModel.K
    assert Kfresh == xalphaPi.size

    xLPslice = dict()
    # Default warm_start initialization for DocTopicCount
    # by copying the previous counts at all absorbing states
    if kabsorbList is None:
        xLPslice['DocTopicCount'] = np.zeros((Dslice.nDoc, Kfresh))
        xLPslice['resp'] = np.zeros(
            (curLPslice['resp'].shape[0], Kfresh))
    else:
        # Initialize DocTopicCounts by copying those from absorbing states
        xLPslice['DocTopicCount'] = \
            curLPslice['DocTopicCount'][:, kabsorbList].copy()
        # Initialize resp by copying existing resp for absorbing state
        # Note: this is NOT consistent with some docs in DocTopicCount
        # but that will get fixed by restricted step
        xLPslice['resp'] = \
            curLPslice['resp'][:, kabsorbList].copy()

    xLPslice['theta'] = \
        xLPslice['DocTopicCount'] + xalphaPi[np.newaxis, :]
    xLPslice['_nIters'] = -1 * np.ones(Dslice.nDoc)
    xLPslice['_maxDiff'] = -1 * np.ones(Dslice.nDoc)

    for step in range(nUpdateSteps):
        # Compute conditional likelihoods for every data atom
        xLPslice = xObsModel.calc_local_params(Dslice, xLPslice)
        assert 'E_log_soft_ev' in xLPslice
        assert 'obsModelName' in xLPslice
        # Fill in these fields, one doc at a time
        for d in range(Dslice.nDoc):
            xLPslice = restrictedLocalStepForSingleDoc_HDPTopicModel(
                d=d,
                Dslice=Dslice,
                curLPslice=curLPslice,
                xLPslice=xLPslice,
                ktarget=ktarget,
                kabsorbList=kabsorbList,
                xalphaPi=xalphaPi,
                thetaEmptyComp=thetaEmptyComp,
                **kwargs)
        isLastStep = step == nUpdateSteps - 1
        if not isLastStep:
            xSS = xObsModel.calcSummaryStats(Dslice, None, xLPslice)
            # Increment
            if doBuildOnInit:
                xSS.setUIDs(xWholeSS.uids)
                xWholeSS += xSS
            else:
                xWholeSS = xSS
            # Global step
            xObsModel.update_global_params(xWholeSS)
            # Decrement stats
            if doBuildOnInit:
                xWholeSS -= xSS
            # Assess early stopping
            if step > 0:
                thr = np.sum(np.abs(prevCountVec - xSS.getCountVec()))
                if thr < convThr:
                    break
            prevCountVec = xSS.getCountVec()

    # Compute other LP quantities related to log prob (topic | doc)
    # and fill these into the expanded LP dict
    digammaSumTheta = curLPslice['digammaSumTheta'].copy()
    xLPslice['digammaSumTheta'] = digammaSumTheta
    xLPslice['ElogPi'] = \
        digamma(xLPslice['theta']) - digammaSumTheta[:, np.newaxis]
    xLPslice['thetaRem'] = curLPslice['thetaRem'].copy()
    xLPslice['ElogPiRem'] = curLPslice['ElogPiRem'].copy()

    # Compute quantities related to leaving ktarget almost empty,
    # as we expand and transfer mass to other comps
    if thetaEmptyComp > 0:
        ElogPiEmptyComp = digamma(thetaEmptyComp) - digammaSumTheta
        xLPslice['thetaEmptyComp'] = thetaEmptyComp
        xLPslice['ElogPiEmptyComp'] = ElogPiEmptyComp

        # Compute quantities related to OrigComp, the original target cluster.
        # These need to be tracked and turned into relevant summaries
        # so that they can be used to create a valid proposal state "propSS"
        xLPslice['ElogPiOrigComp'] = curLPslice['ElogPi'][:, ktarget]
        xLPslice['gammalnThetaOrigComp'] = \
            np.sum(gammaln(curLPslice['theta'][:, ktarget]))
        slack = curLPslice['DocTopicCount'][:, ktarget] - \
            curLPslice['theta'][:, ktarget]
        xLPslice['slackThetaOrigComp'] = np.sum(
            slack * curLPslice['ElogPi'][:, ktarget])
        if hasattr(Dslice, 'word_count') and \
                xLPslice['resp'].shape[0] == Dslice.word_count.size:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogRdotv(
                curLPslice['resp'][:, ktarget], Dslice.word_count)
        else:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogR(
                curLPslice['resp'][:, ktarget])
    return xLPslice
def summarizeRestrictedLocalStep_DPMixtureModel(
        Dslice=None,
        curModel=None,
        curLPslice=None,
        curSSwhole=None,
        ktarget=None,
        targetUID=None,
        xUIDs=None,
        mUIDPairs=None,
        xObsModel=None,
        xInitSS=None,
        doBuildOnInit=False,
        **kwargs):
    ''' Perform one restricted local step and summarize it.

    Returns
    -------
    xSSslice : SuffStatBag
    Info : dict with other information
    '''
    # Determine which uid to target
    if ktarget is None:
        assert targetUID is not None
        ktarget = curSSwhole.uid2k(targetUID)
    elif targetUID is None:
        assert ktarget is not None
        targetUID = curSSwhole.uids[ktarget]
    assert targetUID == curSSwhole.uids[ktarget]
    # Determine how many new uids to make
    assert xUIDs is not None
    Kfresh = len(xUIDs)
    # Verify provided summary states used to initialize clusters, if any.
    if xInitSS is not None:
        assert xInitSS.K == Kfresh
        xInitSS.setUIDs(xUIDs)
    # Create temporary observation model for each of Kfresh new clusters
    # If it doesn't exist already
    if xObsModel is None:
        xObsModel = curModel.obsModel.copy()
        if xInitSS is not None:
            xObsModel.update_global_params(xInitSS)
    assert xObsModel.K == Kfresh
    # Create probabilities for each of the Kfresh new clusters
    # by subdividing the target comp's original probabilities
    xPiVec, emptyPi = make_xPiVec_and_emptyPi(
        curModel=curModel, xInitSS=xInitSS,
        ktarget=ktarget, Kfresh=Kfresh, **kwargs)

    # Copy the target column so in-place updates below
    # cannot corrupt curLPslice['resp']
    sumRespVec = curLPslice['resp'][:, ktarget].copy()
    isExpansion = True
    if np.intersect1d(xUIDs, curSSwhole.uids).size > 0:
        isExpansion = False
        for uid in xUIDs:
            kk = curSSwhole.uid2k(uid)
            sumRespVec += curLPslice['resp'][:, kk]

    # Perform restricted inference!
    # xLPslice contains local params for all Kfresh expansion clusters
    xLPslice = restrictedLocalStep_DPMixtureModel(
        Dslice=Dslice,
        sumRespVec=sumRespVec,
        xObsModel=xObsModel,
        xPiVec=xPiVec,
        doBuildOnInit=doBuildOnInit,
        xInitSS=xInitSS,
        **kwargs)

    # Summarize this expanded local parameter pack
    xSSslice = curModel.get_global_suff_stats(
        Dslice, xLPslice,
        doPrecompEntropy=1, doTrackTruncationGrowth=1)
    xSSslice.setUIDs(xUIDs)

    # Handle bookkeeping for original entropy term
    if isExpansion:
        if 'resp' in curLPslice:
            HrespOrigComp = -1 * NumericUtil.calcRlogR(
                curLPslice['resp'][:, ktarget])
        else:
            target_resp = curLPslice['spR'][:, ktarget].toarray()
            np.maximum(target_resp, 1e-100, out=target_resp)
            HrespOrigComp = -1 * NumericUtil.calcRlogR(target_resp)[0]
        xSSslice.setELBOTerm('HrespEmptyComp', -1 * HrespOrigComp, dims=None)

    # If desired, add merge terms into the expanded summaries
    if mUIDPairs is not None and len(mUIDPairs) > 0:
        # Check just to be safe
        for uidA, uidB in mUIDPairs:
            assert uidA != targetUID
            assert uidB != targetUID
        Mdict = curModel.allocModel.calcMergeTermsFromSeparateLP(
            Data=Dslice,
            LPa=curLPslice, SSa=curSSwhole,
            LPb=xLPslice, SSb=xSSslice,
            mUIDPairs=mUIDPairs)
        xSSslice.setMergeUIDPairs(mUIDPairs)
        for key, arr in list(Mdict.items()):
            xSSslice.setMergeTerm(key, arr, dims='M')

    # Prepare dict of info for debugging/inspection
    Info = dict()
    Info['Kfresh'] = Kfresh
    Info['xPiVec'] = xPiVec
    Info['emptyPi'] = emptyPi
    Info['xInitSS'] = xInitSS
    Info['xLPslice'] = xLPslice
    return xSSslice, Info
def calcLocalParams(
        Data, LP, aModel,
        methodLP='scratch',
        routineLP='simple',
        **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model.

    Returns
    -------
    LP : dict of local params, with fields
        * DocTopicCount
        * resp
        * model-specific fields for doc-topic probabilities
    '''
    kwargs['methodLP'] = methodLP

    # Prepare the log soft ev matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    Lik -= Lik.max(axis=1)[:, np.newaxis]
    NumericUtil.inplaceExp(Lik)

    K = Lik.shape[1]
    hasDocTopicCount = 'DocTopicCount' in LP \
        and LP['DocTopicCount'].shape == (Data.nDoc, K)
    if methodLP == 'memo' and hasDocTopicCount:
        initDocTopicCount = LP['DocTopicCount']
    else:
        initDocTopicCount = None

    if routineLP == 'simple':
        DocTopicCount, Prior, sumR, AI = calcDocTopicCountForData_Simple(
            Data, aModel, Lik,
            initDocTopicCount=initDocTopicCount,
            **kwargs)
    elif routineLP == 'fast':
        DocTopicCount, Prior, sumR = calcDocTopicCountForData_Fast(
            Data, aModel, Lik,
            initDocTopicCount=initDocTopicCount,
            **kwargs)
        AI = None
    else:
        raise ValueError('Unrecognized routine ' + routineLP)

    LP['DocTopicCount'] = DocTopicCount
    LP = aModel.updateLPGivenDocTopicCount(LP, DocTopicCount)
    LP = updateLPWithResp(LP, Data, Lik, Prior, sumR)

    if kwargs.get('restartremovejunkLP', 0) == 1:
        LP, RInfo = removeJunkTopicsFromAllDocs(aModel, Data, LP, **kwargs)

    if 'lapFrac' in kwargs and 'batchID' in kwargs:
        if hasattr(Data, 'batchID') and Data.batchID == kwargs['batchID']:
            perc = [0, 5, 10, 50, 90, 95, 100]
            siter = ' '.join(
                ['%4d' % np.percentile(AI['iter'], p) for p in perc])
            sdiff = ['%6.4f' % np.percentile(AI['maxDiff'], p) for p in perc]
            sdiff = ' '.join(sdiff)
            nFail = np.sum(AI['maxDiff'] > kwargs['convThrLP'])
            msg = '%4.2f %3d %4d %s %s' % (
                kwargs['lapFrac'], Data.batchID, nFail, siter, sdiff)
            if kwargs.get('restartremovejunkLP', 0) == 1:
                msg += " %4d/%4d %4d/%4d" % (
                    RInfo['nDocRestartsAccepted'], RInfo['nDocRestartsTried'],
                    RInfo['nRestartsAccepted'], RInfo['nRestartsTried'])
            elif kwargs.get('restartremovejunkLP', 0) == 2:
                msg += " %4d/%4d" % (
                    AI['nRestartsAccepted'], AI['nRestartsTried'])

    if AI is not None:
        LP['Info'] = AI
    return LP
def calcSmoothedBregDiv(
        self, X, Mu,
        W=None,
        smoothFrac=0.0,
        includeOnlyFastTerms=False,
        DivDataVec=None,
        returnDivDataVec=False,
        return1D=False,
        **kwargs):
    ''' Compute Bregman divergence between data X and clusters Mu.

    Smooth the data via update with prior parameters.

    Keyword Args
    ------------
    includeOnlyFastTerms : boolean
        if False, includes all terms in divergence calculation.
            Returns Div[n,:] guaranteed to be non-negative.
        if True, includes only terms that vary with cluster index k.
            Returns Div[n,:] equal to divergence up to additive constant.

    Returns
    -------
    Div : 2D array, N x K
        Div[n,k] = smoothed distance between X[n] and Mu[k]
    '''
    if X.ndim < 2:
        X = X[np.newaxis, :]
    assert X.ndim == 2
    N = X.shape[0]
    if not isinstance(Mu, list):
        Mu = (Mu,)
    K = len(Mu)
    # Compute Div array up to a per-row additive constant indep. of k
    Div = np.zeros((N, K))
    for k in range(K):
        Div[:, k] = -1 * np.dot(X, np.log(Mu[k]))
    # Compute contribution of prior smoothing
    if smoothFrac > 0:
        smoothVec = smoothFrac * self.Prior.lam
        for k in range(K):
            # Equivalent to -1 * np.dot(MuX, np.log(Mu[k])),
            # but without allocating a new matrix MuX
            Div[:, k] -= np.sum(smoothVec * np.log(Mu[k]))

    if not includeOnlyFastTerms:
        if DivDataVec is None:
            # Compute DivDataVec : 1D array of size N
            # This is the per-row additive constant indep. of k.
            # We do lots of steps in-place, to save memory.
            if smoothFrac > 0:
                MuX = X + smoothVec
            else:
                # Add small pos constant so that we never compute np.log(0)
                MuX = X + 1e-100
            NX = MuX.sum(axis=1)
            # First block equivalent to
            #     DivDataVec = -1 * NX * np.log(NX)
            DivDataVec = np.log(NX)
            DivDataVec *= -1 * NX
            # This next block is equivalent to:
            # >>> DivDataVec += np.sum(MuX * np.log(MuX), axis=1)
            # but uses in-place operations with faster numexpr library.
            NumericUtil.inplaceLog(MuX)
            logMuX = MuX
            if smoothFrac > 0:
                DivDataVec += np.dot(logMuX, smoothVec)
            logMuX *= X
            XlogMuX = logMuX
            DivDataVec += np.sum(XlogMuX, axis=1)
        Div += DivDataVec[:, np.newaxis]
    # Apply per-atom weights to divergences.
    if W is not None:
        assert W.ndim == 1
        assert W.size == N
        Div *= W[:, np.newaxis]
    # Verify divergences are non-negative
    if not includeOnlyFastTerms:
        minDiv = Div.min()
        if minDiv < 0:
            if minDiv < -1e-6:
                raise AssertionError(
                    "Expected Div.min() to be positive or" +
                    " indistinguishable from zero. Instead " +
                    " minDiv=% .3e" % (minDiv))
            np.maximum(Div, 0, out=Div)
            minDiv = Div.min()
        assert minDiv >= 0
    if return1D:
        Div = Div[:, 0]
    if returnDivDataVec:
        return Div, DivDataVec
    return Div
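
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): with smoothFrac=0 the divergence
# above reduces to sum_i x_i log(x_i) - N_x log(N_x) - sum_i x_i log(mu_i),
# which equals N_x * KL(x / N_x || mu) for a count vector x with total
# N_x and a probability vector mu. A direct NumPy check of that identity:
import numpy as np


def _demo_breg_div(x, mu):
    x = np.maximum(x, 1e-100)  # guard against log(0)
    Nx = x.sum()
    return np.sum(x * np.log(x)) - Nx * np.log(Nx) - np.dot(x, np.log(mu))

# e.g. _demo_breg_div(x, mu) matches Nx * np.sum((x/Nx) * np.log((x/Nx) / mu))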
def restrictedLocalStep_DPMixtureModel(
        Dslice=None,
        sumRespVec=None,
        LPkwargs=dict(),
        xObsModel=None,
        xPiVec=None,
        xInitSS=None,
        doBuildOnInit=False,
        nUpdateSteps=50,
        convThr=0.1,
        xPiPrior=1.0,
        logFunc=None,
        **kwargs):
    ''' Perform restricted local step on provided dataset.

    Returns
    -------
    xLPslice : dict with updated local parameters
        Obeys restriction that sum(resp, axis=1) equals sumRespVec
    '''
    if xInitSS is None:
        xWholeSS = None
    else:
        xWholeSS = xInitSS.copy()
    thr = np.inf
    for step in range(nUpdateSteps):
        # Compute conditional likelihoods for every data atom
        xLPslice = xObsModel.calc_local_params(Dslice, **LPkwargs)
        assert 'E_log_soft_ev' in xLPslice
        xresp = xLPslice['E_log_soft_ev']
        xresp += np.log(xPiVec)[np.newaxis, :]
        # Calculate exp in numerically stable manner (first subtract the max)
        # perform this in-place so no new allocations occur
        NumericUtil.inplaceExpAndNormalizeRows(xresp)
        # Enforce sum restriction
        xresp *= sumRespVec[:, np.newaxis]
        np.maximum(xresp, 1e-100, out=xresp)

        isLastStep = step == nUpdateSteps - 1
        if not isLastStep:
            xSS = xObsModel.calcSummaryStats(Dslice, None, dict(resp=xresp))
            # Increment
            if doBuildOnInit:
                xSS.setUIDs(xWholeSS.uids)
                xWholeSS += xSS
            else:
                xWholeSS = xSS
            # Global step
            xObsModel.update_global_params(xWholeSS)
            Nvec = xWholeSS.getCountVec()
            xPiVec = Nvec + xPiPrior
            # Decrement stats
            if doBuildOnInit:
                xWholeSS -= xSS
            # Assess early stopping
            if step > 0:
                thr = np.sum(np.abs(prevCountVec - xSS.getCountVec()))
                if thr < convThr:
                    break
            prevCountVec = xSS.getCountVec()

    if logFunc:
        msg = "restrictedLocalStep_DPMixtureModel"
        msg += " stopped after %3d of %d iters. thr=%.4f" % (
            step, nUpdateSteps, thr)
        logFunc(msg)

    xLPslice['resp'] = xresp
    del xLPslice['E_log_soft_ev']  # delete since we did in-place ops on it
    return xLPslice
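
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): the restriction above means each
# row of the expanded resp is a softmax over the fresh clusters, rescaled
# so it sums to the mass the parent cluster held for that atom:
import numpy as np


def _demo_restricted_resp(log_soft_ev, log_pi, sumRespVec):
    logR = log_soft_ev + log_pi[np.newaxis, :]
    logR -= logR.max(axis=1, keepdims=True)
    R = np.exp(logR)
    R /= R.sum(axis=1, keepdims=True)    # rows sum to one
    R *= sumRespVec[:, np.newaxis]       # rows now sum to sumRespVec
    assert np.allclose(R.sum(axis=1), sumRespVec)
    return R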
def makeExpansionLPFromZ_HDPTopicModel(
        Dslice=None,
        curModel=None,
        curLPslice=None,
        ktarget=None,
        xInitSS=None,
        targetZ=None,
        atomType=None,
        chosenDataIDs=None,
        emptyPiFrac=None,
        **kwargs):
    ''' Create expanded local parameters from Z assignments on target subset.

    Returns
    -------
    xLP : dict with fields
        resp : N x Kfresh
        DocTopicCount : D x Kfresh
        theta : D x Kfresh
        ElogPi : D x Kfresh
    '''
    Kfresh = targetZ.max() + 1
    N = curLPslice['resp'].shape[0]
    # Compute prior probability of each proposed comp
    xPiVec, emptyPi = make_xPiVec_and_emptyPi(
        curModel=curModel, ktarget=ktarget,
        Kfresh=Kfresh, xInitSS=xInitSS, **kwargs)
    xalphaPi = curModel.allocModel.alpha * xPiVec
    emptyalphaPi = curModel.allocModel.alpha * emptyPi
    # Compute likelihood under each proposed comp
    xObsModel = curModel.obsModel.copy()
    xObsModel.update_global_params(xInitSS)
    xLPslice = xObsModel.calc_local_params(Dslice)

    # Initialize xresp so each atom is normalized
    # This is the "default", for non-target atoms.
    xresp = xLPslice['E_log_soft_ev']
    xresp += np.log(xalphaPi)  # log prior probability
    xresp -= xresp.max(axis=1)[:, np.newaxis]
    assert np.allclose(xresp.max(axis=1), 0.0)
    np.exp(xresp, out=xresp)
    xresp /= xresp.sum(axis=1)[:, np.newaxis]

    # Now, replace all targeted atoms with an all-or-nothing assignment
    if atomType == 'doc' and curModel.getAllocModelName().count('HDP'):
        if curModel.getObsModelName().count('Mult'):
            for pos, d in enumerate(chosenDataIDs):
                start = Dslice.doc_range[d]
                stop = Dslice.doc_range[d + 1]
                xresp[start:stop, :] = 1e-100
                xresp[start:stop, targetZ[pos]] = 1.0
        elif curModel.getObsModelName().count('Bern'):
            # For all words in each targeted doc,
            # Assign them to the corresponding cluster in targetZ
            for pos, d in enumerate(chosenDataIDs):
                bstart = Dslice.vocab_size * d
                bstop = Dslice.vocab_size * (d + 1)
                xresp[bstart:bstop, :] = 1e-100
                xresp[bstart:bstop, targetZ[pos]] = 1.0
                # words_d = Dslice.word_id[
                #     Dslice.doc_range[d]:Dslice.doc_range[d+1]]
                # xresp[bstart + words_d, :] = 1e-100
                # xresp[bstart + words_d, targetZ[pos]] = 1.0
    else:
        for pos, n in enumerate(chosenDataIDs):
            xresp[n, :] = 1e-100
            xresp[n, targetZ[pos]] = 1.0
    assert np.allclose(1.0, xresp.sum(axis=1))

    # Make resp consistent with ktarget comp
    xresp *= curLPslice['resp'][:, ktarget][:, np.newaxis]
    np.maximum(xresp, 1e-100, out=xresp)

    # Create xDocTopicCount
    xDocTopicCount = np.zeros((Dslice.nDoc, Kfresh))
    for d in range(Dslice.nDoc):
        start = Dslice.doc_range[d]
        stop = Dslice.doc_range[d + 1]
        if hasattr(Dslice, 'word_id') and \
                curModel.getObsModelName().count('Mult'):
            xDocTopicCount[d] = np.dot(
                Dslice.word_count[start:stop],
                xresp[start:stop])
        elif hasattr(Dslice, 'word_id') and \
                curModel.getObsModelName().count('Bern'):
            bstart = d * Dslice.vocab_size
            bstop = (d + 1) * Dslice.vocab_size
            xDocTopicCount[d] = np.sum(xresp[bstart:bstop], axis=0)
        else:
            xDocTopicCount[d] = np.sum(xresp[start:stop], axis=0)
    # Create xtheta
    xtheta = xDocTopicCount + xalphaPi[np.newaxis, :]

    # Package up into xLPslice
    xLPslice['resp'] = xresp
    xLPslice['DocTopicCount'] = xDocTopicCount
    xLPslice['theta'] = xtheta
    assert np.allclose(xDocTopicCount.sum(axis=1),
                       curLPslice['DocTopicCount'][:, ktarget])
    assert np.allclose(xtheta.sum(axis=1) + emptyalphaPi,
                       curLPslice['theta'][:, ktarget])

    # Compute other LP quantities related to log prob (topic | doc)
    # and fill these into the expanded LP dict
    digammaSumTheta = curLPslice['digammaSumTheta'].copy()
    xLPslice['digammaSumTheta'] = digammaSumTheta
    xLPslice['ElogPi'] = \
        digamma(xLPslice['theta']) - digammaSumTheta[:, np.newaxis]
    xLPslice['thetaRem'] = curLPslice['thetaRem'].copy()
    xLPslice['ElogPiRem'] = curLPslice['ElogPiRem'].copy()

    # Compute quantities related to leaving ktarget almost empty,
    # as we expand and transfer mass to other comps
    if emptyalphaPi > 0:
        thetaEmptyComp = emptyalphaPi
        ElogPiEmptyComp = digamma(thetaEmptyComp) - digammaSumTheta
        xLPslice['thetaEmptyComp'] = thetaEmptyComp
        xLPslice['ElogPiEmptyComp'] = ElogPiEmptyComp

        # Compute quantities related to OrigComp, the original target cluster.
        # These need to be tracked and turned into relevant summaries
        # so that they can be used to create a valid proposal state "propSS"
        xLPslice['ElogPiOrigComp'] = curLPslice['ElogPi'][:, ktarget]
        xLPslice['gammalnThetaOrigComp'] = \
            np.sum(gammaln(curLPslice['theta'][:, ktarget]))
        slack = curLPslice['DocTopicCount'][:, ktarget] - \
            curLPslice['theta'][:, ktarget]
        xLPslice['slackThetaOrigComp'] = np.sum(
            slack * curLPslice['ElogPi'][:, ktarget])
        if hasattr(Dslice, 'word_count') and \
                xLPslice['resp'].shape[0] == Dslice.word_count.size:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogRdotv(
                curLPslice['resp'][:, ktarget], Dslice.word_count)
        else:
            xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogR(
                curLPslice['resp'][:, ktarget])
    return xLPslice
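
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): the expansion above builds a hard
# one-of-Kfresh assignment for targeted atoms, then rescales every row by
# the parent cluster's responsibility so total mass is conserved. A minimal
# version that treats all atoms as targeted:
import numpy as np


def _demo_expand_from_Z(targetZ, parentRespCol, Kfresh):
    N = parentRespCol.size
    xresp = np.full((N, Kfresh), 1e-100)
    xresp[np.arange(N), targetZ] = 1.0     # all-or-nothing per atom
    xresp *= parentRespCol[:, np.newaxis]  # conserve parent's resp mass
    assert np.allclose(xresp.sum(axis=1), parentRespCol)
    return xresp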
def calcELBO_NonlinearTerms(
        Data=None, SS=None, LP=None, todict=0,
        rho=None, Ebeta=None, alpha=None,
        resp=None, nDoc=None, DocTopicCount=None,
        theta=None, thetaRem=None,
        ElogPi=None, ElogPiRem=None,
        sumLogPi=None, sumLogPiRem=None, sumLogPiRemVec=None,
        Hresp=None, slackTheta=None, slackThetaRem=None,
        gammalnTheta=None, gammalnSumTheta=None, gammalnThetaRem=None,
        thetaEmptyComp=None, ElogPiEmptyComp=None,
        ElogPiOrigComp=None,
        gammalnThetaOrigComp=None, slackThetaOrigComp=None,
        returnMemoizedDict=0, **kwargs):
    """ Calculate ELBO objective terms non-linear in suff stats.
    """
    if resp is not None:
        N, K = resp.shape
    elif LP is not None:
        if 'resp' in LP:
            N, K = LP['resp'].shape
        else:
            N, K = LP['spR'].shape
    if Ebeta is None:
        Ebeta = rho2beta(rho, returnSize='K+1')

    if LP is not None:
        DocTopicCount = LP['DocTopicCount']
        nDoc = DocTopicCount.shape[0]
        theta = LP['theta']
        thetaRem = LP['thetaRem']
        ElogPi = LP['ElogPi']
        ElogPiRem = LP['ElogPiRem']
        sumLogPi = np.sum(ElogPi, axis=0)
        sumLogPiRem = np.sum(ElogPiRem)
        if 'thetaEmptyComp' in LP:
            thetaEmptyComp = LP['thetaEmptyComp']
            ElogPiEmptyComp = LP['ElogPiEmptyComp']
            ElogPiOrigComp = LP['ElogPiOrigComp']
            gammalnThetaOrigComp = LP['gammalnThetaOrigComp']
            slackThetaOrigComp = LP['slackThetaOrigComp']
            HrespOrigComp = LP['HrespOrigComp']
    elif SS is not None:
        sumLogPi = SS.sumLogPi
        nDoc = SS.nDoc
        if hasattr(SS, 'sumLogPiRemVec'):
            sumLogPiRemVec = SS.sumLogPiRemVec
        else:
            sumLogPiRem = SS.sumLogPiRem

    if DocTopicCount is not None and theta is None:
        theta = DocTopicCount + alpha * Ebeta[:-1]
        thetaRem = alpha * Ebeta[-1]
    if theta is not None and ElogPi is None:
        digammasumtheta = digamma(theta.sum(axis=1) + thetaRem)
        ElogPi = digamma(theta) - digammasumtheta[:, np.newaxis]
        ElogPiRem = digamma(thetaRem) - digammasumtheta[:, np.newaxis]
    if sumLogPi is None and ElogPi is not None:
        sumLogPi = np.sum(ElogPi, axis=0)
        sumLogPiRem = np.sum(ElogPiRem)

    if Hresp is None:
        if SS is not None and SS.hasELBOTerm('Hresp'):
            Hresp = SS.getELBOTerm('Hresp')
        else:
            if hasattr(Data, 'word_count') and N == Data.word_count.size:
                if resp is not None:
                    Hresp = -1 * NumericUtil.calcRlogRdotv(
                        resp, Data.word_count)
                elif 'resp' in LP:
                    Hresp = -1 * NumericUtil.calcRlogRdotv(
                        LP['resp'], Data.word_count)
                elif 'spR' in LP:
                    Hresp = calcSparseRlogRdotv(
                        v=Data.word_count, **LP)
                else:
                    raise ValueError("Missing resp assignments!")
            else:
                if resp is not None:
                    Hresp = -1 * NumericUtil.calcRlogR(resp)
                elif 'resp' in LP:
                    Hresp = -1 * NumericUtil.calcRlogR(LP['resp'])
                elif 'spR' in LP:
                    assert 'nnzPerRow' in LP
                    Hresp = calcSparseRlogR(**LP)
                else:
                    raise ValueError("Missing resp assignments!")

    if slackTheta is None:
        if SS is not None and SS.hasELBOTerm('slackTheta'):
            slackTheta = SS.getELBOTerm('slackTheta')
            slackThetaRem = SS.getELBOTerm('slackThetaRem')
        else:
            slackTheta = DocTopicCount - theta
            slackTheta *= ElogPi
            slackTheta = np.sum(slackTheta, axis=0)
            slackThetaRem = -1 * np.sum(thetaRem * ElogPiRem)

    if gammalnTheta is None:
        if SS is not None and SS.hasELBOTerm('gammalnTheta'):
            gammalnSumTheta = SS.getELBOTerm('gammalnSumTheta')
            gammalnTheta = SS.getELBOTerm('gammalnTheta')
            gammalnThetaRem = SS.getELBOTerm('gammalnThetaRem')
        else:
            sumTheta = np.sum(theta, axis=1) + thetaRem
            gammalnSumTheta = np.sum(gammaln(sumTheta))
            gammalnTheta = np.sum(gammaln(theta), axis=0)
            gammalnThetaRem = theta.shape[0] * gammaln(thetaRem)

    if thetaEmptyComp is not None:
        gammalnThetaEmptyComp = nDoc * gammaln(thetaEmptyComp) - \
            gammalnThetaOrigComp
        slackThetaEmptyComp = -np.sum(thetaEmptyComp * ElogPiEmptyComp) - \
            slackThetaOrigComp

    if returnMemoizedDict:
        Mdict = dict(Hresp=Hresp,
                     slackTheta=slackTheta,
                     slackThetaRem=slackThetaRem,
                     gammalnTheta=gammalnTheta,
                     gammalnThetaRem=gammalnThetaRem,
                     gammalnSumTheta=gammalnSumTheta)
        if thetaEmptyComp is not None:
            Mdict['HrespEmptyComp'] = -1 * HrespOrigComp
            Mdict['gammalnThetaEmptyComp'] = gammalnThetaEmptyComp
            Mdict['slackThetaEmptyComp'] = slackThetaEmptyComp
        return Mdict

    # First, compute all local-only terms
    Lentropy = np.sum(Hresp)
    Lslack = slackTheta.sum() + slackThetaRem
    LcDtheta = -1 * (gammalnSumTheta - gammalnTheta.sum() - gammalnThetaRem)

    # For stochastic (soVB), we need to scale up these terms
    # Only used when --doMemoELBO is set to 0 (not recommended)
    if SS is not None and SS.hasAmpFactor():
        Lentropy *= SS.ampF
        Lslack *= SS.ampF
        LcDtheta *= SS.ampF

    # Next, compute the slack term
    alphaEbeta = alpha * Ebeta
    Lslack_alphaEbeta = np.sum(alphaEbeta[:-1] * sumLogPi)
    if sumLogPiRemVec is not None:
        Ebeta_gt = 1 - np.cumsum(Ebeta[:-1])
        Lslack_alphaEbeta += alpha * np.inner(Ebeta_gt, sumLogPiRemVec)
    else:
        Lslack_alphaEbeta += alphaEbeta[-1] * sumLogPiRem
    Lslack += Lslack_alphaEbeta

    if todict:
        return dict(
            Lslack=Lslack,
            Lentropy=Lentropy,
            LcDtheta=LcDtheta,
            Lslack_alphaEbeta=Lslack_alphaEbeta)
    return LcDtheta + Lslack + Lentropy
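
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): the theta -> ElogPi conversion
# used throughout these functions is the standard Dirichlet expectation
# E[log pi_dk] = digamma(theta_dk) - digamma(sum_k' theta_dk'):
import numpy as np
from scipy.special import digamma


def _demo_ElogPi(DocTopicCount, alphaEbeta, alphaEbetaRem):
    theta = DocTopicCount + alphaEbeta[np.newaxis, :]
    digammaSumTheta = digamma(theta.sum(axis=1) + alphaEbetaRem)
    ElogPi = digamma(theta) - digammaSumTheta[:, np.newaxis]
    ElogPiRem = digamma(alphaEbetaRem) - digammaSumTheta
    return ElogPi, ElogPiRem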
def calcLocalParams(
        Data, LP,
        alphaEbeta=None,
        alphaEbetaRem=None,
        alpha=None,
        initDocTopicCountLP='scratch',
        cslice=(0, None),
        nnzPerRowLP=0,
        doSparseOnlyAtFinalLP=0,
        **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model.

    Returns
    -------
    LP : dict
        Local parameter fields
        resp : 2D array, N x K
        DocTopicCount : 2D array, nDoc x K
        model-specific fields for doc-topic probabilities
    '''
    assert isinstance(cslice, tuple)
    if len(cslice) != 2:
        cslice = (0, None)
    elif cslice[0] is None:
        cslice = (0, None)
    nDoc = calcNumDocFromSlice(Data, cslice)

    if 'obsModelName' in LP:
        obsModelName = LP['obsModelName']
    elif hasattr(Data, 'word_count'):
        obsModelName = 'Mult'
    else:
        obsModelName = 'Gauss'
    # Unpack the problem size
    N, K = LP['E_log_soft_ev'].shape
    # Prepare the initial DocTopicCount matrix,
    # Useful for warm starts of the local step.
    initDocTopicCount = None
    if 'DocTopicCount' in LP:
        if LP['DocTopicCount'].shape == (nDoc, K):
            initDocTopicCount = LP['DocTopicCount'].copy()
    sumRespTilde = np.zeros(N)
    DocTopicCount = np.zeros((nDoc, K))
    DocTopicProb = np.zeros((nDoc, K))
    # Prepare the extra terms
    if alphaEbeta is None:
        assert alpha is not None
        alphaEbeta = alpha * np.ones(K)
    else:
        alphaEbeta = alphaEbeta[:K]
    # Prepare the likelihood matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    if (nnzPerRowLP <= 0 or nnzPerRowLP >= K) or doSparseOnlyAtFinalLP:
        # DENSE Representation
        DO_DENSE = True
        Lik -= Lik.max(axis=1)[:, np.newaxis]
        NumericUtil.inplaceExp(Lik)
    else:
        DO_DENSE = False
        nnzPerRowLP = np.minimum(nnzPerRowLP, K)
        spR_data = np.zeros(N * nnzPerRowLP, dtype=np.float64)
        spR_colids = np.zeros(N * nnzPerRowLP, dtype=np.int32)
    slice_start = Data.doc_range[cslice[0]]

    if not DO_DENSE and obsModelName.count('Mult'):
        if initDocTopicCountLP.count('fastfirstiter'):
            # tstart = time.time()
            init_spR = calcInitSparseResp(
                LP, alphaEbeta, nnzPerRowLP=nnzPerRowLP, **kwargs)
            # tstop = time.time()
            # telapsed = tstop - tstart

    AggInfo = dict()
    AggInfo['maxDiff'] = np.zeros(Data.nDoc)
    AggInfo['iter'] = np.zeros(Data.nDoc, dtype=np.int32)

    if 'restartLP' in kwargs and kwargs['restartLP']:
        AggInfo['nRestartsAccepted'] = np.zeros(1, dtype=np.int32)
        AggInfo['nRestartsTried'] = np.zeros(1, dtype=np.int32)
    else:
        AggInfo['nRestartsAccepted'] = None
        AggInfo['nRestartsTried'] = None

    for d in range(nDoc):
        start = Data.doc_range[cslice[0] + d]
        stop = Data.doc_range[cslice[0] + d + 1]
        if hasattr(Data, 'word_count') and obsModelName.count('Bern'):
            lstart = d * Data.vocab_size
            lstop = (d + 1) * Data.vocab_size
        else:
            lstart = start - slice_start
            lstop = stop - slice_start

        if hasattr(Data, 'word_count') and not obsModelName.count('Bern'):
            wc_d = Data.word_count[start:stop].copy()
        else:
            wc_d = 1.0

        initDTC_d = None
        if initDocTopicCountLP == 'memo':
            if initDocTopicCount is not None:
                if DO_DENSE:
                    initDTC_d = initDocTopicCount[d]
                else:
                    DocTopicCount[d] = initDocTopicCount[d]
            else:
                initDocTopicCountLP = 'setDocProbsToEGlobalProbs'
        if not DO_DENSE and initDocTopicCountLP.count('fastfirstiter'):
            if obsModelName.count('Mult'):
                # tstart = time.time()
                DocTopicCount[d, :] = wc_d * \
                    init_spR[Data.word_id[start:stop]]
                # telapsed += time.time() - tstart

        if not DO_DENSE:
            m_start = nnzPerRowLP * start
            m_stop = nnzPerRowLP * stop
            # SPARSE RESP
            calcSparseLocalParams_SingleDoc(
                wc_d,
                Lik[lstart:lstop],
                alphaEbeta,
                topicCount_d_OUT=DocTopicCount[d],
                spResp_data_OUT=spR_data[m_start:m_stop],
                spResp_colids_OUT=spR_colids[m_start:m_stop],
                nnzPerRowLP=nnzPerRowLP,
                initDocTopicCountLP=initDocTopicCountLP,
                d=d,
                maxDiffVec=AggInfo['maxDiff'],
                numIterVec=AggInfo['iter'],
                nRAcceptVec=AggInfo['nRestartsAccepted'],
                nRTrialVec=AggInfo['nRestartsTried'],
                **kwargs)
        else:
            Lik_d = Lik[lstart:lstop].copy()  # Local copy
            (DocTopicCount[d], DocTopicProb[d],
                sumRespTilde[lstart:lstop], Info_d) \
                = calcLocalParams_SingleDoc(
                    wc_d, Lik_d, alphaEbeta, alphaEbetaRem,
                    DocTopicCount_d=initDTC_d,
                    initDocTopicCountLP=initDocTopicCountLP,
                    **kwargs)
            AggInfo = updateConvergenceInfoForDoc_d(d, Info_d, AggInfo, Data)
    # if initDocTopicCountLP.startswith('fast'):
    #     AggInfo['time_extra'] = telapsed
    LP['DocTopicCount'] = DocTopicCount
    if hasattr(Data, 'word_count'):
        if cslice is None or (cslice[0] == 0 and cslice[1] is None):
            assert np.allclose(np.sum(DocTopicCount),
                               np.sum(Data.word_count))

    LP = updateLPGivenDocTopicCount(LP, DocTopicCount,
                                    alphaEbeta, alphaEbetaRem)
    if DO_DENSE:
        LP = updateLPWithResp(
            LP, Data, Lik, DocTopicProb, sumRespTilde, cslice,
            nnzPerRowLP=nnzPerRowLP,
            doSparseOnlyAtFinalLP=doSparseOnlyAtFinalLP)
    else:
        indptr = np.arange(
            0, (N + 1) * nnzPerRowLP, nnzPerRowLP, dtype=np.int32)
        LP['spR'] = scipy.sparse.csr_matrix(
            (spR_data, spR_colids, indptr), shape=(N, K))
        LP['nnzPerRow'] = nnzPerRowLP

    LP['Info'] = AggInfo
    writeLogMessageForManyDocs(Data, AggInfo, LP, **kwargs)
    return LP
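
# ----------------------------------------------------------------------
# Illustrative sketch (not part of bnpy): calcLocalParams_SingleDoc is
# assumed to run a small coordinate-ascent loop per document, alternating
# between doc-topic weights and per-token responsibilities. A minimal dense
# NumPy version of that inner loop, under those assumptions:
import numpy as np
from scipy.special import digamma


def _demo_local_step_single_doc(Lik_d, alphaEbeta, wc_d,
                                nIters=100, convThr=1e-4):
    # Lik_d : nTokens x K array of exp'd log likelihoods (like `Lik` above)
    # wc_d : nTokens array of word counts
    K = Lik_d.shape[1]
    DocTopicCount_d = np.zeros(K)
    for _ in range(nIters):
        prev = DocTopicCount_d.copy()
        # Topic weights from current counts (Dirichlet expectation)
        DocTopicProb_d = np.exp(digamma(DocTopicCount_d + alphaEbeta))
        R = Lik_d * DocTopicProb_d[np.newaxis, :]
        R /= R.sum(axis=1, keepdims=True)   # per-token responsibilities
        DocTopicCount_d = np.dot(wc_d, R)   # count-weighted column sums
        if np.max(np.abs(DocTopicCount_d - prev)) < convThr:
            break
    return DocTopicCount_d, R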