def calcLocalParams(Data, LP, aModel,
                    methodLP='scratch',
                    routineLP='simple',
                    **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model

    Parameters
    ----------
    Data : dataset object with attribute nDoc (and optionally batchID)
    LP : dict
        Must contain 'E_log_soft_ev', an N x K log-likelihood matrix.
        May contain 'DocTopicCount' from a previous pass (warm start).
    aModel : allocation model; must provide updateLPGivenDocTopicCount
    methodLP : str
        'memo' reuses a cached DocTopicCount of the right shape as the
        initial guess; anything else starts from scratch.
    routineLP : str
        'simple' or 'fast'; selects which inner solver to call.

    Returns
    -------
    LP : dict of local params, with fields
        * DocTopicCount
        * resp
        * model-specific fields for doc-topic probabilities
        * Info (per-document convergence stats; 'simple' routine only)

    Raises
    ------
    ValueError
        If routineLP is not 'simple' or 'fast'.
    '''
    kwargs['methodLP'] = methodLP

    # Prepare the log soft ev matrix.
    # Make sure it is C-contiguous, so that matrix ops are very fast.
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    # Shift each row by its max before exponentiating, for numerical safety.
    Lik -= Lik.max(axis=1)[:, np.newaxis]
    NumericUtil.inplaceExp(Lik)

    K = Lik.shape[1]
    hasDocTopicCount = 'DocTopicCount' in LP \
        and LP['DocTopicCount'].shape == (Data.nDoc, K)
    if methodLP == 'memo' and hasDocTopicCount:
        initDocTopicCount = LP['DocTopicCount']
    else:
        initDocTopicCount = None

    # AI holds per-document convergence info (iter counts, max diffs).
    # Only the 'simple' routine produces it.
    # BUGFIX: previously AI was left undefined on the 'fast' path, so the
    # logging section and LP['Info'] = AI below raised NameError.
    AI = None
    if routineLP == 'simple':
        DocTopicCount, Prior, sumR, AI = calcDocTopicCountForData_Simple(
            Data, aModel, Lik,
            initDocTopicCount=initDocTopicCount,
            **kwargs)
    elif routineLP == 'fast':
        DocTopicCount, Prior, sumR = calcDocTopicCountForData_Fast(
            Data, aModel, Lik,
            initDocTopicCount=initDocTopicCount,
            **kwargs)
    else:
        raise ValueError('Unrecognized routine ' + routineLP)

    LP['DocTopicCount'] = DocTopicCount
    LP = aModel.updateLPGivenDocTopicCount(LP, DocTopicCount)
    LP = updateLPWithResp(LP, Data, Lik, Prior, sumR)

    # BUGFIX: read the restart option with a default so callers that never
    # configured restarts do not hit a KeyError.
    restartremovejunkLP = kwargs.get('restartremovejunkLP', 0)
    if restartremovejunkLP == 1:
        LP, RInfo = removeJunkTopicsFromAllDocs(aModel, Data, LP, **kwargs)

    # Build a one-line convergence summary for this batch.
    # NOTE(review): msg is assembled but not emitted here; presumably a
    # logging call existed (or exists elsewhere) — confirm before removing.
    if AI is not None and 'lapFrac' in kwargs and 'batchID' in kwargs:
        if hasattr(Data, 'batchID') and Data.batchID == kwargs['batchID']:
            perc = [0, 5, 10, 50, 90, 95, 100]
            siter = ' '.join(
                ['%4d' % np.percentile(AI['iter'], p) for p in perc])
            sdiff = ['%6.4f' % np.percentile(AI['maxDiff'], p) for p in perc]
            sdiff = ' '.join(sdiff)
            # Count docs whose local step did not reach the conv. threshold.
            nFail = np.sum(AI['maxDiff'] > kwargs['convThrLP'])
            msg = '%4.2f %3d %4d %s %s' % (
                kwargs['lapFrac'], Data.batchID, nFail, siter, sdiff)
            if restartremovejunkLP == 1:
                msg += " %4d/%4d %4d/%4d" % (
                    RInfo['nDocRestartsAccepted'],
                    RInfo['nDocRestartsTried'],
                    RInfo['nRestartsAccepted'],
                    RInfo['nRestartsTried'])
            elif restartremovejunkLP == 2:
                msg += " %4d/%4d" % (
                    AI['nRestartsAccepted'], AI['nRestartsTried'])
    if AI is not None:
        LP['Info'] = AI
    return LP
def calcLocalParams(Data, LP,
                    alphaEbeta=None,
                    alphaEbetaRem=None,
                    alpha=None,
                    initDocTopicCountLP='scratch',
                    cslice=(0, None),
                    nnzPerRowLP=0,
                    doSparseOnlyAtFinalLP=0,
                    **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model.

    Depending on nnzPerRowLP, either a dense responsibility matrix is
    computed (LP['resp'], via updateLPWithResp) or a sparse CSR matrix
    with exactly nnzPerRowLP nonzeros per row (LP['spR']).

    Parameters
    ----------
    Data : dataset with doc_range; optionally word_count, word_id,
        vocab_size, nDoc
    LP : dict with 'E_log_soft_ev' (N x K); may carry 'DocTopicCount'
        for warm starts and 'obsModelName'
    alphaEbeta : 1D array, first K entries used; if None, built as
        alpha * ones(K) (alpha must then be provided)
    alphaEbetaRem : scalar or None, remainder mass passed to the
        per-doc solver
    initDocTopicCountLP : str, initialization strategy for each doc
    cslice : tuple (start, stop), slice of documents to process
    nnzPerRowLP : int, nonzeros per row for sparse resp; <=0 or >=K
        means dense
    doSparseOnlyAtFinalLP : int/bool, forces the dense path here

    Returns
    -------
    LP : dict
        Local parameter fields
            resp : 2D array, N x K (dense path)
            spR, nnzPerRow : sparse CSR resp (sparse path)
            DocTopicCount : 2D array, nDoc x K
            Info : aggregated convergence info
            model-specific fields for doc-topic probabilities
    '''
    # Normalize cslice so it is always a (start, stop) pair.
    assert isinstance(cslice, tuple)
    if len(cslice) != 2:
        cslice = (0, None)
    elif cslice[0] is None:
        cslice = (0, None)
    nDoc = calcNumDocFromSlice(Data, cslice)

    # Infer the observation-model family; it controls how per-doc
    # likelihood rows are sliced and weighted below.
    if 'obsModelName' in LP:
        obsModelName = LP['obsModelName']
    elif hasattr(Data, 'word_count'):
        obsModelName = 'Mult'
    else:
        obsModelName = 'Gauss'

    # Unpack the problem size
    N, K = LP['E_log_soft_ev'].shape

    # Prepare the initial DocTopicCount matrix,
    # Useful for warm starts of the local step.
    initDocTopicCount = None
    if 'DocTopicCount' in LP:
        if LP['DocTopicCount'].shape == (nDoc, K):
            initDocTopicCount = LP['DocTopicCount'].copy()

    # Output buffers, filled document-by-document in the loop below.
    sumRespTilde = np.zeros(N)
    DocTopicCount = np.zeros((nDoc, K))
    DocTopicProb = np.zeros((nDoc, K))

    # Prepare the extra terms
    if alphaEbeta is None:
        assert alpha is not None
        alphaEbeta = alpha * np.ones(K)
    else:
        alphaEbeta = alphaEbeta[:K]

    # Prepare the likelihood matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    if (nnzPerRowLP <= 0 or nnzPerRowLP >= K) or doSparseOnlyAtFinalLP:
        DO_DENSE = True
        # Dense Representation
        # Row-wise max-shift before exponentiating, for numerical safety.
        Lik -= Lik.max(axis=1)[:, np.newaxis]
        NumericUtil.inplaceExp(Lik)
    else:
        DO_DENSE = False
        nnzPerRowLP = np.minimum(nnzPerRowLP, K)
        # Flat storage for the CSR matrix assembled after the loop;
        # each doc's solver writes into its own [m_start:m_stop) span.
        spR_data = np.zeros(N * nnzPerRowLP, dtype=np.float64)
        spR_colids = np.zeros(N * nnzPerRowLP, dtype=np.int32)

    # Token offset of the first doc in this slice; used to map global
    # doc_range positions onto rows of the sliced Lik matrix.
    slice_start = Data.doc_range[cslice[0]]

    if not DO_DENSE and obsModelName.count('Mult'):
        if initDocTopicCountLP.count('fastfirstiter'):
            #tstart = time.time()
            init_spR = calcInitSparseResp(
                LP, alphaEbeta, nnzPerRowLP=nnzPerRowLP, **kwargs)
            #tstop = time.time()
            #telapsed = tstop - tstart

    # Aggregated convergence info across all docs.
    AggInfo = dict()
    AggInfo['maxDiff'] = np.zeros(Data.nDoc)
    AggInfo['iter'] = np.zeros(Data.nDoc, dtype=np.int32)
    if 'restartLP' in kwargs and kwargs['restartLP']:
        # One-element arrays so the per-doc solver can accumulate in place.
        AggInfo['nRestartsAccepted'] = np.zeros(1, dtype=np.int32)
        AggInfo['nRestartsTried'] = np.zeros(1, dtype=np.int32)
    else:
        AggInfo['nRestartsAccepted'] = None
        AggInfo['nRestartsTried'] = None

    for d in xrange(nDoc):
        start = Data.doc_range[cslice[0] + d]
        stop = Data.doc_range[cslice[0] + d + 1]

        # Bernoulli models use one Lik row per vocab word per doc;
        # otherwise rows align with the doc's tokens within the slice.
        if hasattr(Data, 'word_count') and obsModelName.count('Bern'):
            lstart = d * Data.vocab_size
            lstop = (d + 1) * Data.vocab_size
        else:
            lstart = start - slice_start
            lstop = stop - slice_start

        if hasattr(Data, 'word_count') and not obsModelName.count('Bern'):
            wc_d = Data.word_count[start:stop].copy()
        else:
            wc_d = 1.0

        # Per-doc warm start: dense path passes the cached counts to the
        # solver; sparse path seeds DocTopicCount[d] directly.
        initDTC_d = None
        if initDocTopicCountLP == 'memo':
            if initDocTopicCount is not None:
                if DO_DENSE:
                    initDTC_d = initDocTopicCount[d]
                else:
                    DocTopicCount[d] = initDocTopicCount[d]
            else:
                # No cached counts available: fall back for this and all
                # remaining docs (deliberately mutates the loop variable).
                initDocTopicCountLP = 'setDocProbsToEGlobalProbs'

        if not DO_DENSE and initDocTopicCountLP.count('fastfirstiter'):
            if obsModelName.count('Mult'):
                #tstart = time.time()
                # NOTE(review): init_spR is only defined when the
                # 'fastfirstiter' + Mult + sparse branch ran above; this
                # indexing presumably aggregates initial resp mass per
                # word token — confirm against calcInitSparseResp.
                DocTopicCount[d, :] = wc_d * \
                    init_spR[Data.word_id[start:stop]]
                #telapsed += time.time() - tstart

        if not DO_DENSE:
            m_start = nnzPerRowLP * start
            m_stop = nnzPerRowLP * stop
            # SPARSE RESP
            # Solver writes results in place into the _OUT buffers.
            calcSparseLocalParams_SingleDoc(
                wc_d,
                Lik[lstart:lstop],
                alphaEbeta,
                topicCount_d_OUT=DocTopicCount[d],
                spResp_data_OUT=spR_data[m_start:m_stop],
                spResp_colids_OUT=spR_colids[m_start:m_stop],
                nnzPerRowLP=nnzPerRowLP,
                initDocTopicCountLP=initDocTopicCountLP,
                d=d,
                maxDiffVec=AggInfo['maxDiff'],
                numIterVec=AggInfo['iter'],
                nRAcceptVec=AggInfo['nRestartsAccepted'],
                nRTrialVec=AggInfo['nRestartsTried'],
                **kwargs)
        else:
            Lik_d = Lik[lstart:lstop].copy()  # Local copy
            (DocTopicCount[d], DocTopicProb[d],
                sumRespTilde[lstart:lstop], Info_d) \
                = calcLocalParams_SingleDoc(
                    wc_d, Lik_d, alphaEbeta, alphaEbetaRem,
                    DocTopicCount_d=initDTC_d,
                    initDocTopicCountLP=initDocTopicCountLP,
                    **kwargs)
            AggInfo = updateConvergenceInfoForDoc_d(d, Info_d, AggInfo, Data)

    #if initDocTopicCountLP.startswith('fast'):
    #    AggInfo['time_extra'] = telapsed

    LP['DocTopicCount'] = DocTopicCount
    # Sanity check: over the full dataset, topic counts must account for
    # every word token (only valid when no slicing was applied).
    if hasattr(Data, 'word_count'):
        if cslice is None or (cslice[0] == 0 and cslice[1] is None):
            assert np.allclose(
                np.sum(DocTopicCount), np.sum(Data.word_count))
    LP = updateLPGivenDocTopicCount(
        LP, DocTopicCount, alphaEbeta, alphaEbetaRem)
    if DO_DENSE:
        LP = updateLPWithResp(
            LP, Data, Lik, DocTopicProb, sumRespTilde, cslice,
            nnzPerRowLP=nnzPerRowLP,
            doSparseOnlyAtFinalLP=doSparseOnlyAtFinalLP)
    else:
        # Assemble CSR resp matrix: every row has exactly nnzPerRowLP
        # entries, so indptr is a uniform arithmetic progression.
        indptr = np.arange(
            0, (N + 1) * nnzPerRowLP, nnzPerRowLP, dtype=np.int32)
        LP['spR'] = scipy.sparse.csr_matrix(
            (spR_data, spR_colids, indptr), shape=(N, K))
        LP['nnzPerRow'] = nnzPerRowLP
    LP['Info'] = AggInfo
    writeLogMessageForManyDocs(Data, AggInfo, LP, **kwargs)
    return LP