import numpy as np
import scipy.sparse


def sparseLocalStep_WordCountData(
        Data=None, LP=None,
        alphaEbeta=None, alphaEbetaRem=None,
        ElogphiT=None,
        DocTopicCount=None,
        spResp_data_OUT=None,
        spResp_colids_OUT=None,
        nCoordAscentItersLP=10,
        convThrLP=0.001,
        nnzPerRowLP=2,
        activeonlyLP=1,
        restartLP=0,
        restartNumTrialsLP=50,
        initDocTopicCountLP='setDocProbsToEGlobalProbs',
        reviseActiveFirstLP=-1,
        reviseActiveEveryLP=1,
        maxDiffVec=None,
        numIterVec=None,
        nRAcceptVec=None,
        nRTrialVec=None,
        verboseLP=0,
        **kwargs):
    ''' Perform local inference for topic model. Wrapper around C++ code.

    Returns
    -------
    LP : dict with fields 'spR', 'DocTopicCount', 'nnzPerRow', and 'Info'.
    '''
    if LP is not None:
        ElogphiT = LP['ElogphiT']
    N = Data.nUniqueToken
    V, K = ElogphiT.shape
    assert K == alphaEbeta.size
    nnzPerRowLP = np.minimum(nnzPerRowLP, K)
    # Parse params for tracking convergence progress
    if maxDiffVec is None:
        maxDiffVec = np.zeros(Data.nDoc, dtype=np.float64)
        numIterVec = np.zeros(Data.nDoc, dtype=np.int32)
    if nRTrialVec is None:
        nRTrialVec = np.zeros(1, dtype=np.int32)
        nRAcceptVec = np.zeros(1, dtype=np.int32)
    assert maxDiffVec.dtype == np.float64
    assert numIterVec.dtype == np.int32
    # Handle starting from memoized doc-topic counts
    if initDocTopicCountLP == 'memo':
        if 'DocTopicCount' in LP:
            DocTopicCount = LP['DocTopicCount']
        else:
            initDocTopicCountLP = 'setDocProbsToEGlobalProbs'
    # Allow sparse restarts ONLY on first pass through dataset
    if restartLP > 1:
        if 'lapFrac' in kwargs and kwargs['lapFrac'] <= 1.0:
            restartLP = 1
        else:
            restartLP = 0
    # Use the provided DocTopicCount array if it's the right size.
    # Otherwise, create a new one from scratch.
    TopicCount_OUT = None
    if isinstance(DocTopicCount, np.ndarray):
        if DocTopicCount.shape == (Data.nDoc, K):
            TopicCount_OUT = DocTopicCount
    if TopicCount_OUT is None:
        TopicCount_OUT = np.zeros((Data.nDoc, K))
    assert TopicCount_OUT.shape == (Data.nDoc, K)
    if spResp_data_OUT is None:
        spResp_data_OUT = np.zeros(N * nnzPerRowLP)
        spResp_colids_OUT = np.zeros(N * nnzPerRowLP, dtype=np.int32)
    assert spResp_data_OUT.size == N * nnzPerRowLP
    assert spResp_colids_OUT.size == N * nnzPerRowLP
    if initDocTopicCountLP.startswith("setDocProbsToEGlobalProbs"):
        initProbsToEbeta = 1
    elif initDocTopicCountLP.startswith("fast"):
        initProbsToEbeta = 2
    else:
        initProbsToEbeta = 0
    if reviseActiveFirstLP < 0:
        reviseActiveFirstLP = nCoordAscentItersLP + 10
    sparseLocalStepManyDocs_cpp(
        alphaEbeta, ElogphiT,
        Data.word_count, Data.word_id, Data.doc_range,
        nnzPerRowLP, N, K, Data.nDoc, Data.vocab_size,
        nCoordAscentItersLP, convThrLP,
        initProbsToEbeta,
        TopicCount_OUT,
        spResp_data_OUT,
        spResp_colids_OUT,
        numIterVec,
        maxDiffVec,
        restartNumTrialsLP * restartLP,
        nRAcceptVec,
        nRTrialVec,
        reviseActiveFirstLP,
        reviseActiveEveryLP,
        verboseLP)
    # Package results up into dict
    if not isinstance(LP, dict):
        LP = dict()
    LP['nnzPerRow'] = nnzPerRowLP
    LP['DocTopicCount'] = TopicCount_OUT
    indptr = np.arange(
        0, (N + 1) * nnzPerRowLP, nnzPerRowLP, dtype=np.int32)
    LP['spR'] = scipy.sparse.csr_matrix(
        (spResp_data_OUT, spResp_colids_OUT, indptr),
        shape=(N, K))
    # Fill in remainder of LP dict, with derived quantities
    from bnpy.allocmodel.topics.LocalStepManyDocs \
        import updateLPGivenDocTopicCount, writeLogMessageForManyDocs
    LP = updateLPGivenDocTopicCount(
        LP, LP['DocTopicCount'], alphaEbeta, alphaEbetaRem)
    LP['Info'] = dict()
    LP['Info']['iter'] = numIterVec
    LP['Info']['maxDiff'] = maxDiffVec
    if restartLP > 0:
        LP['Info']['nRestartsAccepted'] = nRAcceptVec[0]
        LP['Info']['nRestartsTried'] = nRTrialVec[0]
    writeLogMessageForManyDocs(
        Data, LP['Info'], LP, convThrLP=convThrLP, **kwargs)
    return LP
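
# Illustrative usage sketch (not part of the original module): build a
# tiny bag-of-words dataset and run one sparse local step. Assumes the
# compiled C++ extension is available and that bnpy's BagOfWordsData
# accepts these constructor arguments; all numbers are toy values.
def _demo_sparseLocalStep():
    from bnpy.data import BagOfWordsData
    V, K, alpha = 100, 5, 0.5
    # Two toy documents: tokens (0, 3, 7) and (2, 7, 9), each seen once
    Data = BagOfWordsData(
        word_id=np.array([0, 3, 7, 2, 7, 9], dtype=np.int32),
        word_count=np.ones(6),
        doc_range=np.array([0, 3, 6], dtype=np.int32),
        vocab_size=V)
    # Toy global parameters: near-uniform topic weights, random topics
    alphaEbeta = alpha * np.ones(K) / (K + 1)
    alphaEbetaRem = alpha / (K + 1)
    ElogphiT = np.log(np.random.dirichlet(np.ones(V), size=K).T)
    LP = sparseLocalStep_WordCountData(
        Data=Data,
        alphaEbeta=alphaEbeta,
        alphaEbetaRem=alphaEbetaRem,
        ElogphiT=ElogphiT,
        nnzPerRowLP=2,          # keep only the top-2 topics per token
        nCoordAscentItersLP=10,
        convThrLP=0.001)
    print(LP['DocTopicCount'])  # nDoc x K expected topic counts
    print(LP['spR'].toarray())  # N x K responsibilities (sparse CSR)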
def train_image_specific_topics(
        self, y, sigma, Niter=50, Kfresh=100, pixelMask=None):
    print('Training %d image-specific clusters...' % Kfresh)
    D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP
    # Gather fully observable patches
    if pixelMask is None:  # gray-scale image denoising
        v = im2col(y, patchSize)
    else:  # color image inpainting
        C = 3
        # Keep only patches in which no pixel is masked out
        patchMask = np.logical_not(
            np.any(im2col(pixelMask, patchSize), axis=0))
        v = np.hstack(
            [im2col(y[:, :, c], patchSize)[:, patchMask]
             for c in range(C)])
    v -= np.mean(v, axis=0)
    v = v.T
    testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
    testData.name = 'test_image_patches'

    # Set up hyper-parameters and run Bregman k-means
    cached_B_name = 'models/HDP/B.mat'
    xBar = loadmat(cached_B_name)['Cov']
    xBar2 = loadmat(cached_B_name)['Cov2']
    tmp0 = (np.diag(xBar) + sigma**2)**2
    tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
    nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
    B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
    obsModel = ZeroMeanGaussObsModel(
        D=D, min_covar=1e-8, inferType='memoVB', B=B, nu=nu)
    Z, Mu, Lscores = runKMeans_BregmanDiv(
        testData.X, Kfresh, obsModel, Niter=Niter, assert_monotonic=False)
    Korig = self.K
    Kall = np.max(Z) + Korig + 1
    Kfresh = Kall - Korig
    Z += Korig

    # Load SuffStats of training images
    trainSS = loadSuffStatBag('models/HDP/SS.dump')
    trainSS.insertEmptyComps(Kfresh)

    # Construct SuffStats of the test image
    DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
    DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
    resp = np.zeros((len(Z), Kall))
    resp[np.arange(len(Z)), Z] = 1.0
    testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
    alphaPi0 = np.hstack(
        (GP.alphaPi0, GP.alphaPi0Rem / (Kfresh + 1) * np.ones(Kfresh)))
    alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
    testLP = updateLPGivenDocTopicCount(
        testLP, DocTopicCount, alphaPi0, alphaPi0Rem)
    testSS = self.patchModel.get_global_suff_stats(
        testData, testLP,
        doPrecompEntropy=1, doTrackTruncationGrowth=1)
    # Subtract the noise covariance from each new cluster's scatter
    # matrix, then project back to the positive-definite cone
    xxT = np.zeros((Kall, D, D))
    for k in range(Korig, Kall):
        idx = Z == k
        tmp = np.einsum('nd,ne->de', v[idx], v[idx])
        tmp -= testSS.N[k] * sigma**2 * np.eye(D)
        val, vec = np.linalg.eig(tmp)
        val[val < EPS] = EPS
        xxT[k] = np.dot(vec, np.dot(np.diag(val), vec.T))
    testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
    testSS.setUIDs(trainSS.uids)

    # Combine training and test SS; update model parameters
    combinedSS = trainSS + testSS
    self.patchModel.update_global_params(combinedSS)
    self.calcGlobalParams()
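
# Illustrative driver sketch (hypothetical, not from the original repo):
# how this method might be invoked once the patch model is trained on an
# external corpus. `model`, `load_image`, and the filenames are made-up
# placeholder names; only train_image_specific_topics itself is real.
def _demo_train_image_specific_topics(model):
    sigma = 25.0 / 255.0                   # known noise std. dev.
    # Gray-scale denoising: every pixel of y is observed
    y = load_image('noisy_gray.png')       # hypothetical HxW loader
    model.train_image_specific_topics(y, sigma, Niter=50, Kfresh=100)
    # Color inpainting: pixelMask is True at missing pixels, so only
    # patches containing no missing pixel fit the fresh clusters
    y_rgb = load_image('damaged_rgb.png')  # hypothetical HxWx3 loader
    missing = np.zeros(y_rgb.shape[:2], dtype=bool)
    missing[100:120, 100:120] = True       # toy missing region
    model.train_image_specific_topics(y_rgb, sigma, pixelMask=missing)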
combined_pi0_rem = prev_pi0_rem * new_pi0_rem
assert np.allclose(
    combined_pi0_Knew.sum() + combined_pi0_rem, 1.0)
# Now just multiply by alpha
alpha = trainedModel.allocModel.alpha
alphaPi0 = alpha * combined_pi0_Knew
alphaPi0Rem = alpha * combined_pi0_rem
assert np.allclose(
    alpha, alphaPi0.sum() + alphaPi0Rem)
testLP = updateLPGivenDocTopicCount(
    testLP, test_DocTopicCount, alphaPi0, alphaPi0Rem)
testSS = trainedModel.get_global_suff_stats(
    TestData, testLP,
    doPrecompEntropy=1,
    doTrackTruncationGrowth=1)

print()
print("Refining model!")
print("Performing several full VB iterations")
print("Merging any new clusters when VB objective approves")
# Create a combined model for the train AND test set
trainSS.insertEmptyComps(testSS.K - trainSS.K)
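
# Numeric sanity sketch (illustrative toy values, not from the source):
# the first assertion in the fragment above holds because the fresh
# topics' probabilities are carved out of the trained model's leftover
# stick mass; the construction of combined_pi0_Knew is assumed here
# from that invariant.
def _demo_combined_stick_weights():
    prev_pi0, prev_pi0_rem = np.array([0.5, 0.3]), 0.2  # trained topics
    new_pi0, new_pi0_rem = np.array([0.6]), 0.4         # fresh topics
    # Scale the new topics' weights by the leftover mass prev_pi0_rem
    combined_pi0_Knew = np.hstack([prev_pi0, prev_pi0_rem * new_pi0])
    combined_pi0_rem = prev_pi0_rem * new_pi0_rem
    # Total probability mass is preserved: 0.8 + 0.2 * (0.6 + 0.4) = 1
    assert np.allclose(combined_pi0_Knew.sum() + combined_pi0_rem, 1.0)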