def loadTopicModelFromMEDLDA(filepath, prefix=None, returnTPA=0):
    ''' Load topic model saved in medlda format.
    '''
    # Avoid circular import
    import bnpy.HModel as HModel

    assert prefix is not None
    alphafilepath = os.path.join(filepath, prefix + '.alpha')
    etafilepath = os.path.join(filepath, prefix + '.eta')
    topicfilepath = os.path.join(filepath, prefix + '.log_prob_w')

    alpha = float(np.loadtxt(alphafilepath))
    eta = np.loadtxt(etafilepath)
    logtopics = np.loadtxt(topicfilepath)

    # Convert log probabilities into a strictly positive,
    # row-normalized topic-word matrix.
    topics = np.exp(logtopics)
    topics += 1e-9
    topics /= topics.sum(axis=1)[:, np.newaxis]
    assert np.all(np.isfinite(topics))

    if returnTPA:
        K = topics.shape[0]
        probs = 1.0 / K * np.ones(K)
        return topics, probs, alpha, eta

    infAlg = 'VB'
    aPriorDict = dict(alpha=alpha)
    amodel = AllocModelConstructorsByName['FiniteTopicModel'](
        infAlg, aPriorDict)
    omodel = ObsModelConstructorsByName['Mult'](
        infAlg, lam=0.001, D=topics.shape[1])
    # HModel here is the module, so reference the class explicitly.
    hmodel = HModel.HModel(amodel, omodel)
    hmodel.obsModel.set_global_params(topics=topics, nTotalTokens=1000)
    return hmodel
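# Usage sketch for the MEDLDA loader (the directory and prefix below are
# hypothetical; a real results directory would contain <prefix>.alpha,
# <prefix>.eta, and <prefix>.log_prob_w text files):
#
#     hmodel = loadTopicModelFromMEDLDA('/path/to/results', prefix='final')
#     topics, probs, alpha, eta = loadTopicModelFromMEDLDA(
#         '/path/to/results', prefix='final', returnTPA=1)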
def load_model(matfilepath, prefix='Best'):
    ''' Load model stored to disk by ModelWriter.
    '''
    # Avoid circular import
    import bnpy.HModel as HModel

    obsModel = load_obs_model(matfilepath, prefix)
    allocModel = load_alloc_model(matfilepath, prefix)
    # HModel here is the module, so reference the class explicitly.
    return HModel.HModel(allocModel, obsModel)
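# Usage sketch (hypothetical path; assumes a task output directory written
# by ModelWriter containing Best-prefixed parameter files):
#
#     hmodel = load_model('/path/to/taskoutput', prefix='Best')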
def load_model_at_prefix(matfilepath, prefix='Best', lap=None):
    ''' Load model stored to disk by ModelWriter.

    Returns
    -------
    model : bnpy.HModel
        Model object saved at the checkpoint indicated by prefix or lap.
    '''
    # Avoid circular import
    import bnpy.HModel as HModel

    if lap is not None:
        prefix, _ = getPrefixForLapQuery(matfilepath, lap)
    try:
        obsModel = load_obs_model(matfilepath, prefix)
        allocModel = load_alloc_model(matfilepath, prefix)
        model = HModel.HModel(allocModel, obsModel)
    except IOError as e:
        print(str(e))
        # Disabled fallback: recover the prefix from the most recent
        # TopicModel.mat or .log_prob_w file found on disk.
        # if prefix == 'Best':
        #     matList = glob.glob(os.path.join(matfilepath, '*TopicModel.mat'))
        #     lpwList = glob.glob(os.path.join(matfilepath, '*.log_prob_w'))
        #     if len(matList) > 0:
        #         matList.sort()  # ascending order, so most recent is last
        #         prefix = matList[-1].split(os.path.sep)[-1][:11]
        #         model = loadTopicModel(matfilepath, prefix=prefix)
        #     elif len(lpwList) > 0:
        #         lpwList.sort()  # ascending order
        #         prefix = lpwList[-1].split(os.path.sep)[-1][:7]
        #     else:
        #         raise e
        try:
            model = loadTopicModel(matfilepath, prefix=prefix)
        except IOError:
            model = loadTopicModelFromMEDLDA(matfilepath, prefix=prefix)
    return model
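# Usage sketch for checkpoint selection (hypothetical path; available lap
# values depend on how often the run saved checkpoints):
#
#     # Load the checkpoint nearest to lap 10, falling back to the
#     # topic-model loaders if the standard ModelWriter files are missing.
#     model = load_model_at_prefix('/path/to/taskoutput', lap=10)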
def loadTopicModelFromTxtFiles(snapshotPath, returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load topic model from snapshot text files.

    Returns
    -------
    hmodel : bnpy.HModel,
        unless returnTPA is set, in which case the tuple
        (topics, probs, alpha) is returned instead.
    '''
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam',
        'gamma', 'nTopics', 'nTypes', 'vocab_size']
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    # Read each available parameter file, mapping legacy key names
    # (beta, nTopics, nTypes) onto their modern equivalents.
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
        except Exception:
            continue
        if key in keyMap:
            Mdict[keyMap[key]] = arr
        else:
            Mdict[key] = arr
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])

    if os.path.exists(snapshotPath + "/topics.txt"):
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        # Rebuild the sparse topic-word count matrix, which may be
        # stored on disk in either CSC or CSR format.
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_cscindptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csc_matrix(
                (TWC_data, TWC_inds, TWC_cscindptr), shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_indptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csr_matrix(
                (TWC_data, TWC_inds, TWC_csrindptr), shape=(K, V))
        Mdict['WordCounts'] = TWC.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha

    # Build HModel from the loaded parameters.
    infAlg = 'VB'
    # Avoid circular import
    from bnpy.HModel import HModel
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
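# Usage sketch for snapshot directories (hypothetical path; a snapshot
# directory holds K.txt, vocab_size.txt, lam.txt, and either topics.txt
# or the TopicWordCount_* sparse-matrix text files):
#
#     hmodel = loadTopicModelFromTxtFiles('/path/to/Lap0020.000TopicSnapshot')
#     topics, probs, alpha = loadTopicModelFromTxtFiles(
#         '/path/to/Lap0020.000TopicSnapshot',
#         returnTPA=True, normalizeTopics=True)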
def loadTopicModel(matfilepath, queryLap=None, prefix=None,
                   returnWordCounts=0, returnTPA=0,
                   normalizeTopics=0, normalizeProbs=0, **kwargs):
    ''' Load saved topic model.

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # Avoid circular import
    from bnpy.HModel import HModel

    # MEDLDA-format results are identified by .log_prob_w files.
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath, prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            # CSR-format storage
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix(
                (data, indices, indptr), shape=(K, vocab_size))
        except KeyError:
            # COO-format storage, with 1-based MATLAB indices
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix(
                (data, (rowIDs, colIDs)), shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')

        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
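# Usage sketch for the general loader, which dispatches on what it finds on
# disk: MEDLDA .log_prob_w files, Lap*TopicSnapshot directories, or
# Lap*TopicModel.mat files (the path below is hypothetical):
#
#     topics, probs, alpha = loadTopicModel(
#         '/path/to/taskoutput', queryLap=20, returnTPA=1,
#         normalizeTopics=1, normalizeProbs=1)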
def proposeNewResp_dpmixture(Z_n, propResp,
                             tempModel=None, tempSS=None,
                             Data_n=None, origK=0, Kfresh=3,
                             nVBIters=3, PRNG=None,
                             PastAttemptLog=dict(), **kwargs):
    ''' Create new resp matrix by DP mixture clustering of subsampled data.

    Returns
    -------
    propResp : 2D array, N x K'
    '''
    # Avoid circular imports
    from bnpy.allocmodel import DPMixtureModel
    from bnpy import HModel
    from bnpy.mergemove import MergePlanner, MergeMove

    # Select the target state ktarget
    if 'strategy' not in PastAttemptLog:
        PastAttemptLog['strategy'] = 'byState'
    if PastAttemptLog['strategy'] == 'byState':
        # Assign fresh UIDs to any states created since the last attempt.
        Kcur = tempModel.obsModel.K
        Kextra = Kcur - PastAttemptLog['uIDs'].size
        if Kextra > 0:
            maxUID = PastAttemptLog['maxUID']
            uIDs = PastAttemptLog['uIDs']
            for extraPos in range(Kextra):
                maxUID += 1
                uIDs = np.append(uIDs, maxUID)
            PastAttemptLog['maxUID'] = maxUID
            PastAttemptLog['uIDs'] = uIDs
        candidateStateUIDs = set()
        for state in np.unique(Z_n):
            uid = PastAttemptLog['uIDs'][state]
            candidateStateUIDs.add(uid)
        allAvailableUIDs = [x for x in candidateStateUIDs]
        if 'nTryByStateUID' not in PastAttemptLog:
            PastAttemptLog['nTryByStateUID'] = dict()
        minTry = np.inf
        for badState, nTry in list(PastAttemptLog['nTryByStateUID'].items()):
            if badState in candidateStateUIDs:
                if nTry < minTry:
                    minTry = nTry
        untriedList = [
            x for x in candidateStateUIDs
            if x not in PastAttemptLog['nTryByStateUID']
            or PastAttemptLog['nTryByStateUID'][x] == 0]
        if len(untriedList) > 0:
            candidateStateUIDs = untriedList
        else:
            # Keep only candidates that have been tried the least
            for badState, nTry in list(
                    PastAttemptLog['nTryByStateUID'].items()):
                # Remove over-tried states from candidateStateUIDs
                if badState in candidateStateUIDs:
                    if nTry > minTry:
                        candidateStateUIDs.remove(badState)
        candidateStateUIDs = np.asarray([x for x in candidateStateUIDs])
        # Pick a state we haven't tried yet, uniformly at random
        if len(candidateStateUIDs) > 0:
            chosenStateUID = PRNG.choice(np.asarray(candidateStateUIDs))
            ktarget = np.flatnonzero(
                chosenStateUID == PastAttemptLog['uIDs'])[0]
        else:
            # Just pick a target at random
            chosenStateUID = PRNG.choice(np.asarray(allAvailableUIDs))
            ktarget = np.flatnonzero(
                chosenStateUID == PastAttemptLog['uIDs'])[0]

    relDataIDs = np.flatnonzero(Z_n == ktarget)
    # If the selected state is too small,
    # just make a new state for all relDataIDs
    if relDataIDs.size < Kfresh:
        if chosenStateUID in PastAttemptLog['nTryByStateUID']:
            PastAttemptLog['nTryByStateUID'][chosenStateUID] += 1
        else:
            PastAttemptLog['nTryByStateUID'][chosenStateUID] = 1
        propResp[relDataIDs, :] = 0
        propResp[relDataIDs, origK + 1] = 1
        return propResp, origK + 1

    # Build a DP mixture model over just the targeted subset of data.
    if hasattr(Data_n, 'Xprev'):
        Xprev = Data_n.Xprev[relDataIDs]
    else:
        Xprev = None
    targetData = XData(X=Data_n.X[relDataIDs], Xprev=Xprev)
    myDPModel = DPMixtureModel('VB', gamma0=10)
    myObsModel = copy.deepcopy(tempModel.obsModel)
    delattr(myObsModel, 'Post')
    myObsModel.ClearCache()
    myHModel = HModel(myDPModel, myObsModel)
    initname = PRNG.choice(['randexamplesbydist', 'randcontigblocks'])
    myHModel.init_global_params(targetData, K=Kfresh,
                                initname=initname, **kwargs)
    Kfresh = myHModel.obsModel.K

    # Alternate VB iterations with merge moves until merges stop helping.
    mergeIsPromising = True
    while Kfresh > 1 and mergeIsPromising:
        for vbiter in range(nVBIters):
            targetLP = myHModel.calc_local_params(targetData)
            targetSS = myHModel.get_global_suff_stats(targetData, targetLP)
            # Delete unnecessarily small comps
            if vbiter == nVBIters - 1:
                smallIDs = np.flatnonzero(targetSS.getCountVec() <= 1)
                for kdel in reversed(smallIDs):
                    if targetSS.K > 1:
                        targetSS.removeComp(kdel)
            # Global step
            myHModel.update_global_params(targetSS)

        # Do merges
        mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
            myHModel, targetSS,
            preselect_routine='wholeELBO',
            doLimitNumPairs=0,
            returnScoreMatrix=1,
            **kwargs)
        targetLP = myHModel.calc_local_params(targetData,
                                              mPairIDs=mPairIDs,
                                              limitMemoryLP=1)
        targetSS = myHModel.get_global_suff_stats(targetData, targetLP,
                                                  mPairIDs=mPairIDs,
                                                  doPrecompEntropy=1,
                                                  doPrecompMergeEntropy=1)
        myHModel.update_global_params(targetSS)
        curELBO = myHModel.calc_evidence(SS=targetSS)
        myHModel, targetSS, curELBO, Info = MergeMove.run_many_merge_moves(
            myHModel, targetSS, curELBO, mPairIDs, M=MM, isBirthCleanup=1)
        mergeIsPromising = len(Info['AcceptedPairs']) > 0
        Kfresh = targetSS.K

    # Refresh local params if the last merge pass changed the model.
    if mergeIsPromising:
        targetLP = myHModel.calc_local_params(targetData)
    propResp[relDataIDs, :] = 0
    propResp[relDataIDs, origK:origK + Kfresh] = targetLP['resp']

    # Test if we added at least 2 states with mass > 1
    didAddNonEmptyNewStates = np.sum(targetSS.N > 1.0) >= 2
    print('dpmixture proposal: targetUID %d didAddNonEmptyNewStates %d' % (
        chosenStateUID, didAddNonEmptyNewStates))
    if didAddNonEmptyNewStates:
        print('NEW STATE MASSES:', end=' ')
        print(' '.join(['%5.1f' % (x) for x in targetSS.N]))
        PastAttemptLog['nTryByStateUID'][chosenStateUID] = 0  # success!
    else:
        if chosenStateUID in PastAttemptLog['nTryByStateUID']:
            PastAttemptLog['nTryByStateUID'][chosenStateUID] += 1
        else:
            PastAttemptLog['nTryByStateUID'][chosenStateUID] = 1
    return propResp, origK + Kfresh
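# Usage sketch (all inputs are hypothetical; in bnpy this helper is invoked
# internally by birth-move code with a partially built candidate model and a
# shared attempt log that persists across proposals):
#
#     PRNG = np.random.RandomState(0)
#     propResp, propK = proposeNewResp_dpmixture(
#         Z_n, propResp, tempModel=tempModel, tempSS=tempSS,
#         Data_n=Data_n, origK=tempModel.allocModel.K,
#         Kfresh=3, nVBIters=3, PRNG=PRNG, PastAttemptLog=attemptLog)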