import os
from collections import defaultdict

import numpy as np
import scipy.sparse

# Module-level helpers used below (argsort_bigtosmall_stable, BLogger,
# OptimizerRhoOmegaBetter, rho2beta, reorder_rho, mapToNewPos, vec2str,
# canBirthHappenAtLap, calcAvgPiFromDocTopicCount, evalELBOandPrint,
# DocTopicCount_to_sumLogPi, is_sorted_bigtosmall) are assumed to be
# imported or defined elsewhere in this module.


def loadCountHistoriesForTask(taskpath,
                              sortBy=None,
                              MIN_PRESENT_COUNT=1e-10):
    ''' Load sparse matrix of counts for all clusters used throughout task.

    Returns
    -------
    AllCountMat : 2D array, nCheckpoint x nTotal
    Info : dict
    '''
    idpath = os.path.join(taskpath, 'ActiveIDs.txt')
    ctpath = os.path.join(taskpath, 'ActiveCounts.txt')
    fid = open(idpath, 'r')
    fct = open(ctpath, 'r')
    data = list()
    colids = list()
    rowids = list()
    for ii, idline in enumerate(fid.readlines()):
        idstr = str(idline.strip())
        ctstr = str(fct.readline().strip())
        idvec = np.asarray(idstr.split(' '), dtype=np.int32)
        ctvec = np.asarray(ctstr.split(' '), dtype=np.float64)
        data.extend(ctvec)
        colids.extend(idvec)
        rowids.extend(ii * np.ones(idvec.size, dtype=np.int32))
    fid.close()
    fct.close()

    # Identify columns by unique ids
    colids = np.asarray(colids)
    allUIDs = np.unique(colids)
    compactColIDs = -1 * np.ones_like(colids)
    for pos, u in enumerate(allUIDs):
        mask = colids == u
        compactColIDs[mask] = pos
    assert compactColIDs.min() >= 0

    # CountMat : sparse matrix of active counts at each checkpoint.
    # Each row gives the count (or zero if eliminated) at a single lap.
    data = np.asarray(data)
    np.maximum(data, MIN_PRESENT_COUNT, out=data)
    ij = np.vstack([rowids, compactColIDs]).astype(np.int32)
    CountMat = scipy.sparse.csr_matrix((data, ij))
    CountMat = CountMat.toarray()
    assert allUIDs.size == CountMat.shape[1]

    # Split all columns into two sets: active and eliminated
    nCol = CountMat.shape[1]
    elimCols = np.flatnonzero(CountMat[-1, :] < MIN_PRESENT_COUNT)
    activeCols = np.setdiff1d(np.arange(nCol), elimCols)
    ElimCountMat = CountMat[:, elimCols]
    ActiveCountMat = CountMat[:, activeCols]
    elimUIDs = allUIDs[elimCols]
    activeUIDs = allUIDs[activeCols]

    # Fill out info dict
    Info = dict(
        CountMat=CountMat,
        allUIDs=allUIDs,
        ActiveCountMat=ActiveCountMat,
        ElimCountMat=ElimCountMat,
        activeCols=activeCols,
        elimCols=elimCols,
        activeUIDs=activeUIDs,
        elimUIDs=elimUIDs)

    if not isinstance(sortBy, str) or sortBy.lower().count('none'):
        return CountMat, Info

    if sortBy.lower().count('finalorder'):
        raise NotImplementedError("TODO: sortBy='finalorder'")
    elif sortBy.lower().count('countvalues'):
        # Sort columns from biggest to smallest (at last checkpoint)
        rankedActiveIDs = argsort_bigtosmall_stable(ActiveCountMat[-1, :])
    else:
        raise ValueError("Unrecognized sortBy: %s" % sortBy)

    # Sort active set by size at last snapshot
    ActiveCountMat = ActiveCountMat[:, rankedActiveIDs]
    activeUIDs = activeUIDs[rankedActiveIDs]
    activeCols = activeCols[rankedActiveIDs]

    # Sort eliminated set by historical size
    rankedElimIDs = argsort_bigtosmall_stable(ElimCountMat.sum(axis=0))
    ElimCountMat = ElimCountMat[:, rankedElimIDs]
    elimUIDs = elimUIDs[rankedElimIDs]
    elimCols = elimCols[rankedElimIDs]

    Info['activeUIDs'] = activeUIDs
    Info['activeCols'] = activeCols
    Info['elimUIDs'] = elimUIDs
    Info['elimCols'] = elimCols
    return ActiveCountMat, ElimCountMat, Info
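# NOTE: the helper `argsort_bigtosmall_stable` is called throughout this
# module but not defined in this section. Below is a minimal sketch of the
# assumed behavior (descending argsort where ties keep their original order);
# the actual bnpy helper may differ in details.
def _argsort_bigtosmall_stable_sketch(x):
    ''' Return ids that sort x from biggest to smallest, stable under ties.

    Mergesort is a stable sort, so sorting the negated values ascending
    yields a descending order that preserves the original order of ties.

    Example
    -------
    >>> _argsort_bigtosmall_stable_sketch([3., 5., 3., 1.])
    array([1, 0, 2, 3])
    '''
    return np.argsort(-1 * np.asarray(x, dtype=np.float64), kind='mergesort')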
def selectShortListForBirthAtLapStart(hmodel, SS,
                                      MoveRecordsByUID=dict(),
                                      MovePlans=dict(),
                                      lapFrac=0,
                                      b_minNumAtomsForTargetComp=2,
                                      **BArgs):
    ''' Select list of comps to possibly target with birth during next lap.

    Shortlist uids are guaranteed to never be involved in a merge/delete.
    They are kept aside especially for a birth move, at least in this lap.

    Returns
    -------
    MovePlans : dict with updated fields
        * b_shortlistUIDs : list of ints
            Each uid in b_shortlistUIDs could be a promising birth target.
            None of these should be touched by deletes or merges in this lap.
    '''
    MovePlans['b_shortlistUIDs'] = list()
    MovePlans['b_nDQ_toosmall'] = 0
    MovePlans['b_nDQ_pastfail'] = 0
    MovePlans['b_nDQ_toobusy'] = 0
    MovePlans['b_roomToGrow'] = 0
    MovePlans['b_maxLenShortlist'] = 0
    if not canBirthHappenAtLap(lapFrac, **BArgs):
        BLogger.pprint('')
        return MovePlans

    K = hmodel.obsModel.K
    KroomToGrow = BArgs['Kmax'] - K
    MovePlans['b_roomToGrow'] = KroomToGrow
    # Each birth adds at least 2 comps.
    # If we have 10 slots left, we can do at most 5 births.
    maxLenShortlist = KroomToGrow // 2
    MovePlans['b_maxLenShortlist'] = maxLenShortlist

    # EXIT early if there is no room to grow.
    if KroomToGrow <= 1:
        BLogger.pprint(
            "Cannot shortlist any comps for birth." +
            " Adding 2 more comps to K=%d exceeds limit of %d (--Kmax)." % (
                K, BArgs['Kmax']))
        BLogger.pprint('')
        return MovePlans

    # Log reasons for shortlist length
    if maxLenShortlist < K:
        msg = " Limiting shortlist to %d possible births this lap." % (
            maxLenShortlist)
        msg += " Any more would cause current K=%d to exceed Kmax=%d." % (
            K, BArgs['Kmax'])
        BLogger.pprint(msg)

    # Handle initialization case (SS is None): select all possible comps.
    if SS is None:
        shortlistUIDs = np.arange(K).tolist()
        shortlistUIDs = shortlistUIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = shortlistUIDs
        BLogger.pprint(
            "No SS provided. Shortlist contains %d possible comps" % (
                len(shortlistUIDs)))
        BLogger.pprint('')
        return MovePlans

    assert SS.K == K
    CountVec = SS.getCountVec()
    eligible_mask = np.zeros(K, dtype=np.bool_)
    nTooSmall = 0
    nPastFail = 0
    for k, uid in enumerate(SS.uids):
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)
        tooSmall = CountVec[k] <= b_minNumAtomsForTargetComp
        hasFailRecord = MoveRecordsByUID[uid]['b_nFailRecent'] > 0
        if MoveRecordsByUID[uid]['b_tryAgainFutureLap'] > 0:
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif (not tooSmall) and (not hasFailRecord):
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif tooSmall:
            nTooSmall += 1
        else:
            assert hasFailRecord
            nPastFail += 1
    assert len(MovePlans['b_shortlistUIDs']) == np.sum(eligible_mask)

    # Rank the shortlist by size, biggest first, and truncate if needed
    if maxLenShortlist < len(MovePlans['b_shortlistUIDs']):
        sortIDs = argsort_bigtosmall_stable(CountVec[eligible_mask])
        sortIDs = sortIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = [
            MovePlans['b_shortlistUIDs'][s] for s in sortIDs]
        shortlistCountVec = CountVec[eligible_mask][sortIDs]
    else:
        shortlistCountVec = CountVec[eligible_mask]

    MovePlans['b_nDQ_toosmall'] = nTooSmall
    MovePlans['b_nDQ_pastfail'] = nPastFail
    nShortList = len(MovePlans['b_shortlistUIDs'])
    assert nShortList <= maxLenShortlist
    BLogger.pprint("%d/%d uids selected for short list." % (nShortList, K))
    if nShortList > 0:
        lineUID = vec2str(MovePlans['b_shortlistUIDs'])
        lineSize = vec2str(shortlistCountVec)
        BLogger.pprint(
            [lineUID, lineSize],
            prefix=['%7s' % 'uids', '%7s' % 'size'],
        )
    BLogger.pprint('')
    return MovePlans
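# A standalone sketch of the eligibility-and-ranking step above, using plain
# Python/numpy stand-ins for bnpy's SS and MoveRecordsByUID objects. All
# uids, counts, and fail records here are hypothetical toy values.
def _demo_shortlist_ranking():
    uids = [101, 102, 103, 104, 105]
    CountVec = np.asarray([50., 1., 30., 0.5, 80.])
    nFailRecent = {101: 0, 102: 0, 103: 1, 104: 0, 105: 0}
    b_minNumAtomsForTargetComp = 2
    maxLenShortlist = 2
    # Keep uids that are big enough and have no recent failed birth
    shortlist = [uid for k, uid in enumerate(uids)
                 if CountVec[k] > b_minNumAtomsForTargetComp
                 and nFailRecent[uid] == 0]
    # Rank survivors from biggest to smallest, then truncate
    sizes = np.asarray([CountVec[uids.index(u)] for u in shortlist])
    order = np.argsort(-sizes, kind='mergesort')[:maxLenShortlist]
    return [shortlist[i] for i in order]  # returns [105, 101]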
def learn_rhoomega_fromFixedCounts(DocTopicCount=None,
                                   nDoc=0,
                                   canShuffleInit='byUsage',
                                   canShuffle=None,
                                   maxiter=5,
                                   warmStart_rho=1,
                                   alpha=None,
                                   gamma=None,
                                   initrho=None,
                                   initomega=None,
                                   **kwargs):
    ''' Learn rho/omega via coordinate ascent, with doc-topic counts fixed.

    Alternates theta updates and rho/omega updates, optionally re-sorting
    components between steps, and records snapshots of tracked quantities
    (ELBO score, beta weights, counts) at every iteration.

    Returns
    -------
    rho : 1D array, size K
    omega : 1D array, size K
    Snapshots : dict, one list entry per checkpoint for each tracked field
    '''
    assert nDoc == DocTopicCount.shape[0]
    K = DocTopicCount.shape[1]

    didShuffle = 0
    if canShuffleInit:
        if canShuffleInit.lower().count('byusage'):
            print 'INITIAL SORTING BY USAGE'
            avgPi = calcAvgPiFromDocTopicCount(DocTopicCount)
            bigtosmall = argsort_bigtosmall_stable(avgPi)
        elif canShuffleInit.lower().count('bycount'):
            print 'INITIAL SORTING BY COUNT'
            bigtosmall = argsort_bigtosmall_stable(DocTopicCount.sum(axis=0))
        elif canShuffleInit.lower().count('random'):
            print 'INITIAL SORTING RANDOMLY'
            PRNG = np.random.RandomState(0)
            bigtosmall = np.arange(K)
            PRNG.shuffle(bigtosmall)
        else:
            bigtosmall = np.arange(K)
        # Now, sort.
        if not np.allclose(bigtosmall, np.arange(K)):
            DocTopicCount = DocTopicCount[:, bigtosmall]
            didShuffle = 1

    avgPi = calcAvgPiFromDocTopicCount(DocTopicCount)
    sortedids = argsort_bigtosmall_stable(avgPi)
    if canShuffleInit and canShuffleInit.lower().count('byusage'):
        assert np.allclose(sortedids, np.arange(K))

    # Find UIDs of comps to track
    emptyUIDs = np.flatnonzero(DocTopicCount.sum(axis=0) < 0.0001)
    if emptyUIDs.size >= 3:
        firstEmptyUID = emptyUIDs.min()
        lastEmptyUID = emptyUIDs.max()
        middleEmptyUID = emptyUIDs[len(emptyUIDs) // 2]
        trackEmptyUIDs = [firstEmptyUID, middleEmptyUID, lastEmptyUID]
        emptyLabels = ['first', 'middle', 'last']
    elif emptyUIDs.size == 2:
        trackEmptyUIDs = [emptyUIDs.min(), emptyUIDs.max()]
        emptyLabels = ['first', 'last']
    elif emptyUIDs.size == 1:
        firstEmptyUID = emptyUIDs.min()
        trackEmptyUIDs = [firstEmptyUID]
        emptyLabels = ['first']
    else:
        trackEmptyUIDs = []
        emptyLabels = []

    trackActiveUIDs = list()
    activeLabels = list()
    # Track the top 5 active columns of DocTopicCount
    for pos in range(0, np.minimum(5, K)):
        if sortedids[pos] not in emptyUIDs:
            trackActiveUIDs.append(sortedids[pos])
            activeLabels.append('max+%d' % (pos))
    # Find the position of the smallest non-empty column
    for pos in range(K - 1, 0, -1):
        curid = sortedids[pos]
        if curid not in emptyUIDs:
            break
    minnonemptyPos = pos
    # Track the 5 smallest active columns of DocTopicCount
    nBeyond5 = np.minimum(5, K - len(emptyUIDs) - 5)
    for i in range(-1 * (nBeyond5 - 1), 1):
        trackActiveUIDs.append(sortedids[minnonemptyPos + i])
        activeLabels.append('min+%d' % (-1 * i))

    assert np.all(avgPi[trackActiveUIDs] > 0)
    assert np.allclose(avgPi[trackEmptyUIDs], 0.0)
    assert is_sorted_bigtosmall(avgPi[trackActiveUIDs])

    nDocToDisplay = np.minimum(nDoc, 10)

    # Initialize rho
    if initrho is None:
        rho = OptimizerRhoOmegaBetter.make_initrho(K, nDoc, gamma)
    elif didShuffle:
        rho, _ = reorder_rho(initrho, bigtosmall)
    else:
        rho = initrho
    # Initialize omega
    if initomega is None:
        omega = OptimizerRhoOmegaBetter.make_initomega(K, nDoc, gamma)
    else:
        omega = initomega

    # ELBO value of initial state
    Ltro = evalELBOandPrint(
        rho=rho, omega=omega,
        nDoc=nDoc,
        DocTopicCount=DocTopicCount,
        alpha=alpha,
        gamma=gamma,
        msg='init',
    )
    Snapshots = dict()
    Snapshots['DTCSum'] = list()
    Snapshots['DTCUsage'] = list()
    Snapshots['beta'] = list()
    Snapshots['Lscore'] = list()
    Snapshots['activeLabels'] = activeLabels
    Snapshots['emptyLabels'] = emptyLabels
    Snapshots['pos_trackActive'] = list()
    Snapshots['pos_trackEmpty'] = list()
    Snapshots['beta_trackActive'] = list()
    Snapshots['beta_trackEmpty'] = list()
    Snapshots['count_trackActive'] = list()
    Snapshots['count_trackEmpty'] = list()
    Snapshots['beta_trackRem'] = list()

    LtroList = list()
    LtroList.append(Ltro)
    betaK = rho2beta(rho, returnSize="K")
    iterid = 0
    prevbetaK = np.zeros_like(betaK)
    prevrho = rho.copy()
    while np.sum(np.abs(betaK - prevbetaK)) > 1e-7:
        iterid += 1
        if iterid > maxiter:
            break
        # Take snapshots of learned params
        Snapshots['Lscore'].append(Ltro)
        Snapshots['DTCSum'].append(DocTopicCount.sum(axis=0))
        Snapshots['DTCUsage'].append((DocTopicCount > 0.001).sum(axis=0))
        Snapshots['beta'].append(betaK)
        Snapshots['pos_trackActive'].append(trackActiveUIDs)
        Snapshots['pos_trackEmpty'].append(trackEmptyUIDs)
        Snapshots['beta_trackActive'].append(betaK[trackActiveUIDs])
        Snapshots['beta_trackEmpty'].append(betaK[trackEmptyUIDs])
        Snapshots['beta_trackRem'].append(1.0 - betaK.sum())
        Snapshots['count_trackActive'].append(
            DocTopicCount.sum(axis=0)[trackActiveUIDs])
        Snapshots['count_trackEmpty'].append(
            DocTopicCount.sum(axis=0)[trackEmptyUIDs])

        # Sort by beta
        didShuffle = 0
        tlabel = '_t'
        if iterid > 1 and canShuffle and canShuffle.lower().count('bybeta'):
            bigtosmall = argsort_bigtosmall_stable(betaK)
            if not np.allclose(bigtosmall, np.arange(K)):
                trackActiveUIDs = mapToNewPos(trackActiveUIDs, bigtosmall)
                trackEmptyUIDs = mapToNewPos(trackEmptyUIDs, bigtosmall)
                rho, betaK = reorder_rho(rho, bigtosmall)
                DocTopicCount = DocTopicCount[:, bigtosmall]
                didShuffle = 1
                tlabel = '_ts'

        # Update theta
        sumLogPiActiveVec, sumLogPiRemVec, LP = DocTopicCount_to_sumLogPi(
            rho=rho, omega=omega,
            DocTopicCount=DocTopicCount,
            alpha=alpha,
            gamma=gamma,
            **kwargs)
        # Show ELBO with freshly-optimized theta value.
        Ltro = evalELBOandPrint(
            rho=rho, omega=omega,
            DocTopicCount=DocTopicCount,
            theta=LP['theta'],
            thetaRem=LP['thetaRem'],
            nDoc=nDoc,
            sumLogPiActiveVec=sumLogPiActiveVec,
            sumLogPiRemVec=sumLogPiRemVec,
            alpha=alpha,
            gamma=gamma,
            f=None,
            msg=str(iterid) + tlabel,
        )
        LtroList.append(Ltro)
        if not LtroList[-1] >= LtroList[-2]:
            if didShuffle:
                print 'NOT MONOTONIC! just after theta update with SHUFFLE!'
            else:
                print 'NOT MONOTONIC! just after theta standard update'

        didELBODrop = 0
        if canShuffle:
            if canShuffle.lower().count('bysumlogpi'):
                bigtosmall = argsort_bigtosmall_stable(
                    sumLogPiActiveVec)
            elif canShuffle.lower().count('bycounts'):
                bigtosmall = argsort_bigtosmall_stable(
                    DocTopicCount.sum(axis=0))
            elif canShuffle.lower().count('byusage'):
                estPi = DocTopicCount / \
                    DocTopicCount.sum(axis=1)[:, np.newaxis]
                avgPi = np.sum(estPi, axis=0)
                bigtosmall = argsort_bigtosmall_stable(avgPi)
            else:
                bigtosmall = np.arange(K)
            if not np.allclose(bigtosmall, np.arange(K)):
                trackActiveUIDs = mapToNewPos(trackActiveUIDs, bigtosmall)
                trackEmptyUIDs = mapToNewPos(trackEmptyUIDs, bigtosmall)
                rho, betaK = reorder_rho(rho, bigtosmall)
                sumLogPiActiveVec = sumLogPiActiveVec[bigtosmall]
                DocTopicCount = DocTopicCount[:, bigtosmall]
                LP['theta'] = LP['theta'][:, bigtosmall]
                didShuffle = 1
                # Show ELBO with freshly-shuffled order.
                Ltro = evalELBOandPrint(
                    rho=rho, omega=omega,
                    DocTopicCount=DocTopicCount,
                    theta=LP['theta'],
                    thetaRem=LP['thetaRem'],
                    nDoc=nDoc,
                    sumLogPiActiveVec=sumLogPiActiveVec,
                    sumLogPiRemVec=sumLogPiRemVec,
                    alpha=alpha,
                    gamma=gamma,
                    f=None,
                    msg=str(iterid) + "_ss",
                )
                LtroList.append(Ltro)
                if not LtroList[-1] >= LtroList[-2]:
                    print 'NOT MONOTONIC! just after %s shuffle update!' % (
                        canShuffle)
                    didELBODrop = 1
        prevrho[:] = rho
        # Update rho and omega
        if warmStart_rho:
            initrho = rho
        else:
            initrho = None
        rho, omega, f, Info = OptimizerRhoOmegaBetter.\
            find_optimum_multiple_tries(
                alpha=alpha,
                gamma=gamma,
                sumLogPiActiveVec=sumLogPiActiveVec,
                sumLogPiRemVec=sumLogPiRemVec,
                nDoc=nDoc,
                initrho=initrho,
                initomega=omega,
                approx_grad=1,
                do_grad_omega=0,
            )
        prevbetaK[:] = betaK
        betaK = rho2beta(rho, returnSize="K")
        # Show ELBO with freshly-optimized rho value.
        Ltro = evalELBOandPrint(
            rho=rho, omega=omega,
            DocTopicCount=DocTopicCount,
            theta=LP['theta'],
            thetaRem=LP['thetaRem'],
            nDoc=nDoc,
            sumLogPiActiveVec=sumLogPiActiveVec,
            sumLogPiRemVec=sumLogPiRemVec,
            alpha=alpha,
            gamma=gamma,
            f=f,
            msg=str(iterid) + "_r",
        )
        LtroList.append(Ltro)
        if not LtroList[-1] >= LtroList[-2]:
            print 'NOT MONOTONIC! just after rho update!'
        if didELBODrop:
            if LtroList[-1] >= LtroList[-3]:
                print 'Phew. Combined update of sorting then optimizing rho OK'
            else:
                print 'WHOA! Combined update of sorting then' + \
                    ' optimizing rho NOT MONOTONIC'

    # Take one final snapshot after the loop finishes
    Snapshots['Lscore'].append(Ltro)
    Snapshots['DTCSum'].append(DocTopicCount.sum(axis=0))
    Snapshots['DTCUsage'].append((DocTopicCount > 0.001).sum(axis=0))
    Snapshots['beta'].append(betaK)
    Snapshots['pos_trackActive'].append(trackActiveUIDs)
    Snapshots['pos_trackEmpty'].append(trackEmptyUIDs)
    Snapshots['beta_trackActive'].append(betaK[trackActiveUIDs])
    Snapshots['beta_trackEmpty'].append(betaK[trackEmptyUIDs])
    Snapshots['beta_trackRem'].append(1.0 - betaK.sum())
    Snapshots['count_trackActive'].append(
        DocTopicCount.sum(axis=0)[trackActiveUIDs])
    Snapshots['count_trackEmpty'].append(
        DocTopicCount.sum(axis=0)[trackEmptyUIDs])

    print '\nEmpty cluster ids (%d of %d)' % (
        len(trackEmptyUIDs), len(emptyUIDs))
    print '-----------------'
    print ' '.join(['% 10d' % (x) for x in trackEmptyUIDs])

    print '\nSelected active clusters to track'
    print '---------------------------------'
    print ' '.join(['% 10d' % (x) for x in trackActiveUIDs])
    print ' '.join(['% .3e' % (x) for x in avgPi[trackActiveUIDs]])

    print '\nDocTopicCount for %d of %d docs' % (nDocToDisplay, nDoc)
    print '---------------------------------'
    for n in range(nDocToDisplay):
        print ' '.join([
            '% 9.2f' % (x) for x in DocTopicCount[n, trackActiveUIDs]])

    print '\nFinal sumLogPiActiveVec'
    print '---------------------------------'
    print ' '.join([
        '% .3e' % (x) for x in sumLogPiActiveVec[trackActiveUIDs]])
    print 'is sumLogPiActiveVec sorted?', \
        is_sorted_bigtosmall(sumLogPiActiveVec)
    return rho, omega, Snapshots
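# A hypothetical usage sketch of learn_rhoomega_fromFixedCounts on synthetic
# counts. The alpha/gamma/maxiter values are illustrative only, and this
# assumes the module-level helpers named above are available as in the rest
# of this file.
if __name__ == '__main__':
    PRNG = np.random.RandomState(42)
    nDoc, K = 20, 10
    # Synthetic doc-topic counts, with the last two topics left empty
    DocTopicCount = PRNG.gamma(1.0, 10.0, size=(nDoc, K))
    DocTopicCount[:, -2:] = 0.0
    rho, omega, Snapshots = learn_rhoomega_fromFixedCounts(
        DocTopicCount=DocTopicCount,
        nDoc=nDoc,
        alpha=0.5,
        gamma=10.0,
        maxiter=3,
        canShuffleInit='byUsage',
        canShuffle='bysumlogpi')
    print 'Lscore trace:', Snapshots['Lscore']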