def pprintCountVec(SS, uids=initSS.uids, cleanupMassRemoved=None,
                   cleanupSizeThr=None, uidpairsToAccept=None):
    ''' Print one line of per-uid count strings for the given SS.

    For each uid in `uids` (in order) one field is written:
    * the formatted count of that uid, if SS can resolve it, or
    * ' m' followed by uidA (3 wide) when the uid cannot be resolved and
      `uidpairsToAccept` contains a pair (uidA, uid), or
    * a blank placeholder otherwise.

    Args
    ----
    SS : object providing uid2k(uid) and getCountVec()
    uids : iterable of uids to display.
        NOTE: the default `initSS.uids` is evaluated once, when this `def`
        statement executes, so `initSS` must exist in the enclosing scope.
    cleanupMassRemoved : unused; kept for call compatibility.
    cleanupSizeThr : if truthy, a note with this threshold is appended.
    uidpairsToAccept : optional iterable of (uidA, uidB) pairs.

    Post Condition
    --------------
    The assembled line is passed to `pprint`. Nothing is returned.
    '''
    emptyVal = ' '
    fields = []
    for uid in uids:
        try:
            k = SS.uid2k(uid)
            fields.append(' ' + count2str(SS.getCountVec()[k]))
        except Exception:
            # This uid could not be looked up / formatted from SS.
            # Only ordinary errors are handled here, so KeyboardInterrupt
            # and SystemExit still propagate to the caller.
            for uidA, uidB in (uidpairsToAccept or ()):
                if uidB == uid:
                    fields.append(' m' + '%3d' % (uidA))
                    break
            else:
                # No accepted pair mentions this uid as its second member.
                fields.append(emptyVal)
    s = ''.join(fields)
    if cleanupSizeThr:
        s += " (removed comps below minimum size of %.2f)" % (
            cleanupSizeThr)
    pprint(' ' + s)
def selectCandidateMergePairs(
        hmodel, SS,
        MovePlans=None,
        MoveRecordsByUID=None,
        lapFrac=None,
        m_maxNumPairsContainingComp=3,
        m_minPercChangeInNumAtomsToReactivate=0.01,
        m_nLapToReactivate=10,
        m_pair_ranking_procedure='total_size',
        m_pair_ranking_direction='descending',
        m_pair_ranking_do_exclude_by_thr=0,
        m_pair_ranking_exclusion_thr=-0.000001,
        **kwargs):
    ''' Select candidate pairs to consider for merge move.

    Args
    ----
    hmodel : model object. Only used by 'elbo' ranking procedures, which
        call hmodel.obsModel.calcHardMergeGap_SpecificPairs,
        hmodel.obsModel.getDatasetScale and, when available,
        hmodel.allocModel.calcHardMergeGap_SpecificPairs.
    SS : object providing K, uids and getCountForUID(uid)
    MovePlans : dict, optional
        If it has key 'b_shortlistUIDs', those uids are excluded from merges.
    MoveRecordsByUID : dict, optional
        Records of past failed merges, keyed by sorted (uidA, uidB) tuple.
        Each record is read for fields 'm_nFailRecent', 'm_latestMinCount'
        and 'm_latestLap'. MODIFIED IN PLACE: the record of any pair that
        becomes eligible again is deleted.
    lapFrac : number
        Current lap. Required in practice (used for logging, for the
        re-activation test and as seed of the 'random' procedure).
    m_maxNumPairsContainingComp : int
        Max number of selected pairs any single uid may take part in.
    m_minPercChangeInNumAtomsToReactivate : float
        A previously failed pair is eligible again when the relative change
        of its smaller count since the failure is at least this large.
    m_nLapToReactivate : number
        A previously failed pair is eligible again after this many laps.
    m_pair_ranking_procedure : str
        'random', 'total_size', or any string containing 'elbo'.
    m_pair_ranking_direction : str
        'ascending' ranks lowest score first; any other value ranks
        highest score first.
    m_pair_ranking_do_exclude_by_thr : bool-like
        If truthy, keep only pairs whose score is strictly below
        (ascending) or above (otherwise) m_pair_ranking_exclusion_thr.
    m_pair_ranking_exclusion_thr : float
    kwargs : ignored

    Returns
    -------
    Info : dict, with fields
        * m_UIDPairs : list of tuples, each defining a pair of uids
        * m_GainVals : list of ranking scores, aligned with m_UIDPairs
        * mPairIDs : list of (kA, kB) position tuples, aligned with
            m_UIDPairs
        * m_targetUIDSet : set of all uids involved in a proposed merge pair

    Raises
    ------
    ValueError
        If m_pair_ranking_procedure is not recognised.
    '''
    # Fresh containers per call; avoids sharing one default dict across calls.
    if MovePlans is None:
        MovePlans = dict()
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()

    MLogger.pprint(
        "PLANNING merges at lap %.2f. K=%d" % (lapFrac, SS.K),
        'debug')

    # Mark any targetUIDs used in births as off-limits for merges
    uidUsageCount = defaultdict(int)
    if 'b_shortlistUIDs' in MovePlans:
        for uid in MovePlans['b_shortlistUIDs']:
            uidUsageCount[uid] = 10 * m_maxNumPairsContainingComp
    nDisqualified = len(uidUsageCount)
    MLogger.pprint(
        " %d/%d UIDs ineligible because on shortlist for births. " % (
            nDisqualified, SS.K),
        'debug')
    if nDisqualified > 0:
        MLogger.pprint(
            " Ineligible UIDs:" + vec2str(uidUsageCount.keys()),
            'debug')

    uid2count = {uid: SS.getCountForUID(uid) for uid in SS.uids}

    # Enumerate every unordered pair of comps and keep the eligible ones.
    # UID pairs are stored sorted by uid value (the key convention of
    # MoveRecordsByUID), while AID pairs are positions (kA < kB) in SS.uids.
    # NOTE(review): if SS.uids is not increasing, the member order of a
    # UID pair can differ from that of its AID pair -- confirm with callers.
    EligibleUIDPairs = list()
    EligibleAIDPairs = list()
    nPairTotal = 0
    nPairDQ = 0
    for kA, uidA in enumerate(SS.uids):
        for kB, uidB in enumerate(SS.uids[kA + 1:], start=kA + 1):
            nPairTotal += 1
            if uidUsageCount[uidA] > 0 or uidUsageCount[uidB] > 0:
                # At least one member is reserved for a birth move.
                continue
            uidTuple = (uidA, uidB) if uidA < uidB else (uidB, uidA)
            aidTuple = (kA, kB)

            if uidTuple in MoveRecordsByUID:
                # This pair failed before. Re-activate it only if enough
                # laps have passed or its smaller comp changed size enough.
                pairRecord = MoveRecordsByUID[uidTuple]
                assert pairRecord['m_nFailRecent'] >= 1
                latestMinCount = pairRecord['m_latestMinCount']
                newMinCount = np.minimum(uid2count[uidA], uid2count[uidB])
                percDiff = np.abs(latestMinCount - newMinCount) / \
                    latestMinCount
                nLapSinceFail = lapFrac - pairRecord['m_latestLap']
                if (nLapSinceFail < m_nLapToReactivate and
                        percDiff < m_minPercChangeInNumAtomsToReactivate):
                    nPairDQ += 1
                    continue
                del MoveRecordsByUID[uidTuple]
            EligibleUIDPairs.append(uidTuple)
            EligibleAIDPairs.append(aidTuple)

    MLogger.pprint(
        " %d/%d pairs eligible. %d disqualified by past failures." % (
            len(EligibleAIDPairs), nPairTotal, nPairDQ),
        'debug')
    MLogger.pprint(
        " Prioritizing elible pairs via ranking procedure: %s" % (
            m_pair_ranking_procedure),
        'debug')

    # Compute one ranking score per eligible pair
    if m_pair_ranking_procedure == 'random':
        # RandomState only accepts integer seeds; lapFrac is usually a float.
        prng = np.random.RandomState(int(lapFrac))
        rank_scores_per_pair = prng.permutation(
            np.arange(len(EligibleAIDPairs)))
    elif m_pair_ranking_procedure == 'total_size':
        rank_scores_per_pair = np.asarray([
            uid2count[uidA] + uid2count[uidB]
            for (uidA, uidB) in EligibleUIDPairs])
    elif 'elbo' in m_pair_ranking_procedure:
        # Compute Ldata gain for each possible pair of comps
        rank_scores_per_pair = \
            hmodel.obsModel.calcHardMergeGap_SpecificPairs(
                SS, EligibleAIDPairs)
        if hasattr(hmodel.allocModel, 'calcHardMergeGap_SpecificPairs'):
            rank_scores_per_pair = rank_scores_per_pair + \
                hmodel.allocModel.calcHardMergeGap_SpecificPairs(
                    SS, EligibleAIDPairs)
        rank_scores_per_pair /= hmodel.obsModel.getDatasetScale(SS)
    else:
        raise ValueError(
            "Unrecognised --m_pair_ranking_procedure: %s"
            % m_pair_ranking_procedure)

    # Order pair locations by score, optionally dropping pairs that
    # fall on the wrong side of the exclusion threshold.
    if m_pair_ranking_direction == 'ascending':
        sort_keys = rank_scores_per_pair
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score < %.3e" % (
                    m_pair_ranking_exclusion_thr),
                'debug')
            keep_mask = rank_scores_per_pair < m_pair_ranking_exclusion_thr
    else:
        sort_keys = -1 * rank_scores_per_pair
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score > %.3e" % (
                    m_pair_ranking_exclusion_thr),
                'debug')
            keep_mask = rank_scores_per_pair > m_pair_ranking_exclusion_thr
    if m_pair_ranking_do_exclude_by_thr:
        keep_pair_ids = np.flatnonzero(keep_mask)
        ranked_pair_locs = keep_pair_ids[np.argsort(sort_keys[keep_pair_ids])]
    else:
        ranked_pair_locs = np.argsort(sort_keys)

    # Walk pairs in ranked order, accepting a pair only while both of its
    # members are below the per-comp usage limit.
    mUIDPairs = list()
    mAIDPairs = list()
    mGainVals = list()
    for loc in ranked_pair_locs:
        uidA, uidB = EligibleUIDPairs[loc]
        kA, kB = EligibleAIDPairs[loc]
        if uidUsageCount[uidA] >= m_maxNumPairsContainingComp or \
                uidUsageCount[uidB] >= m_maxNumPairsContainingComp:
            continue
        uidUsageCount[uidA] += 1
        uidUsageCount[uidB] += 1
        if not mUIDPairs:
            # Header line, written once before the first chosen pair.
            MLogger.pprint("Chosen uid pairs:", 'debug')
        mAIDPairs.append((kA, kB))
        mUIDPairs.append((uidA, uidB))
        mGainVals.append(rank_scores_per_pair[loc])
        MLogger.pprint(
            "%4d, %4d : pair_score %.3e, size %s %s" % (
                uidA, uidB, rank_scores_per_pair[loc],
                count2str(uid2count[uidA]),
                count2str(uid2count[uidB]),
            ),
            'debug')

    Info = dict()
    Info['m_UIDPairs'] = mUIDPairs
    Info['m_GainVals'] = mGainVals
    Info['mPairIDs'] = mAIDPairs
    targetUIDs = set()
    for uidA, uidB in mUIDPairs:
        targetUIDs.add(uidA)
        targetUIDs.add(uidB)
        # Sanity check: no chosen uid may be on the birth shortlist.
        if 'b_shortlistUIDs' in MovePlans:
            for uid in MovePlans['b_shortlistUIDs']:
                assert uid != uidA
                assert uid != uidB
    Info['m_targetUIDSet'] = targetUIDs
    return Info