Python vec2str примеры использования

Язык программирования: Python

Пространство имен/Пакет: viz.PrintTopics

Метод/Функция: vec2str

Примеров на hotexamples.com: 6

Найдено 6 примеров использования vec2str на Python. Ниже приведены лучшие примеры кода на Python для viz.PrintTopics.vec2str, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Пример #1

Показать файл

Файл: DPlanner.py Проект: zhaottcrystal/bnpy

def selectCandidateDeleteComps(hmodel,
                               SS,
                               MoveRecordsByUID=None,
                               MovePlans=None,
                               lapFrac=0,
                               **DArgs):
    ''' Select specific comps to target with delete move.

    Every uid in SS.uids is screened. A uid is skipped as a target when
    its count exceeds DArgs['d_maxNumAtomsForTargetComp'], or when it has
    a recorded recent delete failure and has neither changed size enough
    nor waited enough laps to be retried. Among the remaining eligible
    uids, the one with the largest count is chosen as the single target.

    Args
    ----
    hmodel : not used inside this function
    SS : object providing K, uids, uid2k(uid) and getCountVec()
    MoveRecordsByUID : dict or None
        Per-uid record dicts with fields 'd_latestLap', 'd_nFailRecent',
        'd_latestCount'. Updated in-place: uids without a record
        get a fresh defaultdict(int). None means start with no records.
    MovePlans : dict or None
        Updated in-place with the fields listed under Returns.
        None means start from a fresh empty dict.
    lapFrac : number, current lap (used for logging and retry timing)
    DArgs : keyword options. Must contain
        * d_maxNumAtomsForTargetComp
        * d_minPercChangeInNumAtomsToReactivate
        * d_nLapToReactivate

    Returns
    -------
    MovePlans : dict, with fields
    * d_targetUIDs : list of ints
    * d_absorbingUIDSet : set of ints, all uids that can absorb target mass
    OR
    a new dict with the single field
    * failMsg : string explaining why building list of eligible UIDs failed
    '''
    # Fresh containers per call. Using dict() as a default argument value
    # would share (and silently accumulate) state across calls, because
    # both containers are mutated below.
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()
    if MovePlans is None:
        MovePlans = dict()

    DLogger.pprint("PLANNING delete at lap %.2f" % (lapFrac))
    K = SS.K

    availableUIDs = set(SS.uids)
    if len(availableUIDs) < 2:
        DLogger.pprint(
            "Delete proposal requires at least 2 available UIDs.\n" + \
            "   Need 1 uid to target, and at least 1 to absorb." + \
            "   Only have %d total uids in the model." % (len(availableUIDs)))
        failMsg = "Ineligible. Did not find >= 2 UIDs in entire model."
        return dict(failMsg=failMsg)

    # NOTE: excluding uids already claimed by merge plans ('m_UIDPairs')
    # or shortlisted for birth ('b_shortlistUIDs') is currently disabled.
    # This set therefore stays empty, and the "busy" counts reported
    # below are always zero.
    uidsBusyWithOtherMoves = set()

    # Screen each available uid for eligibility as a delete target
    countVec = np.maximum(SS.getCountVec(), 1e-100)
    eligibleUIDs = list()
    tooBigUIDs = list()
    failRecordUIDs = list()
    nReactivated = 0
    for uid in availableUIDs:
        k = SS.uid2k(uid)
        size = countVec[k]
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)

        # Skip ahead if this cluster is too big
        if size > DArgs['d_maxNumAtomsForTargetComp']:
            tooBigUIDs.append(uid)
            continue
        # Avoid comps we've failed deleting in the past
        # unless they have changed by a reasonable amount
        # or enough laps have passed to try again
        lapsSinceLastTry = lapFrac - MoveRecordsByUID[uid]['d_latestLap']
        hasRecentFail = MoveRecordsByUID[uid]['d_nFailRecent'] > 0
        oldsize = MoveRecordsByUID[uid]['d_latestCount']
        if oldsize > 0 and hasRecentFail:
            # Relative size change since the last attempt;
            # the tiny constant guards against division by zero.
            sizePercDiff = np.abs(size - oldsize) / (1e-100 + np.abs(oldsize))
            if sizePercDiff > DArgs['d_minPercChangeInNumAtomsToReactivate']:
                nReactivated += 1
            elif DArgs['d_nLapToReactivate'] > 0 \
                    and lapsSinceLastTry > DArgs['d_nLapToReactivate']:
                nReactivated += 1
            else:
                failRecordUIDs.append(uid)
                continue
        # If we make it here, the uid is eligible
        eligibleUIDs.append(uid)

    # Log which uids were busy with other moves
    msg = "%d/%d UIDs busy with other moves (birth/merge)" % (
        len(uidsBusyWithOtherMoves), K)
    DLogger.pprint(msg)
    if len(uidsBusyWithOtherMoves) > 0:
        DLogger.pprint('  ' + vec2str(uidsBusyWithOtherMoves), 'debug')

    # Log which uids were too large to be targeted
    msg = "%d/%d UIDs too large [--d_maxNumAtomsForTargetComp %.2f]" % (
        len(tooBigUIDs), K, DArgs['d_maxNumAtomsForTargetComp'])
    DLogger.pprint(msg)
    if len(tooBigUIDs) > 0:
        DLogger.pprint('  ' + vec2str(tooBigUIDs), 'debug')

    # Log which uids were excluded because of a past failure record
    msg = '%d/%d UIDs un-deleteable for past failures. %d reactivated.' % (
        len(failRecordUIDs), K, nReactivated)
    DLogger.pprint(msg)
    if len(failRecordUIDs) > 0:
        DLogger.pprint('  ' + vec2str(failRecordUIDs), 'debug')
    # Log all remaining eligible uids
    msg = '%d/%d UIDs eligible for targeted delete proposal' % (
        len(eligibleUIDs), K)
    DLogger.pprint(msg)
    if len(eligibleUIDs) == 0:
        failMsg = ("Empty plan. 0 UIDs eligible as delete target." + \
            " %d too busy with other moves." + \
            " %d too big." + \
            " %d have past failures.") % (
                len(uidsBusyWithOtherMoves),
                len(tooBigUIDs),
                len(failRecordUIDs))
        return dict(failMsg=failMsg)

    # Log count statistics for each uid
    eligibleCountVec = [countVec[SS.uid2k(u)] for u in eligibleUIDs]
    DLogger.pprint(' uid   ' + vec2str(eligibleUIDs), 'debug')
    DLogger.pprint(' count ' + vec2str(eligibleCountVec), 'debug')

    # Select the single state to target:
    # the eligible uid with the largest count.
    targetUID = eligibleUIDs[np.argmax(eligibleCountVec)]
    MovePlans['d_targetUIDs'] = [targetUID]

    # Determine all comps eligible to receive its transfer mass:
    # every other eligible uid, plus those rejected only as *targets*
    # (too big, or with a past failure record).
    absorbUIDset = set(eligibleUIDs)
    absorbUIDset.discard(targetUID)
    absorbUIDset.update(tooBigUIDs)
    absorbUIDset.update(failRecordUIDs)
    MovePlans['d_absorbingUIDSet'] = absorbUIDset

    DLogger.pprint('Selecting one single state to target.')
    DLogger.pprint('targetUID ' + str(targetUID))
    DLogger.pprint('absorbingUIDs: ' + vec2str(absorbUIDset))
    return MovePlans

Пример #2

Показать файл

def makeSummaryForBirthProposal(Dslice,
                                curModel,
                                curLPslice,
                                curSSwhole=None,
                                b_creationProposalName='bregmankmeans',
                                targetUID=None,
                                ktarget=None,
                                newUIDs=None,
                                LPkwargs=DefaultLPkwargs,
                                lapFrac=0,
                                batchID=0,
                                seed=0,
                                b_nRefineSteps=3,
                                b_debugOutputDir=None,
                                b_minNumAtomsForNewComp=None,
                                b_doInitCompleteLP=1,
                                b_cleanupWithMerge=1,
                                b_method_initCoordAscent='fromprevious',
                                vocabList=None,
                                **kwargs):
    ''' Create summary that reassigns mass from target to Kfresh new comps.

    Steps performed:
    1. Initialize stats for new comps via initSS_BregmanDiv.
    2. Optionally (b_doInitCompleteLP) expand the hard initialization
       via makeExpansionSSFromZ.
    3. Run up to b_nRefineSteps restricted local steps, interleaved with
       cleanup that deletes small clusters and (b_cleanupWithMerge)
       merges clusters.
    4. Verify the proposal has at least two comps of sufficient mass and
       that total mass matches the mass assigned to ktarget.

    TODO support other options than bregman???

    Args
    ----
    targetUID, ktarget : at least one must be provided;
        the other is looked up from curSSwhole.
    newUIDs : sequence of uids for new comps. Truncated when
        len(newUIDs) + curSSwhole.K would exceed kwargs['Kmax'].
    b_debugOutputDir : str or None
        If truthy, diagnostic plots are written into this directory.
        The string 'None' is treated the same as None.
    kwargs : must contain 'Kmax', 'b_NiterForBregmanKMeans' and
        'b_method_xPi'. Also forwarded to the helper functions.

    Returns
    -------
    xSSslice : SuffStatBag
        Contains exact summaries for reassignment of target mass.
        * Total mass is equal to mass assigned to ktarget in curLPslice
        * Number of components is Kfresh
        None if the proposal was cancelled or failed.
    Info : dict
        Contains info for detailed debugging of construction process.
        On failure, field 'errorMsg' explains why.

    Raises
    ------
    AssertionError : if mass is not conserved by the proposal.
    '''
    # Parse input to decide which cluster to target
    # * targetUID is the unique ID of this cluster
    # * ktarget is its position in the current cluster ordering
    if targetUID is None:
        targetUID = curSSwhole.k2uid(ktarget)
    if ktarget is None:
        ktarget = curSSwhole.uid2k(targetUID)
    # START log for this birth proposal
    BLogger.pprint(
        'Creating proposal for targetUID %s at lap %.2f batchID %d' %
        (targetUID, lapFrac, batchID))
    # Grab vocabList, if available.
    if hasattr(Dslice, 'vocabList') and Dslice.vocabList is not None:
        vocabList = Dslice.vocabList
    # Parse input to decide where to save HTML output
    if b_debugOutputDir == 'None':
        b_debugOutputDir = None
    if b_debugOutputDir:
        BLogger.pprint('HTML output:' + b_debugOutputDir)
        # Create snapshot of current model comps
        plotCompsFromSS(curModel,
                        curSSwhole,
                        os.path.join(b_debugOutputDir, 'OrigComps.png'),
                        vocabList=vocabList,
                        compsToHighlight=[ktarget])

    # Determine exactly how many new states we can make...
    xK = len(newUIDs)
    if xK + curSSwhole.K > kwargs['Kmax']:
        # Shrink the proposal to fit within the budget of Kmax comps
        xK = kwargs['Kmax'] - curSSwhole.K
        newUIDs = newUIDs[:xK]
        if xK <= 1:
            errorMsg = 'Cancelled.' + \
                'Adding 2 or more states would exceed budget of %d comps.' % (
                    kwargs['Kmax'])
            BLogger.pprint(errorMsg)
            BLogger.pprint('')
            return None, dict(errorMsg=errorMsg)
    # Create suff stats for some new states
    xInitSStarget, Info = initSS_BregmanDiv(
        Dslice,
        curModel,
        curLPslice,
        K=xK,
        ktarget=ktarget,
        lapFrac=lapFrac,
        seed=seed + int(1000 * lapFrac),
        logFunc=BLogger.pprint,
        NiterForBregmanKMeans=kwargs['b_NiterForBregmanKMeans'],
        **kwargs)
    # EXIT EARLY: if proposal initialization fails (not enough data).
    if xInitSStarget is None:
        BLogger.pprint('Proposal initialization FAILED. ' + \
                       Info['errorMsg'])
        BLogger.pprint('')
        return None, Info

    # If here, we have a valid set of initial stats.
    xInitSStarget.setUIDs(newUIDs[:xInitSStarget.K])
    if b_doInitCompleteLP:
        # Create valid whole-dataset clustering from hard init
        xInitSSslice, tempInfo = makeExpansionSSFromZ(
            Dslice=Dslice,
            curModel=curModel,
            curLPslice=curLPslice,
            ktarget=ktarget,
            xInitSS=xInitSStarget,
            atomType=Info['atomType'],
            targetZ=Info['targetZ'],
            chosenDataIDs=Info['chosenDataIDs'],
            **kwargs)
        Info.update(tempInfo)

        xSSslice = xInitSSslice
    else:
        xSSslice = xInitSStarget

    # Debug-only bookkeeping. The names defined in this branch
    # (curModelFWD, curLdict, propLdictList, docUsageByUID) are only
    # read later inside other `if b_debugOutputDir:` branches.
    if b_debugOutputDir:
        plotCompsFromSS(curModel,
                        xSSslice,
                        os.path.join(b_debugOutputDir, 'NewComps_Init.png'),
                        vocabList=vocabList)

        # Determine current model objective score
        curModelFWD = curModel.copy()
        curModelFWD.update_global_params(SS=curSSwhole)
        curLdict = curModelFWD.calc_evidence(SS=curSSwhole, todict=1)
        # Track proposal ELBOs as refinement improves things
        propLdictList = list()
        # Create initial proposal
        if b_doInitCompleteLP:
            propSS = curSSwhole.copy()
            propSS.transferMassFromExistingToExpansion(uid=targetUID,
                                                       xSS=xSSslice)
            # Verify quality
            assert np.allclose(propSS.getCountVec().sum(),
                               curSSwhole.getCountVec().sum())
            propModel = curModel.copy()
            propModel.update_global_params(propSS)
            propLdict = propModel.calc_evidence(SS=propSS, todict=1)
            BLogger.pprint(
                "init %d/%d  gainL % .3e  propL % .3e  curL % .3e" %
                (0, b_nRefineSteps, propLdict['Ltotal'] - curLdict['Ltotal'],
                 propLdict['Ltotal'], curLdict['Ltotal']))
            propLdictList.append(propLdict)

        docUsageByUID = dict()
        if curModel.getAllocModelName().count('HDP'):
            for k, uid in enumerate(xInitSStarget.uids):
                if 'targetZ' in Info:
                    if Info['atomType'].count('doc'):
                        initDocUsage_uid = np.sum(Info['targetZ'] == k)
                    else:
                        # Count the docs that have any atom assigned to k.
                        # `range` (not `xrange`) so this also runs
                        # on Python 3.
                        initDocUsage_uid = 0.0
                        for d in range(Dslice.nDoc):
                            start = Dslice.doc_range[d]
                            stop = Dslice.doc_range[d + 1]
                            initDocUsage_uid += np.any(
                                Info['targetZ'][start:stop] == k)
                else:
                    initDocUsage_uid = 0.0
                docUsageByUID[uid] = [initDocUsage_uid]

    # Create initial observation model
    xObsModel = curModel.obsModel.copy()

    if b_method_initCoordAscent == 'fromprevious' and 'xLPslice' in Info:
        xInitLPslice = Info['xLPslice']
    else:
        xInitLPslice = None

    # Make a function to pretty-print counts as we refine the initialization
    pprintCountVec = BLogger.makeFunctionToPrettyPrintCounts(xSSslice)
    BLogger.pprint("   " + vec2str(xInitSStarget.uids))
    pprintCountVec(xSSslice)

    # Log messages to describe the initialization.
    BLogger.pprint(' Running %d refinement iterations (--b_nRefineSteps)' %
                   (b_nRefineSteps))
    prevCountVec = xSSslice.getCountVec()
    didConvEarly = False
    # Sentinel larger than any possible rstep; overwritten with the actual
    # step index if early convergence is detected.
    convstep = 100 + b_nRefineSteps
    # Run several refinement steps.
    # Each step does a restricted local step to improve
    # the proposed cluster assignments.
    for rstep in range(b_nRefineSteps):
        # Update xObsModel
        xObsModel.update_global_params(xSSslice)

        # Restricted local step!
        # * xInitSS : specifies obs-model stats used for initialization
        xSSslice, refineInfo = summarizeRestrictedLocalStep(
            Dslice=Dslice,
            curModel=curModel,
            curLPslice=curLPslice,
            curSSwhole=curSSwhole,
            ktarget=ktarget,
            xUIDs=xSSslice.uids,
            xInitSS=xSSslice,
            xObsModel=xObsModel,
            xInitLPslice=xInitLPslice,
            LPkwargs=LPkwargs,
            nUpdateSteps=1,
            **kwargs)
        Info.update(refineInfo)
        # Get most recent xLPslice for initialization
        if b_method_initCoordAscent == 'fromprevious' and 'xLPslice' in Info:
            xInitLPslice = Info['xLPslice']
        # On first step, show diagnostics for new states
        if rstep == 0:
            targetPi = refineInfo['emptyPi'] + refineInfo['xPiVec'].sum()
            BLogger.pprint(
                " target prob redistributed by policy %s (--b_method_xPi)" %
                (kwargs['b_method_xPi']))
            msg = " pi[ktarget] before %.4f  after %.4f." % (
                targetPi, refineInfo['emptyPi'])
            BLogger.pprint(msg)
            BLogger.pprint(" pi[new comps]: "  + \
                vec2str(
                    refineInfo['xPiVec'],
                    width=6, minVal=0.0001))
            logLPConvergenceDiagnostics(refineInfo,
                                        rstep=rstep,
                                        b_nRefineSteps=b_nRefineSteps)
            BLogger.pprint("   " + vec2str(xInitSStarget.uids))
        # Show diagnostic counts in each fresh state
        pprintCountVec(xSSslice)
        # Write HTML debug info
        if b_debugOutputDir:
            plotCompsFromSS(curModel,
                            xSSslice,
                            os.path.join(b_debugOutputDir,
                                         'NewComps_Step%d.png' % (rstep + 1)),
                            vocabList=vocabList)
            propSS = curSSwhole.copy()
            propSS.transferMassFromExistingToExpansion(uid=targetUID,
                                                       xSS=xSSslice)
            # Reordering only lifts score by small amount. Not worth it.
            # propSS.reorderComps(np.argsort(-1 * propSS.getCountVec()))
            propModel = curModel.copy()
            propModel.update_global_params(propSS)
            propLdict = propModel.calc_evidence(SS=propSS, todict=1)

            # Data-term objective of the expanded comps on their own ...
            propSSsubset = xSSslice
            tmpModel = curModelFWD
            tmpModel.obsModel.update_global_params(propSSsubset)
            propLdata_subset = tmpModel.obsModel.calcELBO_Memoized(
                propSSsubset)

            # ... versus the same mass collapsed into one single comp.
            curSSsubset = xSSslice.copy(includeELBOTerms=0)
            while curSSsubset.K > 1:
                curSSsubset.mergeComps(0, 1)
            tmpModel.obsModel.update_global_params(curSSsubset)
            curLdata_subset = tmpModel.obsModel.calcELBO_Memoized(curSSsubset)
            gainLdata_subset = propLdata_subset - curLdata_subset
            msg = \
                "step %d/%d  gainL % .3e  propL % .3e  curL % .3e" % (
                    rstep+1, b_nRefineSteps,
                    propLdict['Ltotal'] - curLdict['Ltotal'],
                    propLdict['Ltotal'],
                    curLdict['Ltotal'])
            msg += "  gainLdata_subset % .3e" % (gainLdata_subset)
            BLogger.pprint(msg)
            propLdictList.append(propLdict)
            if curModel.getAllocModelName().count('HDP'):
                docUsageVec = xSSslice.getSelectionTerm('DocUsageCount')
                for k, uid in enumerate(xSSslice.uids):
                    docUsageByUID[uid].append(docUsageVec[k])
        # If converged early and did the final refinement step
        if didConvEarly and rstep > convstep:
            break
        # Cleanup by deleting small clusters
        if rstep < b_nRefineSteps - 1:
            if rstep == b_nRefineSteps - 2 or didConvEarly:
                # After all but last step,
                # delete small (but not empty) comps
                minNumAtomsToStay = b_minNumAtomsForNewComp
            else:
                # Always remove empty clusters. They waste our time.
                minNumAtomsToStay = np.minimum(1, b_minNumAtomsForNewComp)
            xSSslice, xInitLPslice = cleanupDeleteSmallClusters(
                xSSslice,
                minNumAtomsToStay,
                xInitLPslice=xInitLPslice,
                pprintCountVec=pprintCountVec)
        # Decide if we have converged early
        if rstep < b_nRefineSteps - 2 and prevCountVec.size == xSSslice.K:
            if np.allclose(xSSslice.getCountVec(), prevCountVec, atol=0.5):
                # Converged. Jump directly to the merge phase!
                didConvEarly = True
                convstep = rstep
        # Cleanup by merging clusters
        if b_cleanupWithMerge and \
                (rstep == b_nRefineSteps - 2 or didConvEarly):
            # Only cleanup on second-to-last pass, or if converged early
            Info['mergestep'] = rstep + 1
            xSSslice, xInitLPslice = cleanupMergeClusters(
                xSSslice,
                curModel,
                obsSSkeys=xInitSStarget._Fields._FieldDims.keys(),
                vocabList=vocabList,
                pprintCountVec=pprintCountVec,
                xInitLPslice=xInitLPslice,
                b_debugOutputDir=b_debugOutputDir,
                **kwargs)

        # Remember counts so the next pass can test for convergence
        prevCountVec = xSSslice.getCountVec().copy()

    Info['Kfinal'] = xSSslice.K
    if b_debugOutputDir:
        savefilename = os.path.join(b_debugOutputDir, 'ProposalTrace_ELBO.png')
        plotELBOtermsForProposal(curLdict,
                                 propLdictList,
                                 savefilename=savefilename)
        if curModel.getAllocModelName().count('HDP'):
            savefilename = os.path.join(b_debugOutputDir,
                                        'ProposalTrace_DocUsage.png')
            plotDocUsageForProposal(docUsageByUID, savefilename=savefilename)

    # EXIT EARLY: error if we didn't create enough "big-enough" states.
    nnzCount = np.sum(xSSslice.getCountVec() >= b_minNumAtomsForNewComp)
    if nnzCount < 2:
        Info['errorMsg'] = \
            "Could not create at least two comps" + \
            " with mass >= %.1f (--%s)" % (
                b_minNumAtomsForNewComp, 'b_minNumAtomsForNewComp')
        BLogger.pprint('Proposal build phase FAILED. ' + Info['errorMsg'])
        BLogger.pprint('')  # Blank line
        return None, Info

    # If here, we have a valid proposal.
    # Need to verify mass conservation
    if hasattr(Dslice, 'word_count') and \
            curModel.obsModel.DataAtomType.count('word') and \
            curModel.getObsModelName().count('Mult'):
        # Weight each atom's responsibility by its word count
        origMass = np.inner(Dslice.word_count, curLPslice['resp'][:, ktarget])
    else:
        if 'resp' in curLPslice:
            origMass = curLPslice['resp'][:, ktarget].sum()
        else:
            origMass = curLPslice['spR'][:, ktarget].sum()
    newMass = xSSslice.getCountVec().sum()
    assert np.allclose(newMass, origMass, atol=1e-6, rtol=0)
    BLogger.pprint('Proposal build phase DONE.' + \
        ' Created %d candidate clusters.' % (Info['Kfinal']))
    BLogger.pprint('')  # Blank line
    return xSSslice, Info

Пример #3

Показать файл

def makeSummaryForExistingBirthProposal(
        Dslice,
        curModel,
        curLPslice,
        curSSwhole=None,
        targetUID=None,
        ktarget=None,
        LPkwargs=DefaultLPkwargs,
        lapFrac=0,
        batchID=0,
        b_nRefineSteps=3,
        b_debugOutputDir=None,
        b_method_initCoordAscent='fromprevious',
        vocabList=None,
        **kwargs):
    ''' Create summary that reassigns mass from target given set of comps

    Given set of comps is a fixed proposal from a previously-seen batch,
    read from curSSwhole.propXSS[targetUID]. At least one restricted
    local step is always run, even if b_nRefineSteps < 1.

    Args
    ----
    targetUID, ktarget : at least one must be provided;
        the other is looked up from curSSwhole.
    b_debugOutputDir : str or None
        If truthy, diagnostic plots are written into this directory.
        The string 'None' is treated the same as None.
    kwargs : must contain 'Kmax' and 'b_Kfresh'.
        Also forwarded to summarizeRestrictedLocalStep.

    Returns
    -------
    xSSslice : SuffStatBag
        Contains exact summaries for reassignment of target mass.
        * Total mass is equal to mass assigned to ktarget in curLPslice
        None if the proposal was cancelled for exceeding kwargs['Kmax'].
    Info : dict
        Contains info for detailed debugging of construction process.
        On cancellation, field 'errorMsg' explains why.

    Raises
    ------
    AssertionError : if targetUID has no stored proposal in
        curSSwhole.propXSS, or if mass is not conserved.
    '''
    if targetUID is None:
        # Look up the uid at position ktarget. `uids` is a sequence,
        # not a callable, so use the k2uid accessor.
        targetUID = curSSwhole.k2uid(ktarget)
    if ktarget is None:
        ktarget = curSSwhole.uid2k(targetUID)
    # START log for this birth proposal
    BLogger.pprint(
        'Extending previous birth for targetUID %s at lap %.2f batch %d' %
        (targetUID, lapFrac, batchID))
    # Grab vocabList, if available.
    if hasattr(Dslice, 'vocabList') and Dslice.vocabList is not None:
        vocabList = Dslice.vocabList
    # Parse input to decide where to save HTML output
    if b_debugOutputDir == 'None':
        b_debugOutputDir = None
    if b_debugOutputDir:
        BLogger.pprint('HTML output:' + b_debugOutputDir)
        # Create snapshot of current model comps
        plotCompsFromSS(curModel,
                        curSSwhole,
                        os.path.join(b_debugOutputDir, 'OrigComps.png'),
                        vocabList=vocabList,
                        compsToHighlight=[ktarget])

    assert targetUID in curSSwhole.propXSS
    xinitSS = curSSwhole.propXSS[targetUID]
    xK = xinitSS.K
    if xK + curSSwhole.K > kwargs['Kmax']:
        errorMsg = 'Cancelled.' + \
            'Adding 2 or more states would exceed budget of %d comps.' % (
                kwargs['Kmax'])
        BLogger.pprint(errorMsg)
        BLogger.pprint('')
        return None, dict(errorMsg=errorMsg)

    # Log messages to describe the initialization.
    # Make a function to pretty-print counts as we refine the initialization
    pprintCountVec = BLogger.makeFunctionToPrettyPrintCounts(xinitSS)
    BLogger.pprint('  Using previous proposal with %d clusters %s.' %
                   (xinitSS.K, '(--b_Kfresh=%d)' % kwargs['b_Kfresh']))
    BLogger.pprint("  Initial uid/counts from previous proposal:")
    BLogger.pprint('   ' + vec2str(xinitSS.uids))
    pprintCountVec(xinitSS)
    BLogger.pprint('  Running %d refinement iterations (--b_nRefineSteps)' %
                   (b_nRefineSteps))

    # Running total: previous proposal's stats plus this slice's stats
    xSSinitPlusSlice = xinitSS.copy()
    # Debug-only bookkeeping. The names defined in this branch
    # (curLdict, propLdictList, docUsageByUID) are only read later
    # inside other `if b_debugOutputDir:` branches.
    if b_debugOutputDir:
        plotCompsFromSS(curModel,
                        xinitSS,
                        os.path.join(b_debugOutputDir, 'NewComps_Init.png'),
                        vocabList=vocabList)

        # Determine current model objective score
        curModelFWD = curModel.copy()
        curModelFWD.update_global_params(SS=curSSwhole)
        curLdict = curModelFWD.calc_evidence(SS=curSSwhole, todict=1)
        # Track proposal ELBOs as refinement improves things
        propLdictList = list()
        docUsageByUID = dict()
        if curModel.getAllocModelName().count('HDP'):
            for uid in xinitSS.uids:
                docUsageByUID[uid] = [0.0]

    # Create initial observation model
    xObsModel = curModel.obsModel.copy()
    xInitLPslice = None
    Info = dict()
    # Run several refinement steps.
    # Each step does a restricted local step to improve
    # the proposed cluster assignments.
    nRefineSteps = np.maximum(1, b_nRefineSteps)
    for rstep in range(nRefineSteps):
        xObsModel.update_global_params(xSSinitPlusSlice)

        # Restricted local step!
        # * xInitSS : specifies obs-model stats used for initialization
        xSSslice, refineInfo = summarizeRestrictedLocalStep(
            Dslice=Dslice,
            curModel=curModel,
            curLPslice=curLPslice,
            curSSwhole=curSSwhole,
            ktarget=ktarget,
            xUIDs=xSSinitPlusSlice.uids,
            xObsModel=xObsModel,
            xInitSS=xSSinitPlusSlice,  # first time in loop <= xinitSS
            xInitLPslice=xInitLPslice,
            LPkwargs=LPkwargs,
            **kwargs)

        # Swap this slice's previous contribution for its newest one,
        # so the running total never counts the slice more than once.
        xSSinitPlusSlice += xSSslice
        if rstep >= 1:
            xSSinitPlusSlice -= prevSSslice
        prevSSslice = xSSslice

        Info.update(refineInfo)
        # Show diagnostics for new states
        pprintCountVec(xSSslice)
        logLPConvergenceDiagnostics(refineInfo,
                                    rstep=rstep,
                                    b_nRefineSteps=b_nRefineSteps)
        # Get most recent xLPslice for initialization
        if b_method_initCoordAscent == 'fromprevious' and 'xLPslice' in Info:
            xInitLPslice = Info['xLPslice']
        if b_debugOutputDir:
            plotCompsFromSS(curModel,
                            xSSslice,
                            os.path.join(b_debugOutputDir,
                                         'NewComps_Step%d.png' % (rstep + 1)),
                            vocabList=vocabList)
            propSS = curSSwhole.copy()
            propSS.transferMassFromExistingToExpansion(uid=targetUID,
                                                       xSS=xSSslice)
            propModel = curModel.copy()
            propModel.update_global_params(propSS)
            propLdict = propModel.calc_evidence(SS=propSS, todict=1)
            BLogger.pprint(
                "step %d/%d  gainL % .3e  propL % .3e  curL % .3e" %
                (rstep + 1, b_nRefineSteps, propLdict['Ltotal'] -
                 curLdict['Ltotal'], propLdict['Ltotal'], curLdict['Ltotal']))
            propLdictList.append(propLdict)
            if curModel.getAllocModelName().count('HDP'):
                docUsageVec = xSSslice.getSelectionTerm('DocUsageCount')
                for k, uid in enumerate(xSSslice.uids):
                    docUsageByUID[uid].append(docUsageVec[k])

    Info['Kfinal'] = xSSslice.K
    if b_debugOutputDir:
        savefilename = os.path.join(b_debugOutputDir, 'ProposalTrace_ELBO.png')
        plotELBOtermsForProposal(curLdict,
                                 propLdictList,
                                 savefilename=savefilename)
        if curModel.getAllocModelName().count('HDP'):
            savefilename = os.path.join(b_debugOutputDir,
                                        'ProposalTrace_DocUsage.png')
            plotDocUsageForProposal(docUsageByUID, savefilename=savefilename)

    # If here, we have a valid proposal.
    # Need to verify mass conservation
    if hasattr(Dslice, 'word_count') and \
            curModel.obsModel.DataAtomType.count('word') and \
            curModel.getObsModelName().count('Mult'):
        # Weight each atom's responsibility by its word count
        origMass = np.inner(Dslice.word_count, curLPslice['resp'][:, ktarget])
    else:
        if 'resp' in curLPslice:
            origMass = curLPslice['resp'][:, ktarget].sum()
        else:
            origMass = curLPslice['spR'][:, ktarget].sum()
    newMass = xSSslice.getCountVec().sum()
    assert np.allclose(newMass, origMass, atol=1e-6, rtol=0)
    BLogger.pprint('Proposal extension DONE. %d candidate clusters.' %
                   (Info['Kfinal']))
    BLogger.pprint('')
    return xSSslice, Info

Пример #4

Показать файл

def selectCandidateMergePairs(hmodel,
                              SS,
                              MovePlans=None,
                              MoveRecordsByUID=None,
                              lapFrac=None,
                              m_maxNumPairsContainingComp=3,
                              m_minPercChangeInNumAtomsToReactivate=0.01,
                              m_nLapToReactivate=10,
                              m_pair_ranking_procedure='total_size',
                              m_pair_ranking_direction='descending',
                              m_pair_ranking_do_exclude_by_thr=0,
                              m_pair_ranking_exclusion_thr=-0.000001,
                              **kwargs):
    ''' Select candidate pairs to consider for merge move.

    Args
    ----
    hmodel : model object
        Only consulted when m_pair_ranking_procedure contains 'elbo';
        then obsModel (and allocModel, if it offers the method)
        calcHardMergeGap_SpecificPairs provide the pair scores.
    SS : sufficient statistics object
        Must provide K, uids, uid2k(uid) and getCountForUID(uid).
    MovePlans : dict, optional
        If it has field 'b_shortlistUIDs', those uids are off-limits here.
    MoveRecordsByUID : dict, optional
        Past failure records, keyed by (smallerUID, largerUID) tuples.
        Records of reactivated pairs are deleted in-place.
    lapFrac : number
        Current lap. Must be numeric even though the default is None:
        it is used in log formatting and in lap arithmetic.
    m_maxNumPairsContainingComp : int
        Max number of selected pairs any single uid may take part in.
    m_minPercChangeInNumAtomsToReactivate : float
        Relative change in the pair's min count that reactivates a pair
        which previously failed.
    m_nLapToReactivate : number
        Number of laps after which a failed pair is reactivated.
    m_pair_ranking_procedure : str
        'random', 'total_size', or any string containing 'elbo'.
    m_pair_ranking_direction : str
        'ascending' ranks small scores first; anything else is descending.
    m_pair_ranking_do_exclude_by_thr : bool-like
        If true, drop pairs on the wrong side of the exclusion threshold.
    m_pair_ranking_exclusion_thr : float
        Threshold used when the exclusion option is on.

    Returns
    -------
    Info : dict, with fields
        * m_UIDPairs : list of tuples, each defining a pair of uids
        * m_GainVals : list of ranking scores, aligned with m_UIDPairs
        * mPairIDs : list of (kA, kB) positions in SS.uids, kA < kB
        * m_targetUIDSet : set of all uids involved in a proposed merge pair

    Raises
    ------
    ValueError
        If m_pair_ranking_procedure is not recognised.
    '''
    # Avoid shared mutable default arguments.
    if MovePlans is None:
        MovePlans = dict()
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()

    MLogger.pprint("PLANNING merges at lap %.2f. K=%d" % (lapFrac, SS.K),
                   'debug')

    # Mark any targetUIDs used in births as off-limits for merges.
    # Setting the usage count far above the per-comp limit guarantees that
    # both the "busy" test and the final budget test reject these uids.
    uidUsageCount = defaultdict(int)
    if 'b_shortlistUIDs' in MovePlans:
        for uid in MovePlans['b_shortlistUIDs']:
            uidUsageCount[uid] = 10 * m_maxNumPairsContainingComp
    nDisqualified = len(uidUsageCount)
    MLogger.pprint(
        "   %d/%d UIDs ineligible because on shortlist for births. " %
        (nDisqualified, SS.K), 'debug')
    if nDisqualified > 0:
        # Pass a real list: under Python 3, dict.keys() is only a view.
        MLogger.pprint(
            "   Ineligible UIDs:" + \
                vec2str(list(uidUsageCount)),
            'debug')

    uid2k = dict()
    uid2count = dict()
    for uid in SS.uids:
        # uid2k is not read later; lookup kept so behaviour is unchanged.
        uid2k[uid] = SS.uid2k(uid)
        uid2count[uid] = SS.getCountForUID(uid)

    EligibleUIDPairs = list()
    EligibleAIDPairs = list()
    nPairTotal = 0
    nPairDQ = 0
    for kA, uidA in enumerate(SS.uids):
        for b, uidB in enumerate(SS.uids[kA + 1:]):
            kB = kA + b + 1
            assert kA < kB
            nPairTotal += 1
            # Skip pairs that touch a uid reserved for a birth move.
            if uidUsageCount[uidA] > 0 or uidUsageCount[uidB] > 0:
                continue
            # Records are keyed with the smaller uid first.
            if uidA < uidB:
                uidTuple = (uidA, uidB)
            else:
                uidTuple = (uidB, uidA)
            aidTuple = (kA, kB)

            if uidTuple not in MoveRecordsByUID:
                EligibleUIDPairs.append(uidTuple)
                EligibleAIDPairs.append(aidTuple)
            else:
                pairRecord = MoveRecordsByUID[uidTuple]
                assert pairRecord['m_nFailRecent'] >= 1
                latestMinCount = pairRecord['m_latestMinCount']
                newMinCount = np.minimum(uid2count[uidA], uid2count[uidB])
                # Tiny offset guards against division by zero when the
                # recorded count was 0 (same guard as the birth planner).
                percDiff = np.abs(latestMinCount - newMinCount) / \
                    (latestMinCount + 1e-100)
                if (lapFrac - pairRecord['m_latestLap']) >= m_nLapToReactivate:
                    # Enough laps have passed since the last failure.
                    EligibleUIDPairs.append(uidTuple)
                    EligibleAIDPairs.append(aidTuple)
                    del MoveRecordsByUID[uidTuple]
                elif percDiff >= m_minPercChangeInNumAtomsToReactivate:
                    # The pair's size changed enough to be worth a retry.
                    EligibleUIDPairs.append(uidTuple)
                    EligibleAIDPairs.append(aidTuple)
                    del MoveRecordsByUID[uidTuple]
                else:
                    nPairDQ += 1
    MLogger.pprint(
        "   %d/%d pairs eligible. %d disqualified by past failures." %
        (len(EligibleAIDPairs), nPairTotal, nPairDQ), 'debug')
    MLogger.pprint(
        "   Prioritizing elible pairs via ranking procedure: %s" %
        (m_pair_ranking_procedure), 'debug')
    if m_pair_ranking_procedure == 'random':
        A = len(EligibleAIDPairs)
        # RandomState needs an integer seed; lapFrac is usually a float.
        prng = np.random.RandomState(int(lapFrac))
        rank_scores_per_pair = prng.permutation(np.arange(A))
    elif m_pair_ranking_procedure == 'total_size':
        rank_scores_per_pair = np.asarray([
            SS.getCountForUID(uidA) + SS.getCountForUID(uidB)
            for (uidA, uidB) in EligibleUIDPairs
        ])
    elif m_pair_ranking_procedure.count('elbo'):
        # Compute Ldata gain for each possible pair of comps
        rank_scores_per_pair = hmodel.obsModel.calcHardMergeGap_SpecificPairs(
            SS, EligibleAIDPairs)
        if hasattr(hmodel.allocModel, 'calcHardMergeGap_SpecificPairs'):
            rank_scores_per_pair = \
                rank_scores_per_pair + hmodel.allocModel.calcHardMergeGap_SpecificPairs(
                    SS, EligibleAIDPairs)
        rank_scores_per_pair /= hmodel.obsModel.getDatasetScale(SS)
    else:
        raise ValueError("Unrecognised --m_pair_ranking_procedure: %s" %
                         m_pair_ranking_procedure)

    # Order the eligible pairs, optionally dropping those past the threshold
    if m_pair_ranking_direction == 'ascending':
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score < %.3e" %
                (m_pair_ranking_exclusion_thr), 'debug')
            keep_pair_ids = np.flatnonzero(
                rank_scores_per_pair < m_pair_ranking_exclusion_thr)
            ranked_pair_locs = keep_pair_ids[np.argsort(
                rank_scores_per_pair[keep_pair_ids])]
        else:
            ranked_pair_locs = np.argsort(rank_scores_per_pair)
    else:
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score > %.3e" %
                (m_pair_ranking_exclusion_thr), 'debug')
            keep_pair_ids = np.flatnonzero(
                rank_scores_per_pair > m_pair_ranking_exclusion_thr)
            ranked_pair_locs = keep_pair_ids[np.argsort(
                -1 * rank_scores_per_pair[keep_pair_ids])]
        else:
            ranked_pair_locs = np.argsort(-1 * rank_scores_per_pair)

    # Walk pairs in ranked order, enforcing the per-uid participation limit
    nKeep = 0
    mUIDPairs = list()
    mAIDPairs = list()
    mGainVals = list()
    for loc in ranked_pair_locs:
        uidA, uidB = EligibleUIDPairs[loc]
        kA, kB = EligibleAIDPairs[loc]
        if uidUsageCount[uidA] >= m_maxNumPairsContainingComp or \
                uidUsageCount[uidB] >= m_maxNumPairsContainingComp:
            continue
        uidUsageCount[uidA] += 1
        uidUsageCount[uidB] += 1

        mAIDPairs.append((kA, kB))
        mUIDPairs.append((uidA, uidB))
        mGainVals.append(rank_scores_per_pair[loc])
        if nKeep == 0:
            MLogger.pprint("Chosen uid pairs:", 'debug')
        MLogger.pprint(
            "%4d, %4d : pair_score %.3e, size %s %s" % (
                uidA,
                uidB,
                rank_scores_per_pair[loc],
                count2str(uid2count[uidA]),
                count2str(uid2count[uidB]),
            ), 'debug')
        nKeep += 1
    Info = dict()
    Info['m_UIDPairs'] = mUIDPairs
    Info['m_GainVals'] = mGainVals
    Info['mPairIDs'] = mAIDPairs
    targetUIDs = set()
    for uidA, uidB in mUIDPairs:
        targetUIDs.add(uidA)
        targetUIDs.add(uidB)
        # Sanity check: no selected pair may touch a birth-shortlisted uid.
        if 'b_shortlistUIDs' in MovePlans:
            for uid in MovePlans['b_shortlistUIDs']:
                assert uid != uidA
                assert uid != uidB
    Info['m_targetUIDSet'] = targetUIDs
    return Info

Пример #5

Показать файл

Файл: BPlanner.py Проект: zhaottcrystal/bnpy

def selectCompsForBirthAtCurrentBatch(hmodel=None,
                                      SS=None,
                                      SSbatch=None,
                                      MoveRecordsByUID=None,
                                      MovePlans=None,
                                      lapFrac=0,
                                      batchID=0,
                                      batchPos=0,
                                      nBatch=1,
                                      isFirstBatch=False,
                                      doPrintLotsOfDetails=True,
                                      **BArgs):
    ''' Select specific comps to target with birth move at current batch.

    Args
    ----
    hmodel : unused in this function
    SS : whole-dataset sufficient statistics
        Must provide K, uids (numpy array), getCountVec, getCountForUID,
        and uid2k. Required despite the None default.
    SSbatch : sufficient statistics for the current batch
        Must provide uids, getCountVec and getCountForUID.
        Required despite the None default.
    MoveRecordsByUID : dict, optional
        Per-uid records of past proposals. Updated in-place.
    MovePlans : dict, optional
        Plans made so far this lap. Updated in-place and returned.
    lapFrac, batchID, batchPos, nBatch : numbers
        Position of the current batch; used for logging and bookkeeping.
    isFirstBatch : bool
        True if this is the first batch of the lap.
    doPrintLotsOfDetails : bool
        Enables per-uid summary tables in the log (forced off when K > 25).
    **BArgs : keyword options. Keys read here:
        'Kmax', 'b_retainAcrossBatchesAfterFirstLap',
        'b_minNumAtomsForTargetComp',
        'b_minPercChangeInNumAtomsToReactivate', 'b_Kfresh'

    Returns
    -------
    MovePlans : dict with updated fields
    * b_targetUIDs : list of ints,
        Each uid in b_targetUIDs will be tried immediately, at current batch.
    * b_newlyChosenTargetUIDs, b_preExistingTargetUIDs : lists of uids
    * b_nDQ_toosmall, b_nDQ_toobusy, b_nDQ_pastfail : int counts
    * b_statusMsg : string, set only if the Kmax limit is already reached

    MoveRecordsByUID (updated in-place, not returned) gets fields
    * [uid]['byBatch'][batchID] : dict with fields
        proposalBatchSize
        proposalTotalSize
    '''
    # Avoid shared mutable default arguments: both dicts are mutated below.
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()
    if MovePlans is None:
        MovePlans = dict()

    # Extract num clusters in current model
    K = SS.K
    if K > 25:
        doPrintLotsOfDetails = False
    statusStr = ' lap %7.3f lapCeil %5d batchPos %3d/%d batchID %3d ' % (
        lapFrac, np.ceil(lapFrac), batchPos, nBatch, batchID)
    BLogger.pprint('PLAN at ' + statusStr)

    if BArgs['Kmax'] - SS.K <= 0:
        msg = "Cannot plan any more births." + \
            " Reached upper limit of %d existing comps (--Kmax)." % (
                BArgs['Kmax'])
        BLogger.pprint(msg)
        if 'b_targetUIDs' in MovePlans:
            del MovePlans['b_targetUIDs']
        MovePlans['b_statusMsg'] = msg
        BLogger.pprint('')
        return MovePlans

    if isFirstBatch:
        assert 'b_targetUIDs' not in MovePlans

    if isFirstBatch or 'b_firstbatchUIDs' not in MovePlans:
        MovePlans['b_firstbatchUIDs'] = SSbatch.uids.copy()
        MovePlans['b_CountVec_SeenThisLap'] = np.zeros(K)
    for k, uid in enumerate(MovePlans['b_firstbatchUIDs']):
        MovePlans['b_CountVec_SeenThisLap'][k] += SSbatch.getCountForUID(uid)

    # Short-circuit. Keep retained clusters.
    if lapFrac > 1.0 and BArgs['b_retainAcrossBatchesAfterFirstLap']:
        if not isFirstBatch:
            if 'b_targetUIDs' in MovePlans:
                msg = "%d UIDs retained from proposals earlier this lap." + \
                    " No new proposals at this batch."
                msg = msg % (len(MovePlans['b_targetUIDs']))
                BLogger.pprint(msg)
                if len(MovePlans['b_targetUIDs']) > 0:
                    BLogger.pprint(vec2str(MovePlans['b_targetUIDs']))
            else:
                BLogger.pprint(
                    'No UIDs targeted earlier in lap.' + \
                    ' No new proposals at this batch.')
            return MovePlans

    # Compute sizes for each cluster
    # (floored at a tiny positive value so counts are never exactly zero)
    CountVec_b = np.maximum(SSbatch.getCountVec(), 1e-100)
    CountVec_all = np.maximum(SS.getCountVec(), 1e-100)
    atomstr = 'atoms'
    labelstr = 'count_b'

    uidsBusyWithOtherMoves = list()
    uidsTooSmall = list()
    uidsWithFailRecord = list()
    # Builtin bool: the np.bool8 alias was removed in NumPy 2.0.
    eligible_mask = np.zeros(K, dtype=bool)
    for ii, uid in enumerate(SS.uids):
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)
        if not isinstance(MoveRecordsByUID[uid]['byBatch'], dict):
            MoveRecordsByUID[uid]['byBatch'] = \
                defaultdict(lambda: defaultdict(int))
        uidRec_b = MoveRecordsByUID[uid]['byBatch'][batchID]

        uidstatusStr = "STATUS uid %5d %s N_b %9.3f N_ttl %9.3f" % (
            uid, statusStr, SSbatch.getCountForUID(uid),
            SS.getCountForUID(uid))
        # Continue to track UIDs that are pre-existing targets
        if 'b_targetUIDs' in MovePlans:
            if uid in MovePlans['b_targetUIDs']:
                BLogger.startUIDSpecificLog(uid)
                BLogger.pprint(uidstatusStr + " CHOSENAGAIN")
                BLogger.stopUIDSpecificLog(uid)
                continue
        # TODO REMOVE DEAD CODE
        if MoveRecordsByUID[uid]['b_tryAgainFutureLap'] > 0:
            msg = "Try targeting uid %d again." % (uid)
            BLogger.pprint(msg)
            del MoveRecordsByUID[uid]['b_tryAgainFutureLap']
            eligible_mask[ii] = 1
            continue

        # Discard uids which are active in another proposal.
        if 'd_targetUIDs' in MovePlans:
            if uid in MovePlans['d_targetUIDs']:
                uidsBusyWithOtherMoves.append(uid)
                BLogger.startUIDSpecificLog(uid)
                BLogger.pprint(uidstatusStr + " BUSY DELETE PROPOSAL")
                BLogger.stopUIDSpecificLog(uid)
                continue
        if 'd_absorbingUIDSet' in MovePlans:
            if uid in MovePlans['d_absorbingUIDSet']:
                uidsBusyWithOtherMoves.append(uid)
                BLogger.startUIDSpecificLog(uid)
                BLogger.pprint(uidstatusStr + " BUSY DELETE PROPOSAL")
                BLogger.stopUIDSpecificLog(uid)
                continue

        if 'm_targetUIDSet' in MovePlans:
            if uid in MovePlans['m_targetUIDSet']:
                uidsBusyWithOtherMoves.append(uid)
                BLogger.startUIDSpecificLog(uid)
                BLogger.pprint(uidstatusStr + " BUSY MERGE PROPOSAL")
                BLogger.stopUIDSpecificLog(uid)
                continue

        # Filter out uids without large presence in current batch
        bigEnough = CountVec_b[ii] >= BArgs['b_minNumAtomsForTargetComp']
        if not bigEnough:
            uidsTooSmall.append((uid, CountVec_b[ii]))
            BLogger.startUIDSpecificLog(uid)
            BLogger.pprint(
                uidstatusStr + " TOO SMALL %.2f < %.2f" %
                (CountVec_b[ii], BArgs['b_minNumAtomsForTargetComp']))
            BLogger.stopUIDSpecificLog(uid)
            continue

        eligibleSuffix = ''
        # Filter out uids we've failed on this particular batch before
        if uidRec_b['nFail'] > 0:
            prevBatchSize = uidRec_b['proposalBatchSize']
            prevTotalSize = uidRec_b['proposalTotalSize']

            curBatchSize = SSbatch.getCountForUID(uid)
            sizePercDiff = np.abs(curBatchSize -
                                  prevBatchSize) / (curBatchSize + 1e-100)
            sizeChangedEnoughToReactivate = sizePercDiff > \
                BArgs['b_minPercChangeInNumAtomsToReactivate']

            curTotalSize = SS.getCountForUID(uid)
            totalPercDiff = np.abs(curTotalSize -
                                   prevTotalSize) / (curTotalSize + 1e-100)
            totalsizeChangedEnoughToReactivate = totalPercDiff > \
                BArgs['b_minPercChangeInNumAtomsToReactivate']

            if sizeChangedEnoughToReactivate:
                eligibleSuffix = \
                    "REACTIVATE BY BATCH SIZE." + \
                    "\n Batch size percDiff %.2f > %.2f" % (
                        sizePercDiff,
                        BArgs['b_minPercChangeInNumAtomsToReactivate']) \
                    + "\n prevBatchSize %9.2f \n curBatchSize %9.2f" % (
                        prevBatchSize, curBatchSize)
                uidRec_b['nFail'] = 0  # Reactivated
            elif totalsizeChangedEnoughToReactivate:
                eligibleSuffix = \
                    "REACTIVATED BY TOTAL SIZE" + \
                    "\n Total size percDiff %.2f > %.2f" % (
                        totalPercDiff,
                        BArgs['b_minPercChangeInNumAtomsToReactivate']) \
                    + "\n prevTotalSize %9.1f \n curTotalSize %9.1f" % (
                        prevTotalSize, curTotalSize)
                uidRec_b['nFail'] = 0  # Reactivated
            else:
                uidsWithFailRecord.append(uid)
                BLogger.startUIDSpecificLog(uid)
                BLogger.pprint(uidstatusStr + " DISQUALIFIED FOR PAST FAILURE")
                BLogger.stopUIDSpecificLog(uid)
                continue
        # If we've made it here, the uid is eligible.
        eligible_mask[ii] = 1
        BLogger.startUIDSpecificLog(uid)
        BLogger.pprint(uidstatusStr + " ELIGIBLE " + eligibleSuffix)
        BLogger.stopUIDSpecificLog(uid)

    # Notify about uids retained
    if 'b_targetUIDs' not in MovePlans:
        MovePlans['b_targetUIDs'] = list()
    msg = "%d/%d UIDs retained from preexisting proposals." % (len(
        MovePlans['b_targetUIDs']), K)
    BLogger.pprint(msg)

    # Log info about busy disqualifications
    nDQ_toobusy = len(uidsBusyWithOtherMoves)
    nDQ_pastfail = len(uidsWithFailRecord)
    msg = "%d/%d UIDs too busy with other moves (merge/delete)." % (
        nDQ_toobusy, K)
    BLogger.pprint(msg)
    # Log info about toosmall disqualification
    nDQ_toosmall = len(uidsTooSmall)
    msg = "%d/%d UIDs too small (too few %s in current batch)." + \
        " Required size >= %d (--b_minNumAtomsForTargetComp)"
    msg = msg % (nDQ_toosmall, K, atomstr, BArgs['b_minNumAtomsForTargetComp'])
    BLogger.pprint(msg, 'debug')
    if nDQ_toosmall > 0 and doPrintLotsOfDetails:
        lineUID = vec2str([u[0] for u in uidsTooSmall])
        lineSize = vec2str([u[1] for u in uidsTooSmall])
        BLogger.pprint(
            [lineUID, lineSize],
            prefix=['%7s' % 'uids', '%7s' % labelstr],
        )
    # Notify about past failure disqualifications to the log
    BLogger.pprint(
        '%d/%d UIDs disqualified for past failures.' % (nDQ_pastfail, K),
        'debug')
    if nDQ_pastfail > 0 and doPrintLotsOfDetails:
        lineUID = vec2str(uidsWithFailRecord)
        BLogger.pprint(lineUID)
    # Store nDQ counts for reporting.
    MovePlans['b_nDQ_toosmall'] = nDQ_toosmall
    MovePlans['b_nDQ_toobusy'] = nDQ_toobusy
    MovePlans['b_nDQ_pastfail'] = nDQ_pastfail
    # Finalize list of eligible UIDs
    eligibleUIDs = SS.uids[eligible_mask]
    BLogger.pprint('%d/%d UIDs eligible for new proposal' %
                   (len(eligibleUIDs), K))
    # EXIT if nothing eligible.
    if len(eligibleUIDs) == 0:
        BLogger.pprint('')
        assert 'b_targetUIDs' in MovePlans
        return MovePlans

    # Record all uids that are eligible!
    # And make vector of how recently they have failed in other attempts
    FailVec = np.inf * np.ones(K)
    for uid in eligibleUIDs:
        # Write to this uid's own record. Previously this wrote to a stale
        # record left over from the last iteration of the loop above.
        MoveRecordsByUID[uid]['b_latestEligibleLap'] = lapFrac
        k = SS.uid2k(uid)
        FailVec[k] = MoveRecordsByUID[uid]['b_nFailRecent']

    if doPrintLotsOfDetails:
        lineUID = vec2str(eligibleUIDs)
        lineSize = vec2str(CountVec_all[eligible_mask])
        lineBatchSize = vec2str(CountVec_b[eligible_mask])
        lineFail = vec2str(FailVec[eligible_mask])
        BLogger.pprint(
            [lineUID, lineSize, lineBatchSize, lineFail],
            prefix=[
                '%7s' % 'uids',
                '%7s' % 'cnt_ttl',
                '%7s' % 'cnt_b',
                '%7s' % 'nFail',
            ],
        )

    # Figure out how many new states we can target this round.
    # Prioritize the top comps as ranked by the desired score
    # until we max out the budget of Kmax total comps.
    maxnewK = BArgs['Kmax'] - SS.K
    totalnewK_perEligibleComp = np.minimum(
        np.ceil(CountVec_b[eligible_mask]),
        np.minimum(BArgs['b_Kfresh'], maxnewK))
    # TODO: Worry about retained ids with maxnewK
    sortorder = argsortBigToSmallByTiers(-1 * FailVec[eligible_mask],
                                         CountVec_b[eligible_mask])
    sortedCumulNewK = np.cumsum(totalnewK_perEligibleComp[sortorder])
    # The small fractional offset places the cut just after any cumulative
    # total equal to maxnewK, so an exact fit of the budget is still kept.
    nToKeep = np.searchsorted(sortedCumulNewK, maxnewK + 0.0042)
    if nToKeep == 0:
        # Always try at least one eligible comp.
        nToKeep = 1
    keepEligibleIDs = sortorder[:nToKeep]
    newK = np.minimum(sortedCumulNewK[nToKeep - 1], maxnewK)
    chosenUIDs = [eligibleUIDs[s] for s in keepEligibleIDs]

    # Report when the Kmax budget forced us to drop some eligible uids.
    # Compare against the eligible count: chosenUIDs already has at most
    # nToKeep entries, so comparing against it could never be true.
    if nToKeep < len(eligibleUIDs):
        BLogger.pprint(
            'Selected %d/%d eligible UIDs to do proposals.' % (
                nToKeep, len(eligibleUIDs)) + \
            '\n Could create up to %d new clusters, %d total clusters.' % (
                newK, newK + SS.K) + \
            '\n Total budget allows at most %d clusters (--Kmax).' % (
                BArgs['Kmax']),
            )
    BLogger.pprint('%d/%d UIDs chosen for new proposals (rankby: cnt_b)' %
                   (len(chosenUIDs), len(eligibleUIDs)))
    if doPrintLotsOfDetails:
        lineUID = vec2str(chosenUIDs)
        lineSize = vec2str(CountVec_all[eligible_mask][keepEligibleIDs])
        lineBatchSize = vec2str(CountVec_b[eligible_mask][keepEligibleIDs])
        lineFail = vec2str(FailVec[eligible_mask][keepEligibleIDs])
        BLogger.pprint(
            [lineUID, lineSize, lineBatchSize, lineFail],
            prefix=[
                '%7s' % 'uids',
                '%7s' % 'cnt_ttl',
                '%7s' % 'cnt_b',
                '%7s' % 'fail',
            ],
        )

    for uid in chosenUIDs:
        uidRec = MoveRecordsByUID[uid]
        uidRec['b_proposalBatchID'] = batchID
        uidRec_b = MoveRecordsByUID[uid]['byBatch'][batchID]
        uidRec_b['proposalBatchSize'] = SSbatch.getCountForUID(uid)
        # Total size comes from whole-dataset stats: the reactivation test
        # above compares this field against SS.getCountForUID(uid).
        uidRec_b['proposalTotalSize'] = SS.getCountForUID(uid)

    # Aggregate all uids
    MovePlans['b_newlyChosenTargetUIDs'] = chosenUIDs
    MovePlans['b_preExistingTargetUIDs'] = \
        [u for u in MovePlans['b_targetUIDs']]
    MovePlans['b_targetUIDs'].extend(chosenUIDs)

    BLogger.pprint('')
    return MovePlans

Пример #6

Показать файл

Файл: BPlanner.py Проект: zhaottcrystal/bnpy

def selectShortListForBirthAtLapStart(hmodel,
                                      SS,
                                      MoveRecordsByUID=None,
                                      MovePlans=None,
                                      lapFrac=0,
                                      b_minNumAtomsForTargetComp=2,
                                      **BArgs):
    ''' Select list of comps to possibly target with birth during next lap.

    Shortlist uids are guaranteed to never be involved in a merge/delete.
    They are kept aside especially for a birth move, at least in this lap.

    Args
    ----
    hmodel : model object; hmodel.obsModel.K gives the current num comps
    SS : sufficient statistics, or None at initialization
        When given, must provide K, uids and getCountVec.
    MoveRecordsByUID : dict, optional
        Per-uid records of past proposals. Missing uids are added in-place.
    MovePlans : dict, optional
        Plans for this lap. Updated in-place and returned.
    lapFrac : number
        Current lap, forwarded to canBirthHappenAtLap.
    b_minNumAtomsForTargetComp : number
        Comps whose count is <= this value are considered too small.
    **BArgs : keyword options; 'Kmax' is read here, and all are
        forwarded to canBirthHappenAtLap.

    Returns
    -------
    MovePlans : dict with updated fields
    * b_shortlistUIDs : list of ints,
        Each uid in b_shortlistUIDs could be a promising birth target.
        None of these should be touched by deletes or merges in this lap.
    * b_nDQ_toosmall, b_nDQ_pastfail, b_nDQ_toobusy : int counts
    * b_roomToGrow : int, number of comps that can be added before Kmax
    * b_maxLenShortlist : int, max number of births allowed this lap
    '''
    # Avoid shared mutable default arguments: both dicts are mutated below.
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()
    if MovePlans is None:
        MovePlans = dict()

    MovePlans['b_shortlistUIDs'] = list()
    MovePlans['b_nDQ_toosmall'] = 0
    MovePlans['b_nDQ_pastfail'] = 0
    MovePlans['b_nDQ_toobusy'] = 0
    MovePlans['b_roomToGrow'] = 0
    MovePlans['b_maxLenShortlist'] = 0
    if not canBirthHappenAtLap(lapFrac, **BArgs):
        BLogger.pprint('')
        return MovePlans

    K = hmodel.obsModel.K
    KroomToGrow = BArgs['Kmax'] - K
    MovePlans['b_roomToGrow'] = KroomToGrow
    # Each birth adds at least 2 comps.
    # If we have 10 slots left, we can do at most 5 births
    # Integer division: this value is used as a slice bound below, and
    # true division would yield a float (TypeError) under Python 3.
    maxLenShortlist = int(KroomToGrow // 2)
    MovePlans['b_maxLenShortlist'] = maxLenShortlist

    # EXIT: early, if no room to grow.
    if KroomToGrow <= 1:
        BLogger.pprint(
            "Cannot shortlist any comps for birth." + \
            " Adding 2 more comps to K=%d exceeds limit of %d (--Kmax)." % (
                K, BArgs['Kmax'])
            )
        BLogger.pprint('')
        return MovePlans
    # Log reasons for shortlist length
    if maxLenShortlist < K:
        msg = " Limiting shortlist to %d possible births this lap." % (
            maxLenShortlist)
        msg += " Any more would cause current K=%d to exceed Kmax=%d" % (
            K, BArgs['Kmax'])
        BLogger.pprint(msg)
    # Handle initialization case: SS is None
    # Must just select all possible comps
    if SS is None:
        shortlistUIDs = np.arange(K).tolist()
        shortlistUIDs = shortlistUIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = shortlistUIDs
        BLogger.pprint("No SS provided. Shortlist contains %d possible comps" %
                       (len(shortlistUIDs)))
        BLogger.pprint('')
        return MovePlans
    assert SS.K == K

    CountVec = SS.getCountVec()
    # Builtin bool: the np.bool8 alias was removed in NumPy 2.0.
    eligible_mask = np.zeros(K, dtype=bool)
    nTooSmall = 0
    nPastFail = 0
    for k, uid in enumerate(SS.uids):
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)
        tooSmall = CountVec[k] <= b_minNumAtomsForTargetComp
        hasFailRecord = MoveRecordsByUID[uid]['b_nFailRecent'] > 0
        if MoveRecordsByUID[uid]['b_tryAgainFutureLap'] > 0:
            # Explicit retry request overrides size and failure filters.
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif (not tooSmall) and (not hasFailRecord):
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif tooSmall:
            nTooSmall += 1
        else:
            assert hasFailRecord
            nPastFail += 1
    assert len(MovePlans['b_shortlistUIDs']) == np.sum(eligible_mask)
    # Rank the shortlist by size
    if maxLenShortlist < len(MovePlans['b_shortlistUIDs']):
        sortIDs = argsort_bigtosmall_stable(CountVec[eligible_mask])
        sortIDs = sortIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = [
            MovePlans['b_shortlistUIDs'][s] for s in sortIDs
        ]
        shortlistCountVec = CountVec[eligible_mask][sortIDs]
    else:
        shortlistCountVec = CountVec[eligible_mask]

    MovePlans['b_nDQ_toosmall'] = nTooSmall
    MovePlans['b_nDQ_pastfail'] = nPastFail
    nShortList = len(MovePlans['b_shortlistUIDs'])
    assert nShortList <= maxLenShortlist
    BLogger.pprint("%d/%d uids selected for short list." % (nShortList, K))
    if nShortList > 0:
        lineUID = vec2str(MovePlans['b_shortlistUIDs'])
        lineSize = vec2str(shortlistCountVec)
        BLogger.pprint(
            [lineUID, lineSize],
            prefix=['%7s' % 'uids', '%7s' % 'size'],
        )
    BLogger.pprint('')
    return MovePlans