Example #1
import os

import numpy as np
import scipy.sparse

# argsort_bigtosmall_stable is a project-internal helper;
# a sketch of it appears after this example.
def loadCountHistoriesForTask(taskpath, sortBy=None, MIN_PRESENT_COUNT=1e-10):
    ''' Load sparse matrix of counts for all clusters used throughout task.

    Returns
    -------
    CountMat : 2D array, nCheckpoint x nTotal
    Info : dict

    If sortBy is given, returns (ActiveCountMat, ElimCountMat, Info) instead.
    '''
    idpath = os.path.join(taskpath, 'ActiveIDs.txt')
    ctpath = os.path.join(taskpath, 'ActiveCounts.txt')
    data = list()
    colids = list()
    rowids = list()
    with open(idpath, 'r') as fid, open(ctpath, 'r') as fct:
        for ii, idline in enumerate(fid.readlines()):
            idstr = idline.strip()
            ctstr = fct.readline().strip()
            # np.float was removed in modern NumPy; use an explicit dtype.
            idvec = np.asarray(idstr.split(' '), dtype=np.int32)
            ctvec = np.asarray(ctstr.split(' '), dtype=np.float64)
            data.extend(ctvec)
            colids.extend(idvec)
            rowids.extend(ii * np.ones(idvec.size, dtype=np.int32))

    # Identify columns by unique ids
    # Elementwise comparison below requires an array, not a Python list.
    colids = np.asarray(colids)
    allUIDs = np.unique(colids)
    compactColIDs = -1 * np.ones_like(colids)
    for pos, u in enumerate(allUIDs):
        mask = colids == u
        compactColIDs[mask] = pos
    assert compactColIDs.min() >= 0

    # CountMat : sparse matrix of active counts at each checkpoint
    # Each row gives count (or zero if eliminated) at single lap
    data = np.asarray(data)
    np.maximum(data, MIN_PRESENT_COUNT, out=data)
    ij = np.vstack([rowids, compactColIDs])
    CountMat = scipy.sparse.csr_matrix((data, ij))
    CountMat = CountMat.toarray()
    assert allUIDs.size == CountMat.shape[1]

    # Split all columns into two sets: active and eliminated
    nCol = CountMat.shape[1]
    elimCols = np.flatnonzero(CountMat[-1, :] < MIN_PRESENT_COUNT)
    activeCols = np.setdiff1d(np.arange(nCol), elimCols)
    nElimCol = len(elimCols)
    nActiveCol = len(activeCols)
    ElimCountMat = CountMat[:, elimCols]
    ActiveCountMat = CountMat[:, activeCols]
    elimUIDs = allUIDs[elimCols]
    activeUIDs = allUIDs[activeCols]

    # Fill out info dict
    Info = dict(CountMat=CountMat,
                allUIDs=allUIDs,
                ActiveCountMat=ActiveCountMat,
                ElimCountMat=ElimCountMat,
                activeCols=activeCols,
                elimCols=elimCols,
                activeUIDs=activeUIDs,
                elimUIDs=elimUIDs)

    if not isinstance(sortBy, str) or sortBy.lower().count('none'):
        return CountMat, Info

    if sortBy.lower().count('finalorder'):
        raise ValueError("TODO: sortBy='finalorder' is not implemented")
    elif sortBy.lower().count('countvalues'):
        # Sort columns from biggest to smallest (at last checkpoint)
        rankedActiveIDs = argsort_bigtosmall_stable(ActiveCountMat[-1, :])
    else:
        raise ValueError("Unrecognized sortBy value: %s" % sortBy)

    # Sort active set by size at last snapshot
    ActiveCountMat = ActiveCountMat[:, rankedActiveIDs]
    activeUIDs = activeUIDs[rankedActiveIDs]
    activeCols = activeCols[rankedActiveIDs]

    # Sort eliminated set by historical size
    rankedElimIDs = argsort_bigtosmall_stable(ElimCountMat.sum(axis=0))
    ElimCountMat = ElimCountMat[:, rankedElimIDs]
    elimUIDs = elimUIDs[rankedElimIDs]
    elimCols = elimCols[rankedElimIDs]

    Info['activeUIDs'] = activeUIDs
    Info['activeCols'] = activeCols
    Info['elimUIDs'] = elimUIDs
    Info['elimCols'] = elimCols
    return ActiveCountMat, ElimCountMat, Info
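
All three examples rely on argsort_bigtosmall_stable, a project-internal
helper. A minimal sketch of what it is assumed to do (a stable descending
argsort, so tied values keep their original order):

import numpy as np

def argsort_bigtosmall_stable(avec):
    # Negate the values and use mergesort, which NumPy guarantees to be
    # stable, so ties keep their original left-to-right order.
    return np.argsort(-1 * np.asarray(avec), kind='mergesort')

# Ties at 3.0 stay in original order -> [1 2 0 3]
print(argsort_bigtosmall_stable([2.0, 3.0, 3.0, 1.0]))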
Example #2
File: BPlanner.py  Project: jpfeil/hydra
from collections import defaultdict

import numpy as np

# BLogger, vec2str, canBirthHappenAtLap, and argsort_bigtosmall_stable are
# assumed to come from the surrounding project's modules.
def selectShortListForBirthAtLapStart(hmodel,
                                      SS,
                                      MoveRecordsByUID=dict(),
                                      MovePlans=dict(),
                                      lapFrac=0,
                                      b_minNumAtomsForTargetComp=2,
                                      **BArgs):
    ''' Select list of comps to possibly target with birth during next lap.

    Shortlist uids are guaranteed to never be involved in a merge/delete.
    They are kept aside especially for a birth move, at least in this lap.
    
    Returns
    -------
    MovePlans : dict with updated fields
    * b_shortlistUIDs : list of ints,
        Each uid in b_shortlistUIDs could be a promising birth target.
        None of these should be touched by deletes or merges in this lap.
    '''
    MovePlans['b_shortlistUIDs'] = list()
    MovePlans['b_nDQ_toosmall'] = 0
    MovePlans['b_nDQ_pastfail'] = 0
    MovePlans['b_nDQ_toobusy'] = 0
    MovePlans['b_roomToGrow'] = 0
    MovePlans['b_maxLenShortlist'] = 0
    if not canBirthHappenAtLap(lapFrac, **BArgs):
        BLogger.pprint('')
        return MovePlans

    K = hmodel.obsModel.K
    KroomToGrow = BArgs['Kmax'] - K
    MovePlans['b_roomToGrow'] = KroomToGrow
    # Each birth adds at least 2 comps, so if we have 10 slots left
    # we can do at most 5 births. Integer division keeps maxLenShortlist
    # an int, so it can be used as a slice bound below.
    maxLenShortlist = KroomToGrow // 2
    MovePlans['b_maxLenShortlist'] = maxLenShortlist

    # EXIT: early, if no room to grow.
    if KroomToGrow <= 1:
        BLogger.pprint(
            "Cannot shortlist any comps for birth." + \
            " Adding 2 more comps to K=%d exceeds limit of %d (--Kmax)." % (
                K, BArgs['Kmax'])
            )
        BLogger.pprint('')
        return MovePlans
    # Log reasons for shortlist length
    if maxLenShortlist < K:
        msg = " Limiting shortlist to %d possible births this lap." % (
            maxLenShortlist)
        msg += " Any more would cause current K=%d to exceed Kmax=%d" % (
            K, BArgs['Kmax'])
        BLogger.pprint(msg)
    # Handle initialization case: SS is None
    # Must just select all possible comps
    if SS is None:
        shortlistUIDs = np.arange(K).tolist()
        shortlistUIDs = shortlistUIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = shortlistUIDs
        BLogger.pprint("No SS provided. Shortlist contains %d possible comps" %
                       (len(shortlistUIDs)))
        BLogger.pprint('')
        return MovePlans
    assert SS.K == K

    CountVec = SS.getCountVec()
    # np.bool8 is deprecated; use the builtin bool dtype.
    eligible_mask = np.zeros(K, dtype=bool)
    nTooSmall = 0
    nPastFail = 0
    for k, uid in enumerate(SS.uids):
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)
        tooSmall = CountVec[k] <= b_minNumAtomsForTargetComp
        hasFailRecord = MoveRecordsByUID[uid]['b_nFailRecent'] > 0
        if MoveRecordsByUID[uid]['b_tryAgainFutureLap'] > 0:
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif (not tooSmall) and (not hasFailRecord):
            eligible_mask[k] = 1
            MovePlans['b_shortlistUIDs'].append(uid)
        elif tooSmall:
            nTooSmall += 1
        else:
            assert hasFailRecord
            nPastFail += 1
    assert len(MovePlans['b_shortlistUIDs']) == np.sum(eligible_mask)
    # Rank the shortlist by size
    if maxLenShortlist < len(MovePlans['b_shortlistUIDs']):
        sortIDs = argsort_bigtosmall_stable(CountVec[eligible_mask])
        sortIDs = sortIDs[:maxLenShortlist]
        MovePlans['b_shortlistUIDs'] = [
            MovePlans['b_shortlistUIDs'][s] for s in sortIDs
        ]
        shortlistCountVec = CountVec[eligible_mask][sortIDs]
    else:
        shortlistCountVec = CountVec[eligible_mask]

    MovePlans['b_nDQ_toosmall'] = nTooSmall
    MovePlans['b_nDQ_pastfail'] = nPastFail
    nShortList = len(MovePlans['b_shortlistUIDs'])
    assert nShortList <= maxLenShortlist
    BLogger.pprint("%d/%d uids selected for short list." % (nShortList, K))
    if nShortList > 0:
        lineUID = vec2str(MovePlans['b_shortlistUIDs'])
        lineSize = vec2str(shortlistCountVec)
        BLogger.pprint(
            [lineUID, lineSize],
            prefix=['%7s' % 'uids', '%7s' % 'size'],
        )
    BLogger.pprint('')
    return MovePlans
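
The shortlist capacity above is simple integer arithmetic: each birth adds at
least 2 components, so with Kmax - K slots left at most (Kmax - K) // 2
births fit. A self-contained sketch of that calculation (the function name is
illustrative, not from the source):

def max_births_allowed(K, Kmax):
    # Each birth adds at least 2 new comps, so the shortlist holds at
    # most half the remaining room, rounded down (never negative).
    return max((Kmax - K) // 2, 0)

assert max_births_allowed(K=90, Kmax=100) == 5   # 10 slots -> 5 births
assert max_births_allowed(K=99, Kmax=100) == 0   # 1 slot -> no birth fits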
Example #3
import numpy as np

# OptimizerRhoOmegaBetter, rho2beta, reorder_rho, mapToNewPos,
# calcAvgPiFromDocTopicCount, DocTopicCount_to_sumLogPi, evalELBOandPrint,
# argsort_bigtosmall_stable, and is_sorted_bigtosmall are assumed to come
# from the surrounding project's modules.
def learn_rhoomega_fromFixedCounts(DocTopicCount=None,
                                   nDoc=0,
                                   canShuffleInit='byUsage',
                                   canShuffle=None,
                                   maxiter=5,
                                   warmStart_rho=1,
                                   alpha=None, gamma=None,
                                   initrho=None, initomega=None, **kwargs):
    assert nDoc == DocTopicCount.shape[0]
    K = DocTopicCount.shape[1]

    didShuffle = 0
    if canShuffleInit:
        if canShuffleInit.lower().count('byusage'):
            print('INITIAL SORTING BY USAGE')
            avgPi = calcAvgPiFromDocTopicCount(DocTopicCount)
            bigtosmall = argsort_bigtosmall_stable(avgPi)
        elif canShuffleInit.lower().count('bycount'):
            print('INITIAL SORTING BY COUNT')
            bigtosmall = argsort_bigtosmall_stable(DocTopicCount.sum(axis=0))
        elif canShuffleInit.lower().count('random'):
            print('INITIAL SORTING RANDOMLY')
            PRNG = np.random.RandomState(0)
            bigtosmall = np.arange(K)
            PRNG.shuffle(bigtosmall)
        else:
            bigtosmall = np.arange(K)
        # Now, sort.
        if not np.allclose(bigtosmall, np.arange(K)):
            DocTopicCount = DocTopicCount[:, bigtosmall]
            didShuffle = 1
    avgPi = calcAvgPiFromDocTopicCount(DocTopicCount)
    sortedids = argsort_bigtosmall_stable(avgPi)
    # Guard against canShuffleInit=None before calling .lower()
    if canShuffleInit and canShuffleInit.lower().count('byusage'):
        assert np.allclose(sortedids, np.arange(K))

    # Find UIDs of comps to track
    emptyUIDs = np.flatnonzero(DocTopicCount.sum(axis=0) < 0.0001)
    if emptyUIDs.size >= 3:
        firstEmptyUID = emptyUIDs.min()
        lastEmptyUID = emptyUIDs.max()
        # Integer division: an array index must be an int, not a float.
        middleEmptyUID = emptyUIDs[len(emptyUIDs) // 2]
        trackEmptyUIDs = [firstEmptyUID, middleEmptyUID, lastEmptyUID]
        emptyLabels = ['first', 'middle', 'last']
    elif emptyUIDs.size == 2:
        trackEmptyUIDs = [emptyUIDs.min(), emptyUIDs.max()]
        emptyLabels = ['first', 'last']
    elif emptyUIDs.size == 1:
        firstEmptyUID = emptyUIDs.min()
        trackEmptyUIDs = [firstEmptyUID]
        emptyLabels = ['first']
    else:
        trackEmptyUIDs = []
        emptyLabels = []

    trackActiveUIDs = list()
    activeLabels = list()
    # Track the top 5 active columns of DocTopicCount
    for pos in range(0, np.minimum(5, K)):
        if sortedids[pos] not in emptyUIDs:
            trackActiveUIDs.append(sortedids[pos])
            activeLabels.append('max+%d' % (pos))
    # Find the minnonemptyID
    for pos in range(K-1, 0, -1):
        curid = sortedids[pos]
        if curid not in emptyUIDs:
            break
    minnonemptyPos = pos
    # Track the 5 smallest active columns of DocTopicCount
    nBeyond5 = np.minimum(5, K - len(emptyUIDs) - 5)
    for i in range(-1 * (nBeyond5-1), 1):
        trackActiveUIDs.append(sortedids[minnonemptyPos + i])
        activeLabels.append('min+%d' % (-1 * i))

    assert np.all(avgPi[trackActiveUIDs] > 0)
    assert np.allclose(avgPi[trackEmptyUIDs], 0.0)
    assert is_sorted_bigtosmall(avgPi[trackActiveUIDs])

    nDocToDisplay = np.minimum(nDoc, 10)

    # Initialize rho
    if initrho is None:
        rho = OptimizerRhoOmegaBetter.make_initrho(K, nDoc, gamma)
    else:
        if didShuffle:
            rho, _ = reorder_rho(initrho, bigtosmall)
        else:
            rho = initrho
    # Initialize omega
    if initomega is None:
        omega = OptimizerRhoOmegaBetter.make_initomega(K, nDoc, gamma)
    else:
        omega = initomega
    # ELBO value of initial state
    Ltro = evalELBOandPrint(
        rho=rho, omega=omega,
        nDoc=nDoc,
        DocTopicCount=DocTopicCount,
        alpha=alpha, gamma=gamma,
        msg='init',
    )
    Snapshots = dict()
    Snapshots['DTCSum'] = list()
    Snapshots['DTCUsage'] = list()
    Snapshots['beta'] = list()
    Snapshots['Lscore'] = list()
    Snapshots['activeLabels'] = activeLabels
    Snapshots['emptyLabels'] = emptyLabels
    Snapshots['pos_trackActive'] = list()
    Snapshots['pos_trackEmpty'] = list()
    Snapshots['beta_trackActive'] = list()
    Snapshots['beta_trackEmpty'] = list()
    Snapshots['count_trackActive'] = list()
    Snapshots['count_trackEmpty'] = list()
    Snapshots['beta_trackRem'] = list()

    LtroList = list()
    LtroList.append(Ltro)
    betaK = rho2beta(rho, returnSize="K")
    iterid = 0
    prevbetaK = np.zeros_like(betaK)
    prevrho = rho.copy()
    while np.sum(np.abs(betaK - prevbetaK)) > 1e-7:
        iterid += 1
        if iterid > maxiter:
            break
        # Take Snapshots of Learned Params
        Snapshots['Lscore'].append(Ltro)
        Snapshots['DTCSum'].append(DocTopicCount.sum(axis=0))
        Snapshots['DTCUsage'].append((DocTopicCount > 0.001).sum(axis=0))
        Snapshots['beta'].append(betaK)
        Snapshots['pos_trackActive'].append(trackActiveUIDs)
        Snapshots['pos_trackEmpty'].append(trackEmptyUIDs)
        Snapshots['beta_trackActive'].append(betaK[trackActiveUIDs])
        Snapshots['beta_trackEmpty'].append(betaK[trackEmptyUIDs])
        Snapshots['beta_trackRem'].append(1.0 - betaK.sum())
        Snapshots['count_trackActive'].append(
            DocTopicCount.sum(axis=0)[trackActiveUIDs])
        Snapshots['count_trackEmpty'].append(
            DocTopicCount.sum(axis=0)[trackEmptyUIDs])

        # Sort by beta
        didShuffle = 0
        tlabel = '_t'
        if iterid > 1 and canShuffle and canShuffle.lower().count('bybeta'):
            bigtosmall = argsort_bigtosmall_stable(betaK)
            if not np.allclose(bigtosmall, np.arange(K)):
                trackActiveUIDs = mapToNewPos(trackActiveUIDs, bigtosmall)
                trackEmptyUIDs = mapToNewPos(trackEmptyUIDs, bigtosmall)
                rho, betaK = reorder_rho(rho, bigtosmall)
                DocTopicCount = DocTopicCount[:, bigtosmall]
                didShuffle = 1
                tlabel = '_ts'
        # Update theta
        sumLogPiActiveVec, sumLogPiRemVec, LP = DocTopicCount_to_sumLogPi(
            rho=rho, omega=omega, 
            DocTopicCount=DocTopicCount,
            alpha=alpha, gamma=gamma,
            **kwargs)
        # Show ELBO with freshly-optimized theta value.
        Ltro = evalELBOandPrint(
            rho=rho, omega=omega,
            DocTopicCount=DocTopicCount,
            theta=LP['theta'],
            thetaRem=LP['thetaRem'],
            nDoc=nDoc,
            sumLogPiActiveVec=sumLogPiActiveVec,
            sumLogPiRemVec=sumLogPiRemVec,
            alpha=alpha, gamma=gamma, f=None,
            msg=str(iterid) + tlabel,
        )
        LtroList.append(Ltro)
        if not LtroList[-1] >= LtroList[-2]:
            if didShuffle:
                print('NOT MONOTONIC! just after theta update with SHUFFLE!')
            else:
                print('NOT MONOTONIC! just after theta standard update')

        didELBODrop = 0
        if canShuffle:
            if canShuffle.lower().count('bysumlogpi'):
                bigtosmall = argsort_bigtosmall_stable(
                    sumLogPiActiveVec)
            elif canShuffle.lower().count('bycounts'):
                bigtosmall = argsort_bigtosmall_stable(
                    DocTopicCount.sum(axis=0))
            elif canShuffle.lower().count('byusage'):
                estPi = DocTopicCount / DocTopicCount.sum(axis=1)[:,np.newaxis]
                avgPi = np.sum(estPi, axis=0)
                bigtosmall = argsort_bigtosmall_stable(avgPi)
            else:
                bigtosmall = np.arange(K)
            if not np.allclose(bigtosmall, np.arange(K)):
                trackActiveUIDs = mapToNewPos(trackActiveUIDs, bigtosmall)
                trackEmptyUIDs = mapToNewPos(trackEmptyUIDs, bigtosmall)
                rho, betaK = reorder_rho(rho, bigtosmall)
                sumLogPiActiveVec = sumLogPiActiveVec[bigtosmall]
                DocTopicCount = DocTopicCount[:,bigtosmall]
                LP['theta'] = LP['theta'][:, bigtosmall]
                didShuffle = 1
                # Show ELBO with freshly-optimized rho value.
                Ltro = evalELBOandPrint(
                    rho=rho, omega=omega,
                    DocTopicCount=DocTopicCount,
                    theta=LP['theta'],
                    thetaRem=LP['thetaRem'],
                    nDoc=nDoc,
                    sumLogPiActiveVec=sumLogPiActiveVec,
                    sumLogPiRemVec=sumLogPiRemVec,
                    alpha=alpha, gamma=gamma, f=None,
                    msg=str(iterid) + "_ss",
                )
                LtroList.append(Ltro)
                if not LtroList[-1] >= LtroList[-2]:
                    print('NOT MONOTONIC! just after %s shuffle update!' % (
                        canShuffle))
                    didELBODrop = 1

        prevrho[:] = rho
        # Update rhoomega
        if warmStart_rho:
            initrho = rho
        else:
            initrho = None
        rho, omega, f, Info = OptimizerRhoOmegaBetter.\
            find_optimum_multiple_tries(
                alpha=alpha,
                gamma=gamma,
                sumLogPiActiveVec=sumLogPiActiveVec,
                sumLogPiRemVec=sumLogPiRemVec,
                nDoc=nDoc,
                initrho=initrho,
                initomega=omega,
                approx_grad=1,
                do_grad_omega=0,
            )
        prevbetaK[:] = betaK
        betaK = rho2beta(rho, returnSize="K")
        # Show ELBO with freshly-optimized rho value.
        Ltro = evalELBOandPrint(
            rho=rho, omega=omega,
            DocTopicCount=DocTopicCount,
            theta=LP['theta'],
            thetaRem=LP['thetaRem'],
            nDoc=nDoc,
            sumLogPiActiveVec=sumLogPiActiveVec,
            sumLogPiRemVec=sumLogPiRemVec,
            alpha=alpha, gamma=gamma, f=f,
            msg=str(iterid) + "_r",
        )
        LtroList.append(Ltro)
        if not LtroList[-1] >= LtroList[-2]:
            print('NOT MONOTONIC! just after rho update!')

        if didELBODrop:
            if LtroList[-1] >= LtroList[-3]:
                print('Phew. Combined update of sorting then optimizing rho OK')
            else:
                print('WHOA! Combined update of sorting then'
                      ' optimizing rho/beta NOT MONOTONIC')

    Snapshots['Lscore'].append(Ltro)
    Snapshots['DTCSum'].append(DocTopicCount.sum(axis=0))
    Snapshots['DTCUsage'].append((DocTopicCount > 0.001).sum(axis=0))
    Snapshots['beta'].append(betaK)
    Snapshots['pos_trackActive'].append(trackActiveUIDs)
    Snapshots['pos_trackEmpty'].append(trackEmptyUIDs)
    Snapshots['beta_trackActive'].append(betaK[trackActiveUIDs])
    Snapshots['beta_trackEmpty'].append(betaK[trackEmptyUIDs])
    Snapshots['beta_trackRem'].append(1.0 - betaK.sum())
    Snapshots['count_trackActive'].append(
        DocTopicCount.sum(axis=0)[trackActiveUIDs])
    Snapshots['count_trackEmpty'].append(
        DocTopicCount.sum(axis=0)[trackEmptyUIDs])

    print('\nEmpty cluster ids (%d of %d)' % (
        len(trackEmptyUIDs), len(emptyUIDs)))
    print('-----------------')
    print(' '.join(['% 10d' % (x) for x in trackEmptyUIDs]))

    print('\nSelected active clusters to track')
    print('---------------------------------')
    print(' '.join(['% 10d' % (x) for x in trackActiveUIDs]))
    print(' '.join(['% .3e' % (x) for x in avgPi[trackActiveUIDs]]))

    print('\nDocTopicCount for %d of %d docs' % (nDocToDisplay, nDoc))
    print('---------------------------------')
    for n in range(nDocToDisplay):
        print(' '.join([
            '% 9.2f' % (x) for x in DocTopicCount[n, trackActiveUIDs]]))

    print('\nFinal sumLogPiActiveVec')
    print('---------------------------------')
    print(' '.join(['% .3e' % (x)
                    for x in sumLogPiActiveVec[trackActiveUIDs]]))

    print('is sumLogPiActiveVec sorted? %s' %
          is_sorted_bigtosmall(sumLogPiActiveVec))
    return rho, omega, Snapshots
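
The loop above repeatedly converts the stick-breaking parameters rho into
topic weights via rho2beta(rho, returnSize="K"). A minimal sketch of that
standard transform, assuming the usual stick-breaking construction
beta_k = rho_k * prod_{j<k} (1 - rho_j); the leftover mass 1 - sum(betaK) is
the beta_trackRem quantity recorded in Snapshots:

import numpy as np

def rho2beta_K(rho):
    # Stick-breaking: each rho_k takes a fraction of the mass left over
    # after sticks 1..k-1 have been broken off.
    rho = np.asarray(rho, dtype=np.float64)
    leftover = np.hstack([1.0, np.cumprod(1.0 - rho[:-1])])
    return rho * leftover

betaK = rho2beta_K([0.5, 0.5, 0.5])
print(betaK)               # [0.5   0.25  0.125]
print(1.0 - betaK.sum())   # remaining mass: 0.125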