Exemplo n.º 1
0
def _make_xcandidate_LP(xbigModel, Data, xbigSS, xfreshSS, xfreshLP, k,
                        **kwargs):
    ''' Build a candidate model in which component k has been deleted,
          starting from the local parameters of the fresh data.

        Args
        -------
        xbigModel : model to copy; the copy is what gets updated here
        Data : dataset passed to the local / summary steps
        xbigSS : suff stats to copy; the copy has its last comp removed
        xfreshSS : accepted for signature symmetry, not read in this function
        xfreshLP : local params handed to _delete_comps_from_LP
        k : index of the component to delete
        kwargs : optional 'cleanupDeleteNumIters' gives the number of extra
                  local/global refinement passes (falsy or absent = none)

        Returns
        -------
        rbigModel : updated copy of xbigModel
        rbigSS : copy of xbigSS with its last comp removed
        rfreshSS : suff stats computed from rfreshLP
        rfreshELBO : result of rbigModel.calc_evidence(SS=rfreshSS)
        rfreshLP : local params after deletion (and refinement, if any)
    '''
    rfreshLP = _delete_comps_from_LP(Data, xbigModel, xfreshLP, k)
    rfreshSS = xbigModel.get_global_suff_stats(Data,
                                               rfreshLP,
                                               doPrecompEntropy=True)

    rbigModel = xbigModel.copy()
    rbigSS = xbigSS.copy()
    rbigSS.removeComp(rbigSS.K -
                      1)  # just chop off the last one in stickbrk order

    qbigSS = rbigSS + rfreshSS
    rbigModel.update_global_params(qbigSS)

    # We might consider another pass to make sure the alloc params converge
    rbigModel.allocModel.update_global_params(qbigSS)

    nIters = kwargs.get('cleanupDeleteNumIters', 0)
    if nIters:
        # range (not the Python-2-only xrange) keeps this consistent with the
        # rest of the module, which already uses Python 3 idioms.
        for trial in range(nIters):
            rfreshLP = rbigModel.calc_local_params(Data,
                                                   rfreshLP,
                                                   methodLP='memo',
                                                   nCoordAscentItersLP=10)
            rfreshSS = rbigModel.get_global_suff_stats(Data,
                                                       rfreshLP,
                                                       doPrecompEntropy=1)
            qbigSS = rbigSS + rfreshSS
            rbigModel.update_global_params(qbigSS)
            rfreshELBO = rbigModel.calc_evidence(SS=rfreshSS)
            log('%d  %.6e' % (trial, rfreshELBO))
    else:
        rfreshELBO = rbigModel.calc_evidence(SS=rfreshSS)
    return rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP
Exemplo n.º 2
0
def expand_then_refine(freshModel, freshSS, freshData, bigModel, bigSS,
                       **kwargs):
    ''' Create expanded model with K + K' comps,
          then refine components K+1, K+2, ... K+K' via several VB iterations

        The inputs bigModel and bigSS are copied before any modification.
        Intended guarantee (per the original author): the original comps of
        bigModel.obsModel are not altered.

        Args
        -------
        freshModel : model whose allocModel is probed for the optional
                      'insertCompsIntoSuffStatBag' method
        freshSS : suff stats of the fresh comps, inserted into the copy of bigSS
        freshData : dataset used for refinement and cleanup
        bigModel, bigSS : current model and its suff stats
        kwargs : must provide 'expandAdjustSuffStats', 'birthDebug',
                  'refineNumIters' and 'cleanupDeleteToImprove'; the whole
                  dict is forwarded to the refinement and cleanup routines

        Returns
        -------
        xbigModel : HModel with K + Kfresh comps
        xbigSS : SuffStatBag with K + Kfresh comps
                  * xfreshSS has been added to it just before returning
        xfreshSS : SuffStatBag with K + Kfresh comps
                  * has scale freshSS
        Info : dict with keys
                  * 'AInfo' : adjustment factors (or None)
                  * 'RInfo' : replacement factors (or None)
                  * extra debugging entries when kwargs['birthDebug'] is set
    '''
    logPhase('Expansion')
    Korig = bigSS.K
    Info = dict()
    xbigModel = bigModel.copy()
    xbigSS = bigSS.copy(includeELBOTerms=False, includeMergeTerms=False)
    if kwargs['expandAdjustSuffStats'] \
            and hasattr(freshModel.allocModel, 'insertCompsIntoSuffStatBag'):
        xbigSS, AInfo, RInfo = xbigModel.allocModel.insertCompsIntoSuffStatBag(
            xbigSS, freshSS)
        log('Specialized, model-specific expansion')
        log('rho[K+1] ... rho[K+Knew]')
        logProbVector(xbigModel.allocModel.rho[Korig:])

    else:
        xbigSS.insertComps(freshSS)
        AInfo = None
        RInfo = None

    # Create expanded model, K + Kfresh comps
    Kx = xbigSS.K
    if xbigModel.allocModel.K < Kx:
        xbigModel.allocModel.update_global_params(xbigSS)
    if xbigModel.obsModel.K < Kx:
        xbigModel.obsModel.update_global_params(xbigSS)
    # Take the fresh mass back out of the new comps, so xbigSS goes into
    # refinement without the freshSS contribution.
    xbigSS.subtractSpecificComps(freshSS,
                                 list(range(bigSS.K, bigSS.K + freshSS.K)))
    if kwargs['birthDebug']:
        Info['xbigModelInit'] = xbigModel.copy()

    # Refine expanded model with VB iterations
    if kwargs['refineNumIters'] > 0:
        xbigModel, xfreshSS, xfreshLP, xInfo = \
            refine_expanded_model_with_VB_iters(
                xbigModel, freshData,
                xbigSS=xbigSS, Korig=bigSS.K, **kwargs)
    else:
        # No refinement: build xfreshSS by hand, zero everywhere except the
        # new comps, which receive the fields of freshSS.
        xfreshSS = xbigSS.copy()
        xfreshSS.setAllFieldsToZero()
        for key in list(xfreshSS._FieldDims.keys()):
            if xfreshSS._FieldDims[key] is None:
                continue
            arr = getattr(xfreshSS, key)
            arr[bigSS.K:] = getattr(freshSS, key)
        xfreshLP = None
        xInfo = dict(origIDs=list())

    if kwargs['birthDebug']:
        Info['xbigModelRefined'] = xbigModel.copy()
        # Trace arrays exist only when the refinement routine actually ran;
        # the no-refinement branch above provides 'origIDs' alone, so copy
        # each trace only if present instead of raising KeyError.
        for traceKey in ('traceN', 'traceBeta', 'traceELBO'):
            if traceKey in xInfo:
                Info[traceKey] = xInfo[traceKey]

    AInfo = _delete_from_AInfo(AInfo, xInfo['origIDs'], Kx)
    if hasattr(xfreshSS, 'nDoc'):
        assert xbigSS.nDoc == bigSS.nDoc
        assert xfreshSS.nDoc == freshData.nDoc

    if kwargs['cleanupDeleteToImprove']:
        Kx = xbigSS.K
        xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs = \
            BirthCleanup.delete_comps_from_expanded_model_to_improve_ELBO(
                freshData, xbigModel,
                xbigSS, xfreshSS,
                Korig=bigSS.K, xfreshLP=xfreshLP, **kwargs)
        AInfo = _delete_from_AInfo(AInfo, origIDs, Kx)
        if kwargs['birthDebug']:
            Info['xbigModelPostDelete'] = xbigModel.copy()
            Info['ELBOPostDelete'] = xfreshELBO

    if hasattr(xfreshSS, 'nDoc'):
        assert xbigSS.nDoc == bigSS.nDoc
        assert xfreshSS.nDoc == freshData.nDoc
    xbigSS += xfreshSS
    Info['AInfo'] = AInfo
    Info['RInfo'] = RInfo
    return xbigModel, xbigSS, xfreshSS, Info
Exemplo n.º 3
0
def refine_expanded_model_with_VB_iters(xbigModel,
                                        freshData,
                                        xbigSS=None,
                                        Korig=0,
                                        **kwargs):
    ''' Alternate local and global update steps on the expanded model.

        Each pass computes local params and suff stats for freshData,
        optionally drops new comps that are too small, then updates the
        alloc and obs models from (xbigSS + fresh stats).

        Args
        --------
        xbigModel : HModel to refine (its allocModel / obsModel are updated)
        freshData : dataset used for every local step
        xbigSS : SuffStatBag, with K + Kfresh comps,
                                   scale equal to bigData only
        Korig : index of the first new comp; only comps at or after this
                 index are candidates for removal
        kwargs : must provide 'refineNumIters', 'birthDebug' and
                  'cleanupDeleteEmpty' ('cleanupMinSize' too when deleting);
                  also forwarded to calc_local_params

        Returns
        --------
        model : HModel, the refined xbigModel
        freshSS : SuffStatBag computed from freshData in a final pass
        freshLP : dict of local parameters for freshData
        xInfo : dict with 'origIDs' (surviving original comp indices), plus
                 'traceBeta', 'traceN', 'traceELBO' when birthDebug is set

        Updates (in-place)
        ----------
        xbigSS : fresh stats are added and then subtracted again on every
                  pass; removed comps are dropped from its end

        Raises
        ----------
        BirthProposalError : when no new comps remain after a pass
    '''
    logPhase('Refinement')

    numPasses = kwargs['refineNumIters']
    Kinit = xbigSS.K
    keptIDs = list(range(Kinit))

    # One row per pass, one column per comp of the initial expanded model.
    traceBeta = np.zeros((numPasses, Kinit))
    traceN = np.zeros((numPasses, Kinit))
    traceELBO = np.zeros(numPasses)

    finalPass = numPasses - 1
    xfreshLP = None
    for passID in range(numPasses):
        xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
        xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)

        traceN[passID, keptIDs] = xfreshSS.N
        if kwargs['birthDebug']:
            activeProbs = xbigModel.allocModel.get_active_comp_probs()
            traceBeta[passID, keptIDs] = activeProbs
            traceELBO[passID] = xbigModel.calc_evidence(freshData, xfreshSS,
                                                        xfreshLP)

        passNum = passID + 1
        if passID < 3 or passNum % 5 == 0:
            logPosVector(traceN[passID, Korig:], label='iter %3d' % (passNum))

        # For all but last iteration, attempt removing empty topics
        if kwargs['cleanupDeleteEmpty'] and passID < finalPass:
            # Walk new comps from highest index down, so earlier removals do
            # not shift the indices still to be visited.
            for k in range(xfreshSS.K - 1, Korig - 1, -1):
                if xfreshSS.N[k] < kwargs['cleanupMinSize']:
                    xfreshSS.removeComp(k)
                    xbigSS.removeComp(xbigSS.K - 1)  # last in order!
                    del keptIDs[k]

        if xfreshSS.K == Korig:
            msg = "BIRTH failed. After refining, no comps > cleanupMinSize."
            raise BirthProposalError(msg)

        # Temporarily fold the fresh stats in for the global step.
        xbigSS += xfreshSS
        xbigModel.allocModel.update_global_params(xbigSS)
        xbigModel.obsModel.update_global_params(xbigSS)
        xbigSS -= xfreshSS

    # Final local pass so the returned stats match the final global params.
    xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
    xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)
    log('Final Assignment Counts')
    logPosVector(xfreshSS.N[Korig:], label='final')

    xInfo = dict()
    if kwargs['birthDebug']:
        xInfo.update(traceBeta=traceBeta, traceN=traceN, traceELBO=traceELBO)
    xInfo['origIDs'] = keptIDs

    return xbigModel, xfreshSS, xfreshLP, xInfo
Exemplo n.º 4
0
def create_model_with_new_comps(bigModel,
                                bigSS,
                                freshData,
                                Q=None,
                                Plan=None,
                                **kwargs):
    ''' Create a fresh model (and matching suff stats) for the target data.

      The fresh model starts as a copy of bigModel and is re-initialized
      according to kwargs['creationRoutine']. Unless
      kwargs['creationDoUpdateFresh'] is false, it is then improved by several
      local/global iterations and optional cleanup (delete / merge) steps.

      Args
      -------
      bigModel : model to copy
      bigSS : not read directly in this function
      freshData : target dataset
      Q : required (asserted non-None) for the 'xspectral' routine
      Plan : dict; 'targetWordFreq' is read for the routine of the same name
      kwargs : must provide 'creationRoutine', 'creationDoUpdateFresh',
                'birthDebug', 'creationNumIters', 'cleanupDeleteEmpty',
                'cleanupDeleteToImproveFresh', 'cleanupMergeToImproveFresh';
                'Kfresh' and 'cleanupMinSize' are read on some paths

      Returns
      -------
      freshModel : HModel with Kfresh components,
                     scale *may not* be consistent with target dataset
      freshSS : SuffStatBag with Kfresh components,
                     scale will be consistent with target dataset
      Info : dict of extra information (debug copies of the model, 'evBound',
              and the entries reported by the merge cleanup, when applicable)

      Raises
      -------
      BirthProposalError : when fewer than 2 comps remain after cleanup
    '''
    Info = dict()
    freshModel = bigModel.copy()

    if kwargs['creationRoutine'] == 'targetWordFreq':
        freshModel.set_global_params(
            beta=np.ones(1),
            K=1,
            topics=Plan['targetWordFreq'][np.newaxis, :],
            wordcountTotal=freshData.word_count.sum())
    elif kwargs['creationRoutine'] == 'findmissingtopics':
        freshModel = create_new_model_findmissingtopics(
            freshModel, freshData, bigModel, **kwargs)
    elif kwargs['creationRoutine'] == 'xspectral':
        assert Q is not None
        freshModel = create_new_model_expandedspectral(freshModel, Q,
                                                       freshData, bigModel,
                                                       **kwargs)
    elif kwargs['creationRoutine'] == 'spectralOnTarget':
        freshModel = create_new_model_spectralOnTarget(freshModel, freshData,
                                                       bigModel, **kwargs)
    else:
        freshModel.init_global_params(freshData,
                                      K=kwargs['Kfresh'],
                                      initname=kwargs['creationRoutine'],
                                      **kwargs)

    logPhase('Creation')
    log('CreationRoutine: ' + kwargs['creationRoutine'], 'debug')
    log('Kfresh=%d' % (freshModel.obsModel.K), 'debug')

    if not kwargs['creationDoUpdateFresh']:
        # Create freshSS that would produce (nearly) same freshModel.obsModel
        # after a call to update_global_params
        #
        # freshSS was previously read here before ever being assigned
        # (UnboundLocalError). A bag of the right size is obtained from one
        # local/summary step and then zeroed.
        # NOTE(review): this is the cheapest way visible from here to get a
        # SuffStatBag with Kfresh comps -- confirm it matches the intent.
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshSS._Fields.setAllFieldsToZero()
        if hasattr(freshSS, 'WordCounts'):
            topics = freshSS.WordCounts
            priorvec = freshModel.obsModel.obsPrior.lamvec
            for k in range(freshSS.K):
                topics[k, :] = freshModel.obsModel.comp[k].lamvec - priorvec
            freshSS.setField('WordCounts', topics, dims=('K', 'D'))
        return freshModel, freshSS, Info

    # Record initial model for posterity
    if kwargs['birthDebug']:
        Info['freshModelInit'] = freshModel.copy()

    # Complete several iterations to improve this fresh proposal
    freshLP = None
    freshSS = None
    for step in range(kwargs['creationNumIters']):
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if step < 3 or (step + 1) % 10 == 0:
            logPosVector(freshSS.N,
                         label='iter %3d' % (step + 1),
                         level='debug')
        if step > 1:
            # Stop early once the per-comp counts have stabilized.
            maxDiff = np.max(np.abs(freshSS.N - prevN))
            if maxDiff < 1.0:
                break
        prevN = freshSS.N.copy()

    if freshSS is None:
        # creationNumIters < 1: the loop never ran, so summarize the
        # initialized model once; everything below needs freshSS / freshLP.
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)

    logPosVector(freshSS.N, label='after creation', level='moreinfo')
    if kwargs['birthDebug']:
        Info['freshModelRefined'] = freshModel.copy()

    if kwargs['cleanupDeleteEmpty']:
        Kbefore = freshSS.K
        freshModel, freshSS = BirthCleanup.delete_empty_comps(freshData,
                                                              freshModel,
                                                              freshSS,
                                                              Korig=0,
                                                              **kwargs)
        freshLP = freshModel.calc_local_params(freshData)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if freshSS.K < Kbefore:
            msg = 'after remove empty (size < %d)' % (kwargs['cleanupMinSize'])
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if kwargs['cleanupDeleteToImproveFresh']:
        freshModel, freshSS, ELBO = BirthCleanup.delete_comps_to_improve_ELBO(
            freshData, freshModel, LP=freshLP)
        Info['evBound'] = ELBO
        if kwargs['birthDebug']:
            Info['freshModelPostDelete'] = freshModel.copy()

    elif kwargs['cleanupMergeToImproveFresh']:
        Korig = freshSS.K
        while freshSS.K > 1:
            mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
                freshModel,
                freshSS,
                preselect_routine='wholeELBO',
                doLimitNumPairs=0,
                returnScoreMatrix=1,
                **kwargs)
            freshLP = freshModel.calc_local_params(freshData)
            freshSS = freshModel.get_global_suff_stats(freshData,
                                                       freshLP,
                                                       doPrecompEntropy=1,
                                                       doPrecompMergeEntropy=1,
                                                       mPairIDs=mPairIDs)
            freshModel.update_global_params(freshSS)
            freshELBO = freshModel.calc_evidence(SS=freshSS)
            freshModel, freshSS, freshELBO, mergeInfo = \
                MergeMove.run_many_merge_moves(
                    freshModel, freshSS, freshELBO,
                    mPairIDs, M=MM,
                    isBirthCleanup=1,
                    logFunc=log)
            # Fold the merge report into Info rather than rebinding Info,
            # which used to throw away the debug entries recorded above.
            Info.update(mergeInfo)
            if len(mergeInfo['AcceptedPairs']) == 0:
                break
        if freshSS.K < Korig:
            msg = 'after merges'
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if freshSS.K < 2:
        msg = "BIRTH failed. Fresh proposal does not prefer multiple comps."
        raise BirthProposalError(msg)

    return freshModel, freshSS, Info
Exemplo n.º 5
0
def delete_comps_from_expanded_model_to_improve_ELBO(Data,
                                                     xbigModel,
                                                     xbigSS,
                                                     xfreshSS,
                                                     xfreshLP=None,
                                                     Korig=0,
                                                     **kwargs):
    ''' Attempts deleting components K, K-1, K-2, ... Korig,
         keeping (and building on) any proposals that improve the ELBO

       Args
       ---------
        Data : dataset handed to the candidate-building helpers
        xbigModel : model with K comps (asserted against xbigSS.K)
        xbigSS, xfreshSS : suff stats, both with K comps (asserted)
        xfreshLP : local params, used when kwargs['cleanupDeleteViaLP'] is set
        Korig : lowest comp index that may be deleted
        kwargs : must provide 'cleanupDeleteViaLP' and
                  'cleanupRaiseErrorWhenAllDeleted'

       Returns
       ---------
        model : HModel with Knew comps
        bigSS : SuffStatBag with Knew comps
        freshSS : SuffStatBag with Knew comps
        ELBO : evidence lower bound for the returned model
        origIDs : list of original comp indices that were kept

       Raises
       ---------
        BirthProposalError : when every new comp was deleted and
                              kwargs['cleanupRaiseErrorWhenAllDeleted'] is set
    '''
    logPhase('Cleanup')

    K = xbigSS.K
    assert xbigSS.K == xfreshSS.K
    assert xbigModel.obsModel.K == K

    # Must be a real list: entries are removed with `del` as comps are
    # deleted, which a Python 3 range object does not support.
    origIDs = list(range(K))

    xfreshELBO = xbigModel.calc_evidence(SS=xfreshSS)
    if K == 1:
        # Nothing to delete; return the same 5-tuple shape as the normal path
        # so callers can unpack it uniformly.
        return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs

    for k in reversed(range(Korig, K)):
        if kwargs['cleanupDeleteViaLP']:
            rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP = \
                _make_xcandidate_LP(
                    xbigModel, Data,
                    xbigSS, xfreshSS, xfreshLP,
                    k, **kwargs)
        else:
            rbigModel, rbigSS, rfreshSS, rfreshELBO = _make_xcandidate(
                xbigModel, Data, xbigSS, xfreshSS, k)
        # If ELBO has improved, set current model to delete component k
        if rfreshELBO >= xfreshELBO:
            log('Deletion accepted. prop %.5e > cur %.5e' %
                (rfreshELBO, xfreshELBO))
            logPosVector(xfreshSS.N[Korig:])

            xbigSS = rbigSS
            xfreshSS = rfreshSS
            xbigModel = rbigModel
            xfreshELBO = rfreshELBO
            if kwargs['cleanupDeleteViaLP']:
                xfreshLP = rfreshLP
            del origIDs[k]

        if xfreshSS.K == 1:
            break
        # end loop over comps to delete

    if xbigSS.K == Korig and kwargs['cleanupRaiseErrorWhenAllDeleted']:
        log('FAILED. Deleting all new comps improves ELBO.')
        msg = "FAILED. After expansion, deleting all new comps improves ELBO."
        raise BirthProposalError(msg)
    return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs
Exemplo n.º 6
0
def run_birth_move(bigModel, bigSS, freshData, Q=None, Plan=None, **kwargsIN):
    ''' Run birth move on provided target data, creating up to Kfresh new comps

        Any BirthProposalError raised along the way is caught: the move is
        abandoned and the inputs are returned unchanged.

        Args
        -------
        bigModel : current model
        bigSS : current suff stats; None makes the move be skipped
        freshData : target dataset for the proposal
        Q : forwarded to the creation routine
        Plan : optional dict; 'ktarget', 'targetUID' and 'count' are read for
                logging, and the dict is forwarded to the creation routine
        kwargsIN : options; copied locally before 'Kfresh' is adjusted.
                    Reads 'Kfresh', 'Kmax', 'birthVerifyELBOIncrease',
                    'expandOrder', 'birthRetainExtraMass' and, optionally,
                    'doVizBirth'.

        Returns
        -------
        bigmodel : expanded model on success, the input bigModel on failure
        bigSS : expanded suff stats on success, the input bigSS on failure
        MoveInfo : dict; 'didAddNew' tells whether the birth was accepted
    '''
    logPhase('Target Data')
    # Plan defaults to None, so guard before probing it for keys.
    if Plan is not None and 'ktarget' in Plan:
        ktarget = Plan['ktarget']
        if 'targetUID' in Plan:
            # bigSS may be None (handled as a SKIPPED move below); do not
            # dereference it here, or that skip path is never reached.
            sizeNow = 0
            if bigSS is not None:
                know = np.flatnonzero(bigSS.uIDs == Plan['targetUID'])
                if know.size == 1:
                    sizeNow = bigSS.getCountVec()[know[0]]
            log(
                'target comp = %d. Size now %d. Size at selection %d.' %
                (Plan['targetUID'], sizeNow, Plan['count']), 'moreinfo')
        else:
            log('ktarget= %d.' % (ktarget), 'moreinfo')
    log(freshData.get_stats_summary(), 'debug')

    kwargs = dict(**kwargsIN)  # make local copy!
    origids = dict(bigModel=id(bigModel), bigSS=id(bigSS))

    try:
        if bigSS is None:
            msg = "SKIPPED. SS must be valid SuffStatBag, not None."
            raise BirthProposalError(msg)

        # Never propose more comps than the Kmax budget allows.
        if bigSS.K + kwargs['Kfresh'] > kwargs['Kmax']:
            kwargs['Kfresh'] = kwargs['Kmax'] - bigSS.K

        if kwargs['Kfresh'] < 1:
            msg = "SKIPPED. Reached upper limit of Kmax=%d comps."
            msg = msg % (kwargs['Kmax'])
            raise BirthProposalError(msg)

        # Determine baseline ELBO
        if kwargs['birthVerifyELBOIncrease']:
            curbigModel = bigModel.copy()
            nStep = 3
            curfreshLP = None
            for step in range(nStep):
                doELBO = (step == nStep - 1)  # only on last step
                curfreshLP = curbigModel.calc_local_params(
                    freshData, curfreshLP, **kwargs)
                curfreshSS = curbigModel.get_global_suff_stats(
                    freshData, curfreshLP, doPrecompEntropy=doELBO)
                if not doELBO:  # all but the last step
                    curbigModel.update_global_params(bigSS + curfreshSS)
            curELBO = curbigModel.calc_evidence(SS=curfreshSS)

        # Create freshModel, freshSS, both with Kfresh comps
        #  freshSS has scale freshData
        #  freshModel has arbitrary scale
        freshModel, freshSS, freshInfo = \
            BirthCreate.create_model_with_new_comps(
                bigModel, bigSS, freshData, Q=Q,
                Plan=Plan, **kwargs)

        # Visualize, if desired
        if 'doVizBirth' in kwargs and kwargs['doVizBirth']:
            VizBirth.viz_birth_proposal(bigModel,
                                        freshModel,
                                        Plan,
                                        curELBO=None,
                                        propELBO=None,
                                        **kwargs)
            input('>>>')  # blocks until the user presses enter
            from matplotlib import pylab
            pylab.close('all')

        # Create xbigModel and xbigSS, with K + Kfresh comps
        # freshData can be assigned to any of the K+Kfresh comps
        # so, any of the K+Kfresh comps may be changed
        # but original comps won't lose influence of bigSS
        # * xbigSS has scale bigData + freshData
        # * xbigModel has scale bigData + freshData
        if kwargs['expandOrder'] == 'expandThenRefine':
            xbigModel, xbigSS, xfreshSS, xInfo = \
                BirthRefine.expand_then_refine(
                    freshModel, freshSS, freshData,
                    bigModel, bigSS, **kwargs)
        else:
            raise NotImplementedError('TODO')

        if kwargs['birthVerifyELBOIncrease']:
            logPhase('Evaluation')
            assert xfreshSS.hasELBOTerms()
            propELBO = xbigModel.calc_evidence(SS=xfreshSS)
            didPass, ELBOmsg = make_acceptance_decision(curELBO, propELBO)
            log(ELBOmsg)
        else:
            didPass = True
            ELBOmsg = ''
            propELBO = None  # needed for kwarg for viz_birth_proposal
            curELBO = None

        Kcur = bigSS.K
        Ktotal = xbigSS.K
        birthCompIDs = list(range(Kcur, Ktotal))

        # Reject. Abandon the move.
        if not didPass:
            msg = "BIRTH REJECTED. Did not explain target better than current."
            raise BirthProposalError(msg)

        assert xbigModel.obsModel.K == xbigSS.K
        # Create dict of info about this birth move
        msg = 'BIRTH ACCEPTED. %d fresh comps.' % (len(birthCompIDs))
        log(msg, 'info')

        MoveInfo = dict(
            didAddNew=True,
            msg=msg,
            AdjustInfo=xInfo['AInfo'],
            ReplaceInfo=xInfo['RInfo'],
            modifiedCompIDs=[],
            birthCompIDs=birthCompIDs,
            Korig=bigSS.K,
        )
        MoveInfo.update(xInfo)
        MoveInfo.update(freshInfo)
        assert not xbigSS.hasELBOTerms()
        assert not xbigSS.hasMergeTerms()
        xfreshSS.removeELBOTerms()
        if kwargs['birthRetainExtraMass']:
            MoveInfo['extraSS'] = xfreshSS
            MoveInfo['modifiedCompIDs'] = list(range(Ktotal))
        else:
            # Restore xbigSS to same scale as original "big" dataset
            xbigSS -= xfreshSS
            assert np.allclose(xbigSS.N.sum(), bigSS.N.sum())

        # Carry over the precomputed terms of bigSS, padded with empty
        # entries for the newly born comps.
        if bigSS.hasMergeTerms():
            MergeTerms = bigSS._MergeTerms.copy()
            MergeTerms.insertEmptyComps(Ktotal - Kcur)
            xbigSS.restoreMergeTerms(MergeTerms)
        if bigSS.hasELBOTerms():
            ELBOTerms = bigSS._ELBOTerms.copy()
            ELBOTerms.insertEmptyComps(Ktotal - Kcur)
            if xInfo['AInfo'] is not None:
                for key in xInfo['AInfo']:
                    if hasattr(ELBOTerms, key):
                        arr = getattr(ELBOTerms,
                                      key) + bigSS.nDoc * xInfo['AInfo'][key]
                        ELBOTerms.setField(key, arr, dims='K')
            if xInfo['RInfo'] is not None:
                for key in xInfo['RInfo']:
                    if hasattr(ELBOTerms, key):
                        ELBOTerms.setField(key,
                                           bigSS.nDoc * xInfo['RInfo'][key],
                                           dims=None)
            xbigSS.restoreELBOTerms(ELBOTerms)

        return xbigModel, xbigSS, MoveInfo
    except BirthProposalError as e:
        # We execute this code when birth fails for any reason, including:
        #  * user-specified Kmax limit reached
        #  * cleanup phase removed all new components

        # Verify guarantees that input model and input suff stats haven't
        # changed
        assert origids['bigModel'] == id(bigModel)
        assert origids['bigSS'] == id(bigSS)

        # Write reason for failure to log
        log(str(e), 'moreinfo')

        # Return failure info
        MoveInfo = dict(didAddNew=False,
                        msg=str(e),
                        modifiedCompIDs=[],
                        birthCompIDs=[])
        return bigModel, bigSS, MoveInfo