Example #1
def loadTopicModelFromMEDLDA(filepath, prefix=None, returnTPA=0):
    ''' Load topic model saved in MEDLDA format.
    '''
    # Avoid circular import
    from bnpy.HModel import HModel

    assert prefix is not None
    alphafilepath = os.path.join(filepath, prefix + '.alpha')
    etafilepath = os.path.join(filepath, prefix + '.eta')
    topicfilepath = os.path.join(filepath, prefix + '.log_prob_w')

    alpha = float(np.loadtxt(alphafilepath))
    eta = np.loadtxt(etafilepath)
    logtopics = np.loadtxt(topicfilepath)
    topics = np.exp(logtopics)
    topics += 1e-9  # smooth away exact zeros before renormalizing
    topics /= topics.sum(axis=1)[:, np.newaxis]
    assert np.all(np.isfinite(topics))

    if returnTPA:
        K = topics.shape[0]
        probs = 1.0 / K * np.ones(K)
        return topics, probs, alpha, eta

    infAlg = 'VB'
    aPriorDict = dict(alpha=alpha)
    amodel = AllocModelConstructorsByName['FiniteTopicModel'](infAlg,
                                                              aPriorDict)
    omodel = ObsModelConstructorsByName['Mult'](infAlg,
                                                lam=0.001,
                                                D=topics.shape[1])
    hmodel = HModel(amodel, omodel)
    hmodel.obsModel.set_global_params(topics=topics, nTotalTokens=1000)
    return hmodel
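A minimal usage sketch (the directory and prefix are hypothetical; any folder holding <prefix>.alpha, <prefix>.eta, and <prefix>.log_prob_w files written by MEDLDA would work the same way):

import numpy as np

topics, probs, alpha, eta = loadTopicModelFromMEDLDA(
    '/tmp/medlda_run', prefix='model', returnTPA=1)
assert np.allclose(topics.sum(axis=1), 1.0)  # rows are normalized topics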
Example #2
def load_model(matfilepath, prefix='Best'):
    ''' Load model stored to disk by ModelWriter.
    '''
    # Avoid circular import
    from bnpy.HModel import HModel
    obsModel = load_obs_model(matfilepath, prefix)
    allocModel = load_alloc_model(matfilepath, prefix)
    return HModel(allocModel, obsModel)
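A hedged usage sketch: load_obs_model and load_alloc_model read the files ModelWriter saved under the given prefix, so the directory below is hypothetical.

hmodel = load_model('/tmp/bnpy_run/1', prefix='Best')
print(type(hmodel.allocModel).__name__, type(hmodel.obsModel).__name__)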
Example #3
def load_model_at_prefix(matfilepath, prefix='Best', lap=None):
    ''' Load model stored to disk by ModelWriter.

    Returns
    -------
    model : bnpy.HModel
        Model object saved at the checkpoint indicated by prefix or lap.
    '''
    # Avoids circular import
    import bnpy.HModel as HModel

    if lap is not None:
        prefix, _ = getPrefixForLapQuery(matfilepath, lap)
    try:
        obsModel = load_obs_model(matfilepath, prefix)
        allocModel = load_alloc_model(matfilepath, prefix)
        model = HModel.HModel(allocModel, obsModel)
    except IOError as e:
        print(str(e))
        # Fall back to the topic-model snapshot formats on disk.
        try:
            model = loadTopicModel(matfilepath, prefix=prefix)
        except IOError:
            model = loadTopicModelFromMEDLDA(matfilepath, prefix=prefix)
    return model
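A usage sketch for loading by lap number rather than prefix (the run directory is hypothetical; lap=20 is resolved to a checkpoint prefix by getPrefixForLapQuery before loading):

model = load_model_at_prefix('/tmp/bnpy_run/1', lap=20)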
Example #4
def loadTopicModelFromTxtFiles(snapshotPath,
                               returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load from snapshot text files.

    Returns
    -------
    hmodel
    '''
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma', 'nTopics', 'nTypes',
        'vocab_size'
    ]
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
            if key in keyMap:
                Mdict[keyMap[key]] = arr
            else:
                Mdict[key] = arr
        except Exception:
            # Key not saved in this snapshot; skip it.
            pass
    assert 'K' in Mdict
    assert 'lam' in Mdict
    assert 'vocab_size' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])

    if os.path.exists(snapshotPath + "/topics.txt"):
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(snapshotPath +
                                       "/TopicWordCount_cscindptr.txt",
                                       dtype=np.int32)
            TWC = scipy.sparse.csc_matrix((TWC_data, TWC_inds, TWC_cscindptr),
                                          shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(snapshotPath +
                                       "/TopicWordCount_indptr.txt",
                                       dtype=np.int32)
            TWC = scipy.sparse.csr_matrix((TWC_data, TWC_inds, TWC_csrindptr),
                                          shape=(K, V))

        Mdict['WordCounts'] = TWC.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha

    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
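A sketch of the returnTPA path. The snapshot directory is hypothetical; it must contain K.txt, vocab_size.txt, beta.txt (or lam.txt), and either topics.txt or the TopicWordCount_* sparse arrays, as loaded above.

topics, probs, alpha = loadTopicModelFromTxtFiles(
    '/tmp/bnpy_run/1/Lap0020.000TopicSnapshot',
    returnTPA=True, normalizeTopics=True, normalizeProbs=True)
print(topics.shape, probs.shape, alpha)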
Example #5
def loadTopicModel(matfilepath,
                   queryLap=None,
                   prefix=None,
                   returnWordCounts=0,
                   returnTPA=0,
                   normalizeTopics=0,
                   normalizeProbs=0,
                   **kwargs):
    ''' Load saved topic model.

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, _ = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath,
                                        prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
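A hedged usage sketch (paths hypothetical). loadTopicModel dispatches on what it finds in the directory: *.log_prob_w files go to the MEDLDA loader, Lap*TopicSnapshot folders to the text-file loader, and Lap*TopicModel.mat files to the .mat loader.

topics, probs, alpha = loadTopicModel(
    '/tmp/bnpy_run/1', queryLap=20,
    returnTPA=1, normalizeTopics=1, normalizeProbs=1)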
Example #6
def proposeNewResp_dpmixture(Z_n,
                             propResp,
                             tempModel=None,
                             tempSS=None,
                             Data_n=None,
                             origK=0,
                             Kfresh=3,
                             nVBIters=3,
                             PRNG=None,
                             PastAttemptLog=dict(),
                             **kwargs):
    ''' Create new resp matrix by DP mixture clustering of subsampled data.

    Note: the mutable default for PastAttemptLog persists across calls that
    omit the argument; this is how per-state attempt counts accumulate.

    Returns
    -------
    propResp : 2D array, N x K'
    '''
    # Avoid circular imports
    from bnpy.allocmodel import DPMixtureModel
    from bnpy import HModel
    from bnpy.mergemove import MergePlanner, MergeMove

    # Select ktarget
    if 'strategy' not in PastAttemptLog:
        PastAttemptLog['strategy'] = 'byState'

    if PastAttemptLog['strategy'] == 'byState':
        Kcur = tempModel.obsModel.K
        Kextra = Kcur - PastAttemptLog['uIDs'].size
        if Kextra > 0:
            maxUID = PastAttemptLog['maxUID']
            uIDs = PastAttemptLog['uIDs']
            for extraPos in range(Kextra):
                maxUID += 1
                uIDs = np.append(uIDs, maxUID)
            PastAttemptLog['maxUID'] = maxUID
            PastAttemptLog['uIDs'] = uIDs

        candidateStateUIDs = set()
        for state in np.unique(Z_n):
            uid = PastAttemptLog['uIDs'][state]
            candidateStateUIDs.add(uid)
        allAvailableUIDs = list(candidateStateUIDs)
        if 'nTryByStateUID' not in PastAttemptLog:
            PastAttemptLog['nTryByStateUID'] = dict()

        minTry = np.inf
        for badState, nTry in list(PastAttemptLog['nTryByStateUID'].items()):
            if badState in candidateStateUIDs:
                if nTry < minTry:
                    minTry = nTry
        untriedList = [
            x for x in candidateStateUIDs
            if x not in PastAttemptLog['nTryByStateUID']
            or PastAttemptLog['nTryByStateUID'][x] == 0
        ]
        if len(untriedList) > 0:
            candidateStateUIDs = untriedList
        else:
            # Keep only candidates that have been tried the least
            for badState, nTry in list(
                    PastAttemptLog['nTryByStateUID'].items()):
                # Remove bad State from candidateStateUIDs
                if badState in candidateStateUIDs:
                    if nTry > minTry:
                        candidateStateUIDs.remove(badState)
        candidateStateUIDs = np.asarray(list(candidateStateUIDs))
        # Pick a state we haven't tried yet, uniformly at random;
        # fall back to any available state if every candidate was removed.
        if len(candidateStateUIDs) > 0:
            chosenStateUID = PRNG.choice(candidateStateUIDs)
        else:
            chosenStateUID = PRNG.choice(np.asarray(allAvailableUIDs))

    ktarget = np.flatnonzero(chosenStateUID == PastAttemptLog['uIDs'])[0]

    relDataIDs = np.flatnonzero(Z_n == ktarget)

    # If the selected state is too small, just assign all its members to a
    # single new state, which occupies column origK.
    if relDataIDs.size < Kfresh:
        nTryMap = PastAttemptLog['nTryByStateUID']
        nTryMap[chosenStateUID] = nTryMap.get(chosenStateUID, 0) + 1
        propResp[relDataIDs, :] = 0
        propResp[relDataIDs, origK] = 1
        return propResp, origK + 1

    if hasattr(Data_n, 'Xprev'):
        Xprev = Data_n.Xprev[relDataIDs]
    else:
        Xprev = None
    targetData = XData(X=Data_n.X[relDataIDs], Xprev=Xprev)

    myDPModel = DPMixtureModel('VB', gamma0=10)
    myObsModel = copy.deepcopy(tempModel.obsModel)
    delattr(myObsModel, 'Post')
    myObsModel.ClearCache()

    myHModel = HModel(myDPModel, myObsModel)
    initname = PRNG.choice(['randexamplesbydist', 'randcontigblocks'])
    myHModel.init_global_params(targetData,
                                K=Kfresh,
                                initname=initname,
                                **kwargs)

    Kfresh = myHModel.obsModel.K
    mergeIsPromising = True
    while Kfresh > 1 and mergeIsPromising:
        for vbiter in range(nVBIters):
            targetLP = myHModel.calc_local_params(targetData)
            targetSS = myHModel.get_global_suff_stats(targetData, targetLP)
            # Delete unnecessarily small comps
            if vbiter == nVBIters - 1:
                smallIDs = np.flatnonzero(targetSS.getCountVec() <= 1)
                for kdel in reversed(smallIDs):
                    if targetSS.K > 1:
                        targetSS.removeComp(kdel)
            # Global step
            myHModel.update_global_params(targetSS)

        # Do merges
        mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
            myHModel,
            targetSS,
            preselect_routine='wholeELBO',
            doLimitNumPairs=0,
            returnScoreMatrix=1,
            **kwargs)
        targetLP = myHModel.calc_local_params(targetData,
                                              mPairIDs=mPairIDs,
                                              limitMemoryLP=1)
        targetSS = myHModel.get_global_suff_stats(targetData,
                                                  targetLP,
                                                  mPairIDs=mPairIDs,
                                                  doPrecompEntropy=1,
                                                  doPrecompMergeEntropy=1)
        myHModel.update_global_params(targetSS)
        curELBO = myHModel.calc_evidence(SS=targetSS)
        myHModel, targetSS, curELBO, Info = MergeMove.run_many_merge_moves(
            myHModel, targetSS, curELBO, mPairIDs, M=MM, isBirthCleanup=1)
        mergeIsPromising = len(Info['AcceptedPairs']) > 0
        Kfresh = targetSS.K

    if mergeIsPromising:
        targetLP = myHModel.calc_local_params(targetData)
    propResp[relDataIDs, :] = 0
    propResp[relDataIDs, origK:origK + Kfresh] = targetLP['resp']

    # Test if we added at least 2 states with mass > 1
    didAddNonEmptyNewStates = np.sum(targetSS.N > 1.0) >= 2
    print('dpmixture proposal: targetUID %d didAddNonEmptyNewStates %d' %
          (chosenStateUID, didAddNonEmptyNewStates))
    if didAddNonEmptyNewStates:
        print('NEW STATE MASSES:', end=' ')
        print(' '.join(['%5.1f' % (x) for x in targetSS.N]))
        PastAttemptLog['nTryByStateUID'][chosenStateUID] = 0  # success!
    else:
        # Count another failed attempt for the chosen state.
        nTryMap = PastAttemptLog['nTryByStateUID']
        nTryMap[chosenStateUID] = nTryMap.get(chosenStateUID, 0) + 1
    return propResp, origK + Kfresh
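A hedged sketch of how a caller might size and seed propResp before the proposal (tempModel, tempSS, Data_n, and Z_n are assumed to come from an existing bnpy training loop; only the setup below is illustrative):

import numpy as np

origK, Kfresh = tempModel.obsModel.K, 3
N = Z_n.size
propResp = np.zeros((N, origK + Kfresh))
propResp[np.arange(N), Z_n] = 1.0  # start from current hard assignments
propResp, Kprop = proposeNewResp_dpmixture(
    Z_n, propResp, tempModel=tempModel, tempSS=tempSS, Data_n=Data_n,
    origK=origK, Kfresh=Kfresh, PRNG=np.random.RandomState(0))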