# Example #1
def estimatePiAndDiv_ManyDocs(Data,
                              obsModel,
                              Mu,
                              Pi=None,
                              k=None,
                              alpha=1.0,
                              optim_method='frankwolfe',
                              doActiveOnly=True,
                              DivDataVec=None,
                              smoothVec='lam',
                              maxiter=100,
                              minDiv=None):
    ''' Estimate doc-topic probs for many docs, with corresponding divergence

    Loops over each document, optimizes its topic-probability vector
    against the first k topics in Mu, then computes a divergence score
    and applies optional smoothing / extra per-doc terms.

    Parameters
    ----------
    Data : bag-of-words dataset object; this function reads Data.nDoc,
        Data.doc_range, Data.word_id, Data.word_count, and calls
        Data.getDocTypeCountMatrix().
    obsModel : observation model; only obsModel.Prior.lam is read here,
        and only when smoothVec is a string containing 'lam'.
    Mu : list of 1D topic arrays, or a 2D array of topics; only the
        first k rows are used.
    Pi : optional 2D array, size D x K, of doc-topic probs.
        Updated in-place and returned. Defaults to all-ones rows.
    k : optional int, number of topics to optimize over (default: len(Mu)).
    alpha : float, concentration parameter forwarded to the per-doc solver.
    optim_method : 'frankwolfe' selects estimatePiForDoc_frankwolfe;
        anything else selects estimatePiForDoc_graddescent.
    doActiveOnly : if True, only optimize topics whose current prob
        exceeds 0.01 (plus topic k-1, which is always included).
    DivDataVec : optional 1D array, size D, added to minDiv at the end.
    smoothVec : 'lam' (use prior pseudo-counts) or a 1D array of
        smoothing weights; anything else disables smoothing.
    maxiter : max iterations for the frank-wolfe solver.
    minDiv : optional 1D array, size D, to fill in-place (default: zeros).

    Returns
    -------
    Pi : 2D array, size D x K
    minDiv : 1D array, size D
        minDiv[d] : divergence from closest convex combination of topics in Mu
    '''
    K = len(Mu)
    if k is None:
        k = K
    # Stack list-of-vectors into a 2D array; either way use only first k topics.
    if isinstance(Mu, list):
        topics = np.vstack(Mu[:k])
    else:
        topics = Mu[:k]

    if Pi is None:
        Pi = np.ones((Data.nDoc, K))
    if minDiv is None:
        minDiv = np.zeros(Data.nDoc)
    for d in range(Data.nDoc):
        # Slice out this document's word ids and counts (CSR-style ranges).
        start_d = Data.doc_range[d]
        stop_d = Data.doc_range[d + 1]
        wids_d = Data.word_id[start_d:stop_d]
        wcts_d = Data.word_count[start_d:stop_d]

        if doActiveOnly:
            # Active set: topics with non-negligible current probability.
            # Topic k-1 (the last/newest one) is always forced in.
            activeIDs_d = np.flatnonzero(Pi[d, :k] > .01)
            # NOTE(review): if a caller passes a Pi row that is entirely
            # <= .01, activeIDs_d is empty and the [-1] below raises
            # IndexError — confirm callers guarantee at least one active
            # topic per row.
            if activeIDs_d[-1] != k - 1:
                activeIDs_d = np.append(activeIDs_d, k - 1)
        else:
            activeIDs_d = np.arange(k)
        assert activeIDs_d.size >= 1
        assert activeIDs_d.size <= k

        topics_d = topics[activeIDs_d, :]
        assert topics_d.shape[0] <= k

        # Initial guess: reserve 0.1 mass for the last active topic,
        # shrink the others by 0.9, then renormalize to a proper simplex.
        initpiVec_d = Pi[d, activeIDs_d].copy()
        initpiVec_d[-1] = 0.1
        initpiVec_d[:-1] *= 0.9
        initpiVec_d /= initpiVec_d.sum()
        assert np.allclose(initpiVec_d.sum(), 1.0)

        if optim_method == 'frankwolfe':
            piVec_d = estimatePiForDoc_frankwolfe(ids_U=wids_d,
                                                  cts_U=wcts_d,
                                                  topics_KV=topics_d,
                                                  initpiVec_K=initpiVec_d,
                                                  alpha=alpha,
                                                  seed=(k * 101 + d),
                                                  maxiter=maxiter,
                                                  returnFuncValAndInfo=False,
                                                  verbose=False)
            # Rescale the solver's simplex output to occupy the mass that
            # Pi previously assigned to the active topics excluding the
            # last one. NOTE(review): the sum-to-one assert below only
            # holds if that excluded-last mass plus the inactive mass
            # already equals 1 in the incoming Pi row — verify against
            # callers (it does NOT hold for the all-ones default Pi).
            piVec_d *= Pi[d, activeIDs_d[:-1]].sum()
            Pi[d, activeIDs_d] = piVec_d
        else:
            # Gradient-descent path optimizes over all k topics at once.
            Pi[d, :k], _, _ = estimatePiForDoc_graddescent(ids_d=wids_d,
                                                           cts_d=wcts_d,
                                                           topics=topics,
                                                           alpha=alpha,
                                                           scale=1.0,
                                                           piInit=None)

        assert np.allclose(Pi[d, :k].sum(), 1.0)
        # Divergence: negative weighted log-likelihood of the doc's words
        # under the mixture Pi[d] over topics.
        minDiv[d] = -1 * np.inner(wcts_d,
                                  np.log(np.dot(Pi[d, :k], topics[:, wids_d])))

    # Vectorized recomputation of all per-doc divergences; must agree with
    # the per-doc values computed inside the loop.
    minDiv_check = -1 * np.sum(
        Data.getDocTypeCountMatrix() * np.log(np.dot(Pi[:, :k], topics)),
        axis=1)
    assert np.allclose(minDiv, minDiv_check)

    # Optional smoothing: subtract a weighted log-mixture term, with
    # weights from the observation model's prior ('lam') or a user array.
    if isinstance(smoothVec, str) and smoothVec.count('lam'):
        minDiv -= np.dot(np.log(np.dot(Pi[:, :k], topics)), obsModel.Prior.lam)
    elif isinstance(smoothVec, np.ndarray):
        minDiv -= np.dot(np.log(np.dot(Pi[:, :k], topics)), smoothVec)
    if DivDataVec is not None:
        minDiv += DivDataVec
    # Divergences must be non-negative up to float tolerance; clamp tiny
    # negatives to exactly zero in-place.
    assert np.min(minDiv) > -1e-6
    np.maximum(minDiv, 0, out=minDiv)
    return Pi, minDiv
# Example #2
            if randLtrace[-1] > bestL:
                bestL = randLtrace[-1]
                bestDTC_K = randDTC_K
            if randLtrace[-1] < worstL:
                worstL = randLtrace[-1]
                worstDTC_K = randDTC_K
            assert isMonotonicIncreasing(randLtrace)

        print "BEST of ", randlabel
        print ' '.join(['%6.1f' % (x) for x in bestDTC_K])
        print "WORST of ", randlabel
        print ' '.join(['%6.1f' % (x) for x in worstDTC_K])

        fwpi_K, _, _ = estimatePiForDoc_frankwolfe(
            ids_U=ids_U,
            cts_U=cts_U,
            topics_KV=topics_KV,
            alpha=alpha,
            seed=d)
        fwDTC_K, fwLtrace = calcLocalParamsWithELBOTraceForSingleDoc(
            initDocTopicProb_K=fwpi_K,
            logLik_UK=logLik_UK,
            cts_U=cts_U,
            alphaEbeta_K=alphaEbeta_K,
            convThrLP=convThrLP,
            nCoordAscentItersLP=nCoordAscentItersLP)
        pylab.plot(fwLtrace, 'b-', label='frankwolfeMAP', linewidth=2);
        assert isMonotonicIncreasing(fwLtrace)

        natpi_K, _, _ = estimatePiForDoc_graddescent(
            ids_U=ids_U,
            cts_U=cts_U,