Example #1
import numpy as np
import scipy.sparse

# The compiled C++ routine used below ships with bnpy; this import path is
# an assumption -- adjust it to wherever your bnpy build exposes the extension.
from bnpy.util.lib.sparseResp.LibSparseResp import sparseLocalStepManyDocs_cpp


def sparseLocalStep_WordCountData(
        Data=None,
        LP=None,
        alphaEbeta=None,
        alphaEbetaRem=None,
        ElogphiT=None,
        DocTopicCount=None,
        spResp_data_OUT=None,
        spResp_colids_OUT=None,
        nCoordAscentItersLP=10,
        convThrLP=0.001,
        nnzPerRowLP=2,
        activeonlyLP=1,
        restartLP=0,
        restartNumTrialsLP=50,
        initDocTopicCountLP='setDocProbsToEGlobalProbs',
        reviseActiveFirstLP=-1,
        reviseActiveEveryLP=1,
        maxDiffVec=None,
        numIterVec=None,
        nRAcceptVec=None,
        nRTrialVec=None,
        verboseLP=0,
        **kwargs):
    ''' Perform local inference for a topic model.

    Wrapper around compiled C++ code; returns the local-parameter dict LP.
    '''
    if LP is not None:
        ElogphiT = LP['ElogphiT']
    N = Data.nUniqueToken
    V, K = ElogphiT.shape
    assert K == alphaEbeta.size
    nnzPerRowLP = np.minimum(nnzPerRowLP, K)

    # Parse params for tracking convergence progress
    if maxDiffVec is None:
        maxDiffVec = np.zeros(Data.nDoc, dtype=np.float64)
        numIterVec = np.zeros(Data.nDoc, dtype=np.int32)
    if nRTrialVec is None:
        nRTrialVec = np.zeros(1, dtype=np.int32)
        nRAcceptVec = np.zeros(1, dtype=np.int32)
    assert maxDiffVec.dtype == np.float64
    assert numIterVec.dtype == np.int32

    # Handle starting from memoized doc-topic counts
    if initDocTopicCountLP == 'memo':
        if 'DocTopicCount' in LP:
            DocTopicCount = LP['DocTopicCount']
        else:
            initDocTopicCountLP = 'setDocProbsToEGlobalProbs'

    # Allow sparse restarts ONLY on first pass through dataset
    if restartLP > 1:
        if 'lapFrac' in kwargs and kwargs['lapFrac'] <= 1.0:
            restartLP = 1
        else:
            restartLP = 0

    # Use the provided DocTopicCount array if it's the right size;
    # otherwise, create a new one from scratch
    TopicCount_OUT = None
    if isinstance(DocTopicCount, np.ndarray):
        if DocTopicCount.shape == (Data.nDoc, K):
            TopicCount_OUT = DocTopicCount
    if TopicCount_OUT is None:
        TopicCount_OUT = np.zeros((Data.nDoc, K))
    assert TopicCount_OUT.shape == (Data.nDoc, K)
    if spResp_data_OUT is None:
        spResp_data_OUT = np.zeros(N * nnzPerRowLP)
        spResp_colids_OUT = np.zeros(N * nnzPerRowLP, dtype=np.int32)
    assert spResp_data_OUT.size == N * nnzPerRowLP
    assert spResp_colids_OUT.size == N * nnzPerRowLP

    if initDocTopicCountLP.startswith("setDocProbsToEGlobalProbs"):
        initProbsToEbeta = 1
    elif initDocTopicCountLP.startswith("fast"):
        initProbsToEbeta = 2
    else:
        initProbsToEbeta = 0
    if reviseActiveFirstLP < 0:
        reviseActiveFirstLP = nCoordAscentItersLP + 10

    sparseLocalStepManyDocs_cpp(
        alphaEbeta, ElogphiT, Data.word_count, Data.word_id, Data.doc_range,
        nnzPerRowLP, N, K, Data.nDoc, Data.vocab_size, nCoordAscentItersLP,
        convThrLP, initProbsToEbeta, TopicCount_OUT, spResp_data_OUT,
        spResp_colids_OUT, numIterVec, maxDiffVec,
        restartNumTrialsLP * restartLP, nRAcceptVec, nRTrialVec,
        reviseActiveFirstLP, reviseActiveEveryLP, verboseLP)

    # Package results into the LP dict
    if not isinstance(LP, dict):
        LP = dict()
    LP['nnzPerRow'] = nnzPerRowLP
    LP['DocTopicCount'] = TopicCount_OUT
    indptr = np.arange(0, (N + 1) * nnzPerRowLP, nnzPerRowLP, dtype=np.int32)
    LP['spR'] = scipy.sparse.csr_matrix(
        (spResp_data_OUT, spResp_colids_OUT, indptr), shape=(N, K))
    # Fill in the remainder of the LP dict with derived quantities
    from bnpy.allocmodel.topics.LocalStepManyDocs \
        import updateLPGivenDocTopicCount, writeLogMessageForManyDocs
    LP = updateLPGivenDocTopicCount(LP, LP['DocTopicCount'], alphaEbeta,
                                    alphaEbetaRem)
    LP['Info'] = dict()
    LP['Info']['iter'] = numIterVec
    LP['Info']['maxDiff'] = maxDiffVec

    if restartLP > 0:
        LP['Info']['nRestartsAccepted'] = nRAcceptVec[0]
        LP['Info']['nRestartsTried'] = nRTrialVec[0]
    writeLogMessageForManyDocs(Data,
                               LP['Info'],
                               LP,
                               convThrLP=convThrLP,
                               **kwargs)
    return LP
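
A minimal usage sketch (hypothetical shapes and values; assumes Data is a bnpy
bag-of-words dataset exposing word_id, word_count, doc_range, nDoc, vocab_size,
and nUniqueToken):

import numpy as np

K, V = 5, 100
alphaEbeta = np.full(K, 0.5 / K)       # alpha * E[beta_k] for active topics
alphaEbetaRem = 0.5 / K                # mass left for the remainder topic
topics = np.random.dirichlet(np.ones(V), size=K)
ElogphiT = np.log(topics).T            # V x K log topic-word probabilities

LP = sparseLocalStep_WordCountData(
    Data=Data,                         # assumed bnpy bag-of-words dataset
    alphaEbeta=alphaEbeta,
    alphaEbetaRem=alphaEbetaRem,
    ElogphiT=ElogphiT,
    nnzPerRowLP=2,                     # keep 2 nonzero resp. entries per token
    nCoordAscentItersLP=10,
    convThrLP=0.001)
print(LP['DocTopicCount'].shape)       # (Data.nDoc, K)
print(LP['spR'].shape)                 # (Data.nUniqueToken, K), sparse CSR
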
Example #2
import numpy as np
from scipy.io import loadmat

# Assumed bnpy import paths -- adjust to match your install:
from bnpy.data import GroupXData
from bnpy.obsmodel import ZeroMeanGaussObsModel
from bnpy.init.FromScratchBregman import runKMeans_BregmanDiv
from bnpy.allocmodel.topics.LocalStepManyDocs import updateLPGivenDocTopicCount

# im2col and loadSuffStatBag are helper routines from the original demo script
# and are assumed to be available; EPS is a small positive floor whose exact
# value is not shown in the original -- 1e-8 is an assumption.
EPS = 1e-8


def train_image_specific_topics(self,
                                y,
                                sigma,
                                Niter=50,
                                Kfresh=100,
                                pixelMask=None):
    print('Training %d image-specific clusters...' % Kfresh)
    D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP
    # gather fully observable patches
    if pixelMask is None:  # gray-scale image denoising
        v = im2col(y, patchSize)
    else:  # color image inpainting
        C = 3
        patchMask = np.logical_not(
            np.any(im2col(pixelMask, patchSize), axis=0))
        v = np.hstack(
            tuple([
                im2col(y[:, :, c], patchSize)[:, patchMask]
                for c in range(C)
            ]))
    v -= np.mean(v, axis=0)
    v = v.T
    testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
    testData.name = 'test_image_patches'
    # set up hyper-parameters and run Bregman k-means
    cached_B_name = 'models/HDP/B.mat'
    xBar = loadmat(cached_B_name)['Cov']
    xBar2 = loadmat(cached_B_name)['Cov2']
    # moment-match the Wishart prior (nu, B) to the cached patch covariance,
    # inflated by the observation noise variance sigma**2
    tmp0 = (np.diag(xBar) + sigma**2)**2
    tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
    nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
    B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
    obsModel = ZeroMeanGaussObsModel(D=D,
                                     min_covar=1e-8,
                                     inferType='memoVB',
                                     B=B,
                                     nu=nu)
    Z, Mu, Lscores = runKMeans_BregmanDiv(testData.X,
                                          Kfresh,
                                          obsModel,
                                          Niter=Niter,
                                          assert_monotonic=False)
    Korig = self.K
    Kall = np.max(Z) + Korig + 1
    Kfresh = Kall - Korig
    Z += Korig
    # load SuffStats of training images
    trainSS = loadSuffStatBag('models/HDP/SS.dump')
    trainSS.insertEmptyComps(Kfresh)
    # construct SuffStats of the test image
    DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
    DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
    resp = np.zeros((len(Z), Kall))
    resp[np.arange(len(Z)), Z] = 1.0
    testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
    alphaPi0 = np.hstack(
        (GP.alphaPi0, GP.alphaPi0Rem / (Kfresh + 1) * np.ones(Kfresh)))
    alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
    testLP = updateLPGivenDocTopicCount(testLP, DocTopicCount, alphaPi0,
                                        alphaPi0Rem)
    testSS = self.patchModel.get_global_suff_stats(
        testData, testLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)
    xxT = np.zeros((Kall, D, D))
    for k in range(Korig, Kall):
        idx = Z == k
        tmp = np.einsum('nd,ne->de', v[idx], v[idx])
        tmp -= testSS.N[k] * sigma**2 * np.eye(D)
        # project back onto the PSD cone after subtracting the noise term
        val, vec = np.linalg.eig(tmp)
        val[val < EPS] = EPS
        xxT[k] = np.dot(vec, np.dot(np.diag(val), vec.T))
    testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
    testSS.setUIDs(trainSS.uids)
    # combine training and test SS; update model parameters
    combinedSS = trainSS + testSS
    self.patchModel.update_global_params(combinedSS)
    self.calcGlobalParams()
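
A hypothetical invocation sketch; denoiser stands in for the (unshown) demo
object that owns this method, and the cached model files above must exist:

# Illustrative only: denoiser, y, and sigma are assumptions, not source names.
denoiser.train_image_specific_topics(y, sigma, Niter=50, Kfresh=100)
# For color inpainting, also pass the boolean mask of missing pixels:
# denoiser.train_image_specific_topics(y, sigma, pixelMask=mask)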

# The fragment below continues a longer script: prev_pi0_rem, new_pi0_rem,
# combined_pi0_Knew, trainedModel, test_DocTopicCount, TestData, trainSS,
# and testSS are defined earlier in that script.
combined_pi0_rem = prev_pi0_rem * new_pi0_rem
assert np.allclose(
    combined_pi0_Knew.sum() + combined_pi0_rem,
    1.0)

# Now just multiply by alpha
alpha = trainedModel.allocModel.alpha
alphaPi0 = alpha * combined_pi0_Knew
alphaPi0Rem = alpha * combined_pi0_rem
assert np.allclose(
    alpha,
    alphaPi0.sum() + alphaPi0Rem)

testLP = updateLPGivenDocTopicCount(
    testLP,
    test_DocTopicCount,
    alphaPi0,
    alphaPi0Rem)
testSS = trainedModel.get_global_suff_stats(
    TestData,
    testLP,
    doPrecompEntropy=1,
    doTrackTruncationGrowth=1)

print()
print("Refining model!")
print("Performing several full VB iterations")
print("Merging any new clusters when VB objective approves")

# Create a combined model for the train AND test set
trainSS.insertEmptyComps(testSS.K - trainSS.K)
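
A minimal sketch of the refinement announced by the print statements above,
assuming bnpy's standard HModel API (calc_local_params, get_global_suff_stats,
update_global_params, calc_evidence); the iteration count is an arbitrary
choice, not from the source:

# Combine train + test stats, then run a few full VB passes on the test data.
combinedSS = trainSS + testSS
trainedModel.update_global_params(combinedSS)
for i in range(10):  # number of VB passes is an assumption
    LP = trainedModel.calc_local_params(TestData)
    SS = trainedModel.get_global_suff_stats(TestData, LP, doPrecompEntropy=1)
    trainedModel.update_global_params(SS)
    print('VB iter %2d | ELBO %.6f' % (
        i, trainedModel.calc_evidence(TestData, SS, LP)))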