Exemplo n.º 1
0
def updateLPWithResp(LP,
                     Data,
                     Lik,
                     Prior,
                     sumRespTilde,
                     cslice=(0, None),
                     doSparseOnlyAtFinalLP=0,
                     nnzPerRowLP=0):
    ''' Compute assignment responsibilities given output of local step.

    Each row of Lik is scaled in-place by the Prior entry of the document
    that owns the row. The result is then either normalized densely
    (stored as LP['resp']) or sparsified (stored as LP['spR']).

    Args
    ----
    LP : dict
        Has other fields like 'E_log_soft_ev'
    Data : DataObj
        Must provide doc_range; vocab_size is read in the Bernoulli case.
    Lik : 2D array, size N x K
        Will be overwritten and turned into resp.
    Prior : indexable by document position within the slice
        Prior[d] is multiplied into every row belonging to document d.
        Presumably a 2D array with one length-K row per document
        -- TODO confirm against caller.
    sumRespTilde : 1D array
        Divides the rows of resp in the dense branch (one entry per row).
        Not used in the sparse branch.
    cslice : tuple
        cslice[0] is the index of the first document of the slice within
        Data.doc_range. The whole tuple is passed to calcNumDocFromSlice.
    doSparseOnlyAtFinalLP : int
        If truthy, and 0 < nnzPerRowLP < K, produce sparse output.
    nnzPerRowLP : int
        Passed to sparsifyResp as nnzPerRow in the sparse branch.

    Returns
    -------
    LP : dict
        Dense branch: adds field 'resp' : N x K 2D array (same memory
        as Lik), with every entry at least 1e-300.
        Sparse branch: adds fields 'spR' and 'nnzPerRow', removes 'resp',
        then calls fillInDocTopicCountFromSparseResp(Data, LP).
    '''
    # Create resp array directly from Lik array.
    # Do not make any copies, to save memory.
    LP['resp'] = Lik
    resp = LP['resp']
    nDoc = calcNumDocFromSlice(Data, cslice)
    firstDoc = cslice[0]
    slice_start = Data.doc_range[firstDoc]
    N, K = resp.shape[0], resp.shape[1]
    # range (not xrange) keeps this runnable on both Python 2 and 3.
    if N > Data.doc_range[-1]:
        assert N == nDoc * Data.vocab_size
        # Bernoulli naive case. Quite slow!
        # Every document owns exactly vocab_size consecutive rows.
        for d in range(nDoc):
            rstart = d * Data.vocab_size
            rstop = (d + 1) * Data.vocab_size
            resp[rstart:rstop] *= Prior[d]
    else:
        # Usual case. Quite fast!
        # Row offsets are taken relative to the first row of the slice.
        for d in range(nDoc):
            start = Data.doc_range[firstDoc + d] - slice_start
            stop = Data.doc_range[firstDoc + d + 1] - slice_start
            resp[start:stop] *= Prior[d]
    if doSparseOnlyAtFinalLP and (nnzPerRowLP > 0 and nnzPerRowLP < K):
        # NOTE(review): the assert below implies sparsifyResp returns
        # rows that already sum to one -- confirm in its definition.
        LP['spR'] = sparsifyResp(LP['resp'], nnzPerRow=nnzPerRowLP)
        LP['nnzPerRow'] = nnzPerRowLP
        assert np.allclose(LP['spR'].sum(axis=1), 1.0)
        del LP['resp']
        # Floor the stored values at 1e-300.
        np.maximum(LP['spR'].data, 1e-300, out=LP['spR'].data)
        fillInDocTopicCountFromSparseResp(Data, LP)
    else:
        LP['resp'] /= sumRespTilde[:, np.newaxis]
        # Floor the values at 1e-300.
        np.maximum(LP['resp'], 1e-300, out=LP['resp'])
    # Time consuming:
    # >>> assert np.allclose(LP['resp'].sum(axis=1), 1.0)
    return LP
Exemplo n.º 2
0
def _make_kwarg_dict(X=None, R=None, nnzPerRow=2, N=100, K=3, D=2):
    ''' Assemble a dict of dense and sparse inputs sharing the same values.

    When X is None, an N x D array is drawn from np.random.randn.
    When R is None, an N x K array is drawn from np.random.rand, squared
    elementwise, and rescaled so each row sums to one.
    R is then passed through sparsifyResp, and the dense R in the result
    is rebuilt from that sparse matrix with a floor of 1e-100.

    Returns
    -------
    dict with keys 'X', 'R', 'spR_csc', 'spR_csr', 'nnzPerRow'
    '''
    # Draw X before R so the random stream is consumed in a fixed order.
    X = np.random.randn(N, D) if X is None else X
    if R is None:
        R = np.square(np.random.rand(N, K))
        R = R / R.sum(axis=1)[:, np.newaxis]
    # Sparsify R
    csrForm = sparsifyResp(R, nnzPerRow)
    cscForm = csrForm.tocsc()
    denseR = cscForm.toarray()
    # avoid NaN values
    np.maximum(denseR, 1e-100, out=denseR)
    return {
        'X': X,
        'R': denseR,
        'spR_csc': cscForm,
        'spR_csr': csrForm,
        'nnzPerRow': nnzPerRow,
    }
Exemplo n.º 3
0
 def save_batch_local_params_to_memory(self, batchID, batchLP):
     ''' Store the 'DocTopicCount' field of the provided local params dict
           into "memory" for later retrieval.

     Only 'DocTopicCount' is kept; every other field is dropped.
     The caller's dict is never modified. Nothing is stored when the
     'DocTopicCount' field is absent.

     How the field is stored depends on
     self.algParams['doMemoizeLocalParams']:
       1 : self.LPmemory[batchID] holds a dict with only 'DocTopicCount'.
       2 : DocTopicCount is sparsified and written to an .npz file under
           /ltmp/ (if that directory exists) or /tmp/, and
           self.LPmemory[batchID] holds the path of that file.
     Any other value stores nothing.
     '''
     if 'DocTopicCount' not in batchLP:
         return
     # Build a fresh one-field dict rather than deleting keys from a copy
     # while iterating over it (that raises RuntimeError on Python 3).
     batchLP = dict(DocTopicCount=batchLP['DocTopicCount'])
     memoMode = self.algParams['doMemoizeLocalParams']
     if memoMode == 1:
         self.LPmemory[batchID] = batchLP
     elif memoMode == 2:
         ElapsedTimeLogger.startEvent('io', 'savelocal')
         nnzPerDoc = self.algParams['nnzPerDocForStorage']
         spDTC = sparsifyResp(batchLP['DocTopicCount'], nnzPerDoc)
         # Scale the kept entries of each document by that document's
         # total count. Presumably sparsifyResp returns row-normalized
         # values with exactly nnzPerDoc entries per row -- TODO confirm.
         wc_D = batchLP['DocTopicCount'].sum(axis=1)
         wc_U = np.repeat(wc_D, nnzPerDoc)
         spDTC.data *= wc_U
         savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
         if os.path.exists('/ltmp/'):
             savepath = '/ltmp/%s/' % (savepath)
         else:
             savepath = '/tmp/%s/' % (savepath)
         # Create the directory tree if needed. os.makedirs replaces
         # distutils' mkpath, since distutils is gone in Python 3.12+.
         try:
             os.makedirs(savepath)
         except OSError:
             # An already-existing directory is fine; anything else is not.
             if not os.path.isdir(savepath):
                 raise
         savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
         # Now actually save it!
         np.savez(savepath,
             data=spDTC.data,
             indices=spDTC.indices,
             D=spDTC.shape[0],
             K=spDTC.shape[1],
             nnzPerDoc=spDTC.indptr[1])
         self.LPmemory[batchID] = savepath
         ElapsedTimeLogger.stopEvent('io', 'savelocal')