def updateLPWithResp(LP, Data, Lik, Prior, sumRespTilde, cslice=(0, None),
                     doSparseOnlyAtFinalLP=0, nnzPerRowLP=0):
    ''' Compute assignment responsibilities given output of local step.

    Each document's rows of Lik are scaled in place by that document's
    Prior entry. The result is then either normalized row-wise (dense
    case) or converted to a sparse representation (sparse case).

    Args
    ----
    LP : dict
        Has other fields like 'E_log_soft_ev'. Updated in place.
    Data : DataObj
        Must provide `doc_range`; `vocab_size` is also read in the
        Bernoulli naive case.
    Lik : 2D array, size N x K
        Will be overwritten and turned into resp.
    Prior : indexable by document position within the slice
        Prior[d] is multiplied into every row belonging to document d.
        Presumably a 2D array of size nDoc x K -- TODO confirm with caller.
    sumRespTilde : 1D array
        Per-row normalizer; each row of resp is divided by its entry.
        Only used in the dense case.
    cslice : tuple
        Document slice. cslice[0] is the index of the first document;
        the full tuple is forwarded to calcNumDocFromSlice.
    doSparseOnlyAtFinalLP : int
        If truthy and 0 < nnzPerRowLP < K, produce sparse output.
    nnzPerRowLP : int
        Forwarded to sparsifyResp as `nnzPerRow` in the sparse case.

    Returns
    -------
    LP : dict
        Dense case: adds field 'resp' : N x K 2D array (same memory as Lik),
        floored at 1e-300.
        Sparse case: adds fields 'spR' and 'nnzPerRow', removes 'resp',
        and passes LP through fillInDocTopicCountFromSparseResp.
    '''
    # Create resp array directly from Lik array.
    # Do not make any copies, to save memory.
    LP['resp'] = Lik
    nDoc = calcNumDocFromSlice(Data, cslice)
    slice_start = Data.doc_range[cslice[0]]
    N = LP['resp'].shape[0]
    K = LP['resp'].shape[1]
    if N > Data.doc_range[-1]:
        assert N == nDoc * Data.vocab_size
        # Bernoulli naive case. Quite slow!
        # Every document owns exactly vocab_size consecutive rows.
        for d in range(nDoc):
            rstart = d * Data.vocab_size
            rstop = (d + 1) * Data.vocab_size
            LP['resp'][rstart:rstop] *= Prior[d]
    else:
        # Usual case. Quite fast!
        # Row offsets are relative to the first document of the slice.
        for d in range(nDoc):
            start = Data.doc_range[cslice[0] + d] - slice_start
            stop = Data.doc_range[cslice[0] + d + 1] - slice_start
            LP['resp'][start:stop] *= Prior[d]

    if doSparseOnlyAtFinalLP and (0 < nnzPerRowLP < K):
        LP['spR'] = sparsifyResp(LP['resp'], nnzPerRow=nnzPerRowLP)
        LP['nnzPerRow'] = nnzPerRowLP
        assert np.allclose(LP['spR'].sum(axis=1), 1.0)
        # The dense array is dropped from LP once the sparse form exists.
        del LP['resp']
        # Floor the stored values so none is smaller than 1e-300.
        np.maximum(LP['spR'].data, 1e-300, out=LP['spR'].data)
        fillInDocTopicCountFromSparseResp(Data, LP)
    else:
        LP['resp'] /= sumRespTilde[:, np.newaxis]
        np.maximum(LP['resp'], 1e-300, out=LP['resp'])
        # Time consuming:
        # >>> assert np.allclose(LP['resp'].sum(axis=1), 1.0)
    return LP
def _make_kwarg_dict(X=None, R=None, nnzPerRow=2, N=100, K=3, D=2):
    ''' Assemble a dict of data and responsibilities in dense and sparse form.

    Args
    ----
    X : 2D array, optional
        If None, drawn as N x D standard-normal values.
    R : 2D array, optional
        If None, drawn as N x K uniform values, squared elementwise,
        then normalized so each row sums to one.
    nnzPerRow : int
        Passed through to sparsifyResp.
    N, K, D : int
        Sizes used only when X or R must be generated.

    Returns
    -------
    kwargs : dict
        Fields 'X', 'R', 'spR_csc', 'spR_csr', 'nnzPerRow'.
        'R' is the dense array recovered from the sparsified matrix,
        floored at 1e-100.
    '''
    # Draw X before R so the random stream is consumed in a fixed order.
    if X is None:
        X = np.random.randn(N, D)
    if R is None:
        R = np.square(np.random.rand(N, K))
        R /= R.sum(axis=1, keepdims=True)

    # Sparsify, then rebuild the dense array from the sparse result.
    csr_resp = sparsifyResp(R, nnzPerRow)
    csc_resp = csr_resp.tocsc()
    dense_resp = csc_resp.toarray()
    # Floor the entries to avoid NaN values downstream.
    np.maximum(dense_resp, 1e-100, out=dense_resp)

    return {
        'X': X,
        'R': dense_resp,
        'spR_csc': csc_resp,
        'spR_csr': csr_resp,
        'nnzPerRow': nnzPerRow,
    }
def save_batch_local_params_to_memory(self, batchID, batchLP):
    ''' Store certain fields of the provided local parameters dict
        into "memory" for later retrieval.

    Only the 'DocTopicCount' field is kept; every other field is dropped.
    The caller's dict is not modified.

    Behavior depends on self.algParams['doMemoizeLocalParams']:
    * 1 : the filtered dict is stored in self.LPmemory[batchID].
    * 2 : a sparsified DocTopicCount is written to an .npz file under
          /ltmp/ (if that directory exists) or /tmp/, and the file path
          is stored in self.LPmemory[batchID].
    * any other value : nothing is stored.

    Args
    ----
    batchID : int
        Key into self.LPmemory; also used in the saved file name.
    batchLP : dict
        Local parameters for this batch.

    Returns
    -------
    None
    '''
    # Build a filtered copy instead of deleting keys while iterating:
    # deleting during iteration over .keys() raises RuntimeError on Python 3.
    batchLP = {key: val for key, val in batchLP.items()
               if key == 'DocTopicCount'}
    if not batchLP:
        return

    memoMode = self.algParams['doMemoizeLocalParams']
    if memoMode == 1:
        self.LPmemory[batchID] = batchLP
    elif memoMode == 2:
        ElapsedTimeLogger.startEvent('io', 'savelocal')
        nnzPerDoc = self.algParams['nnzPerDocForStorage']
        spDTC = sparsifyResp(batchLP['DocTopicCount'], nnzPerDoc)
        # Rescale each stored entry by its document's total count.
        # Presumably sparsifyResp returns rows normalized to sum to one,
        # with exactly nnzPerDoc entries per row -- TODO confirm.
        wc_D = batchLP['DocTopicCount'].sum(axis=1)
        wc_U = np.repeat(wc_D, nnzPerDoc)
        spDTC.data *= wc_U

        savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
        if os.path.exists('/ltmp/'):
            savepath = '/ltmp/%s/' % (savepath)
        else:
            savepath = '/tmp/%s/' % (savepath)
        # Create the directory tree, tolerating one that already exists.
        # (Replaces distutils.dir_util.mkpath; distutils is gone in 3.12.)
        try:
            os.makedirs(savepath)
        except OSError:
            if not os.path.isdir(savepath):
                raise
        savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
        # Now actually save it!
        np.savez(savepath,
                 data=spDTC.data,
                 indices=spDTC.indices,
                 D=spDTC.shape[0],
                 K=spDTC.shape[1],
                 nnzPerDoc=spDTC.indptr[1])
        self.LPmemory[batchID] = savepath
        del batchLP
        del spDTC
        ElapsedTimeLogger.stopEvent('io', 'savelocal')