Example #1
0
    def calc_local_params(self,
                          Data,
                          LP=None,
                          doLogElapsedTime=False,
                          **kwargs):
        ''' Compute local (per-item) parameters for the given dataset.

        This performs the E-step of the EM algorithm: the observation
        model first fills in soft evidence for every item, then the
        allocation model converts that evidence into responsibilities.

        Returns
        -------
        LP : dict of local parameters, including
            'E_log_soft_ev' : N x K array of soft evidence
            'resp' : N x K array whose rows sum to one
        '''
        LP = dict() if LP is None else LP
        timed = doLogElapsedTime
        if timed:
            ElapsedTimeLogger.startEvent('local', 'obsupdate')
        # Observation model: soft evidence each component has for each item.
        LP = self.obsModel.calc_local_params(Data, LP, **kwargs)
        if timed:
            ElapsedTimeLogger.stopEvent('local', 'obsupdate')
            ElapsedTimeLogger.startEvent('local', 'allocupdate')
        # Allocation model: combine with cluster probabilities to get resp.
        LP = self.allocModel.calc_local_params(Data, LP, **kwargs)
        if timed:
            ElapsedTimeLogger.stopEvent('local', 'allocupdate')
        return LP
Example #2
0
    def load_batch_local_params_from_memory(self, batchID, doCopy=0):
        ''' Retrieve the local-parameter dict cached for the given batch.

        TODO: Fastforward so recent truncation changes are accounted for.

        Returns
        -------
        batchLP : dict of local parameters specific to batchID
        '''
        batchLP = self.LPmemory[batchID]
        # Entries stored on disk are recorded as filesystem path strings.
        if isinstance(batchLP, str):
            ElapsedTimeLogger.startEvent('io', 'loadlocal')
            diskPath = os.path.abspath(batchLP)
            assert os.path.exists(diskPath)
            # NOTE(review): allow_pickle=True is only safe because these
            # files were written by this process -- do not point it at
            # untrusted data.
            F = np.load(diskPath, allow_pickle=True)
            # Every document stores exactly nnzPerDoc entries, so the CSR
            # row-pointer array is a simple arithmetic progression.
            rowptr = np.arange(0, (F['D'] + 1) * F['nnzPerDoc'],
                               F['nnzPerDoc'])
            denseDTC = scipy.sparse.csr_matrix(
                (F['data'], F['indices'], rowptr),
                shape=(F['D'], F['K'])).toarray()
            batchLP = dict(DocTopicCount=denseDTC)
            ElapsedTimeLogger.stopEvent('io', 'loadlocal')
        if doCopy:
            # Deep-copy so callers cannot mutate the raw cached data.
            # Usually for debugging only.
            batchLP = copy.deepcopy(batchLP)
        return batchLP
Example #3
0
    def eval_custom_func(self, isFinal=0, isInitial=0, lapFrac=0, **kwargs):
        ''' Evaluate a user-supplied callback hook, if one is configured.

        The hook is located via outputParams['customFuncPath'], which may
        be a filesystem path to a .py file, a bare module name resolved
        inside bnpy.callbacks, or an already-imported module object. The
        module may define any of onBatchComplete / onLapComplete /
        onAlgorithmComplete; which of them fire depends on isFinal,
        isInitial, and lapFrac.

        Returns
        -------
        None. Returns immediately when no callback path is configured.

        Raises
        ------
        ValueError
            If the resolved module defines none of the three callbacks.
        '''

        cFuncPath = self.outputParams['customFuncPath']
        # Treat both a missing setting and the literal string 'None'
        # (e.g. produced by command-line parsing) as "no callback".
        if cFuncPath is None or cFuncPath == 'None':
            return None

        cbName = str(cFuncPath)
        ElapsedTimeLogger.startEvent('callback', cbName)

        cFuncArgs_string = self.outputParams['customFuncArgs']
        # NOTE(review): nLapTotal is never read below; kwargs['nLap'] is
        # set separately from the same value.
        nLapTotal = self.algParams['nLap']
        if isinstance(cFuncPath, str):
            cFuncPath = cFuncPath.replace(".py", "")
            pathParts = cFuncPath.split(os.path.sep)
            if len(pathParts) > 1:
                # Absolute path provided: append the containing directory
                # to sys.path, then import the file by its bare name.
                cFuncDir = os.path.expandvars(os.path.sep.join(pathParts[:-1]))
                sys.path.append(cFuncDir)
                cFuncModName = pathParts[-1]
                cFuncModule = __import__(cFuncModName, fromlist=[])
            else:
                # Treat as relative path to file in bnpy.callbacks
                # NOTE(review): the trailing '.' in the module name looks
                # suspicious -- confirm __import__ accepts it on all
                # supported Python versions.
                cFuncModule = __import__(
                    'bnpy.callbacks.', fromlist=[cFuncPath])
                cFuncModule = getattr(cFuncModule, cFuncPath)
        else:
            cFuncModule = cFuncPath  # directly passed in as object

        # Forward progress information to the callback via keyword args.
        kwargs['nLap'] = self.algParams['nLap']
        kwargs['lapFrac'] = lapFrac
        kwargs['isFinal'] = isFinal
        kwargs['isInitial'] = isInitial
        if isInitial:
            # At initialization, report lap/iteration zero regardless of
            # the lapFrac argument passed in.
            kwargs['lapFrac'] = 0
            kwargs['iterid'] = 0

        hasCBFuncs = hasattr(cFuncModule, 'onBatchComplete') or \
            hasattr(cFuncModule, 'onLapComplete') or \
            hasattr(cFuncModule, 'onAlgorithmComplete')
        if not hasCBFuncs:
            raise ValueError("Specified customFuncPath has no callbacks!")
        if hasattr(cFuncModule, 'onBatchComplete') and not isFinal:
            cFuncModule.onBatchComplete(args=cFuncArgs_string, **kwargs)
        # Lap-complete fires only on whole-lap boundaries (integer lapFrac).
        if hasattr(cFuncModule, 'onLapComplete') \
           and isEvenlyDivisibleFloat(lapFrac, 1.0) and not isFinal:
            cFuncModule.onLapComplete(args=cFuncArgs_string, **kwargs)
        if hasattr(cFuncModule, 'onAlgorithmComplete') \
           and isFinal:
            cFuncModule.onAlgorithmComplete(args=cFuncArgs_string, **kwargs)
        ElapsedTimeLogger.stopEvent('callback', cbName)
Example #4
0
 def update_global_params(self, SS, rho=None,
         doLogElapsedTime=False,
         **kwargs):
     ''' Update (in-place) global parameters given provided suff stats.

     This is the M-step of EM: the allocation model updates first,
     then the observation model, each consuming the same suff stats
     and optional step size rho.
     '''
     timed = doLogElapsedTime
     if timed:
         ElapsedTimeLogger.startEvent('global', 'alloc')
     self.allocModel.update_global_params(SS, rho, **kwargs)
     if timed:
         ElapsedTimeLogger.stopEvent('global', 'alloc')
         ElapsedTimeLogger.startEvent('global', 'obs')
     self.obsModel.update_global_params(SS, rho, **kwargs)
     if timed:
         ElapsedTimeLogger.stopEvent('global', 'obs')
Example #5
0
    def calc_evidence(self,
                      Data=None,
                      SS=None,
                      LP=None,
                      scaleFactor=None,
                      todict=False,
                      doLogElapsedTime=False,
                      **kwargs):
        ''' Compute the evidence lower bound (ELBO) objective function.

        When only Data is given, the local parameters and suff stats
        are computed first. The final value is divided by scaleFactor,
        taken from SS or the observation model when not provided.

        Returns
        -------
        Scalar ELBO value, or a dict of named ELBO terms when
        todict is True.
        '''
        if doLogElapsedTime:
            ElapsedTimeLogger.startEvent('global', 'ev')

        needLocalStep = Data is not None and LP is None and SS is None
        if needLocalStep:
            LP = self.calc_local_params(Data, **kwargs)
            SS = self.get_global_suff_stats(Data, LP)
        evA = self.allocModel.calc_evidence(
            Data, SS, LP, todict=todict, **kwargs)
        evObs = self.obsModel.calc_evidence(
            Data, SS, LP, todict=todict, **kwargs)
        if scaleFactor is None:
            if hasattr(SS, 'scaleFactor'):
                scaleFactor = SS.scaleFactor
            else:
                scaleFactor = self.obsModel.getDatasetScale(SS)

        if doLogElapsedTime:
            ElapsedTimeLogger.stopEvent('global', 'ev')

        if not todict:
            return (evA + evObs) / scaleFactor
        evA.update(evObs)
        for key in evA:
            evA[key] /= scaleFactor
        # Total only the unique base terms; subdivided entries such as
        # Lalloc_top_term1 / Lalloc_top_term2 are skipped because they
        # are expected to be already aggregated into Lalloc.
        baseKeys = list(set(key.split('_')[0] for key in evA.keys()))
        evA['Ltotal'] = sum(evA[key] for key in baseKeys)
        return evA
Example #6
0
    def get_global_suff_stats(self, Data, LP,
            doLogElapsedTime=False,
            **kwargs):
        ''' Summarize data and local parameters into sufficient statistics.

        The allocation model builds the initial suff-stat bag, which the
        observation model then augments. This is the necessary prep step
        before the global (M-step) parameter update.

        Returns
        -------
        SS : sufficient statistics covering every component
        '''
        timed = doLogElapsedTime
        if timed:
            ElapsedTimeLogger.startEvent('local', 'allocsummary')
        SS = self.allocModel.get_global_suff_stats(Data, LP, **kwargs)
        if timed:
            ElapsedTimeLogger.stopEvent('local', 'allocsummary')
            ElapsedTimeLogger.startEvent('local', 'obssummary')
        SS = self.obsModel.get_global_suff_stats(Data, SS, LP, **kwargs)
        if timed:
            ElapsedTimeLogger.stopEvent('local', 'obssummary')
        return SS
Example #7
0
 def saveParams(self, lap, hmodel, SS=None, **kwargs):
     ''' Snapshot the current model to disk, at most once per lap.

     Does nothing when this lap was already saved or when no output
     path is configured.
     '''
     if self.task_output_path is None or lap in self.SavedIters:
         return
     ElapsedTimeLogger.startEvent("io", "saveparams")
     self.SavedIters.add(lap)
     prefix = ModelWriter.makePrefixForLap(lap)
     # Append lap number and elapsed wall-clock time to the snapshot logs.
     snapshots = (
         ('snapshot_lap.txt', '%.4f\n' % (lap)),
         ('snapshot_elapsed_time_sec.txt',
          '%.3f\n' % (self.get_elapsed_time())),
     )
     for fileName, text in snapshots:
         with open(self.mkfile(fileName), 'a') as f:
             f.write(six.text_type(text))
     if self.outputParams['doSaveFullModel']:
         ModelWriter.save_model(
             hmodel, self.task_output_path, prefix,
             doSavePriorInfo=np.allclose(lap, 0.0),
             doLinkBest=True,
             doSaveObsModel=self.outputParams['doSaveObsModel'])
     if self.outputParams['doSaveTopicModel']:
         ModelWriter.saveTopicModel(
             hmodel, SS, self.task_output_path, prefix, **kwargs)
     ElapsedTimeLogger.stopEvent("io", "saveparams")
Example #8
0
 def save_batch_local_params_to_memory(self, batchID, batchLP):
     ''' Store the DocTopicCount field of batchLP for later retrieval.

     Storage target depends on algParams['doMemoizeLocalParams']:
         1 : keep the filtered dict directly in self.LPmemory
         2 : sparsify DocTopicCount, write it to a local .npz file,
             and remember only the file path in self.LPmemory
     Fields to save determined by the memoLPkeys attribute of this alg.
     '''
     batchLP = dict(**batchLP)  # shallow copy; caller's dict untouched
     # Keep only the DocTopicCount field.
     for key in list(batchLP.keys()):
         if key != 'DocTopicCount':
             del batchLP[key]
     if len(batchLP) == 0:
         return
     if self.algParams['doMemoizeLocalParams'] == 1:
         self.LPmemory[batchID] = batchLP
     elif self.algParams['doMemoizeLocalParams'] == 2:
         ElapsedTimeLogger.startEvent('io', 'savelocal')
         nnzPerDoc = self.algParams['nnzPerDocForStorage']
         spDTC = sparsifyResp(batchLP['DocTopicCount'], nnzPerDoc)
         # Rescale sparse entries so each row keeps its original total.
         wc_D = batchLP['DocTopicCount'].sum(axis=1)
         wc_U = np.repeat(wc_D, nnzPerDoc)
         spDTC.data *= wc_U
         savepath = self.savedir.replace(os.environ['BNPYOUTDIR'], '')
         if os.path.exists('/ltmp/'):
             savepath = '/ltmp/%s/' % (savepath)
         else:
             savepath = '/tmp/%s/' % (savepath)
         # os.makedirs replaces distutils.dir_util.mkpath, which was
         # deprecated by PEP 632 and removed in Python 3.12.
         os.makedirs(savepath, exist_ok=True)
         savepath = os.path.join(savepath, 'batch%d.npz' % (batchID))
         # Now actually save it!
         np.savez(savepath,
                  data=spDTC.data,
                  indices=spDTC.indices,
                  D=spDTC.shape[0],
                  K=spDTC.shape[1],
                  nnzPerDoc=spDTC.indptr[1])
         self.LPmemory[batchID] = savepath
         # Release the large temporaries before returning.
         del batchLP
         del spDTC
         ElapsedTimeLogger.stopEvent('io', 'savelocal')