def writeLogMessageForManyDocs(Data, AI, LP,
                               sliceID=None,
                               **kwargs):
    """ Write log message summarizing convergence behavior across docs.

    Args
    ----
    Data : bnpy DataObj
    AI : dict of aggregated info for all documents.
    LP : dict of local parameters, with field 'DocTopicCount'.

    Post Condition
    --------------
    Message written to LocalStepLogger.
    """
    if 'lapFrac' not in kwargs:
        return
    if 'batchID' not in kwargs:
        return

    if isinstance(sliceID, int):
        sliceID = '%d' % (sliceID)
    else:
        sliceID = '0'

    perc = [0, 1, 10, 50, 90, 99, 100]
    siter = ' '.join(
        ['%d:%d' % (p, np.percentile(AI['iter'], p)) for p in perc])
    sdiff = ' '.join(
        ['%d:%.4f' % (p, np.percentile(AI['maxDiff'], p)) for p in perc])
    nConverged = np.sum(AI['maxDiff'] <= kwargs['convThrLP'])

    msg = 'lap %4.2f batch %d slice %s' % (
        kwargs['lapFrac'], kwargs['batchID'], sliceID)
    msg += ' nConverged %4d/%d' % (nConverged, AI['maxDiff'].size)
    worstDocID = np.argmax(AI['maxDiff'])
    msg += " worstDocID %4d \n" % (worstDocID)
    msg += ' iter prctiles %s\n' % (siter)
    msg += ' diff prctiles %s\n' % (sdiff)

    KactivePerDoc = np.sum(LP['DocTopicCount'] > .01, axis=1)
    sKactive = ' '.join(
        ['%d:%d' % (p, np.percentile(KactivePerDoc, p)) for p in perc])
    msg += ' Kact prctiles %s\n' % (sKactive)

    if 'nRestartsAccepted' in AI and AI['nRestartsAccepted'] is not None:
        msg += " nRestarts %4d/%4d\n" % (
            AI['nRestartsAccepted'], AI['nRestartsTried'])
    LocalStepLogger.log(msg)
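
# Example usage (a hypothetical sketch, not part of bnpy): `AI` is assumed
# to hold per-document numpy arrays under 'iter' and 'maxDiff', and `LP` a
# 'DocTopicCount' array of shape (nDoc, K). The toy values below are
# illustrations only. Note that `Data` is unused by the message itself, and
# LocalStepLogger.configure(...) must have been called first for the
# message to land anywhere.
#
#   AI = dict(
#       iter=np.array([3, 5, 9, 25]),
#       maxDiff=np.array([1e-5, 2e-4, 3e-3, 0.05]),
#       nRestartsAccepted=None)
#   LP = dict(DocTopicCount=np.random.rand(4, 10))
#   writeLogMessageForManyDocs(
#       Data, AI, LP, sliceID=0,
#       lapFrac=1.0, batchID=0, convThrLP=0.001)
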
def _run_task_internal(jobname, taskid, nTask,
                       ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)

    Executes learning for a particular job and particular taskid.

    Returns
    -------
    hmodel : bnpy HModel, fit to the data
    RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of input dicts, so that any modifications here
    # do not propagate back to the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(
                dataName, ReqArgs, KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs,
                                      dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)

        if algName in OnlineDataAlgSet:
            OnlineDataArgs = KwArgs['OnlineDataPrefs']
            OnlineDataArgs['dataorderseed'] = dataorderseed
            DataArgs = getKwArgsForLoadData(Data, UnkArgs)
            OnlineDataArgs.update(DataArgs)  # add custom args
            Data = Data.to_iterator(**OnlineDataArgs)

    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name

    if doSaveToDisk:
        task_output_path = make_task_output_path(
            ReqArgs, KwArgs, taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path

    jobID = configLoggingToConsoleAndFile(
        task_output_path, taskid, doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)

        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(
        Data, hmodel, ReqArgs, KwArgs,
        algseed=algseed, task_output_path=task_output_path)

    # Configure loggers for any requested birth/delete/merge/shuffle moves
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if str(type(hmodel.allocModel)).count('TopicModel'):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(
            task_output_path, doSaveToDisk, doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(
        task_output_path, KwArgs['MoveNames'], doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d'
        % (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
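
# Hypothetical caller sketch (illustration only; end users should invoke
# bnpy.run rather than this internal helper). The argument dicts below are
# stand-ins for a full bnpy configuration, not a complete one:
#
#   hmodel, RunInfo = _run_task_internal(
#       'myjob', 1, 1, ReqArgs, KwArgs, UnkArgs,
#       dataName='MyDataset', allocModelName='HDPTopicModel',
#       obsModelName='Mult', algName='memoVB',
#       doSaveToDisk=True, doWriteStdOut=True)
#   print(RunInfo['loss'])          # final loss value
#   print(RunInfo['loss_history'])  # loss values over time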