Example #1
import numpy as np

import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger


def writeLogMessageForManyDocs(Data, AI, LP, sliceID=None, **kwargs):
    """ Write log message summarizing convergence behavior across docs.

    Args
    ----
    Data : bnpy DataObj
    AI : dict of aggregated info across all documents.
    LP : dict of local parameters, with field 'DocTopicCount'.
    sliceID : int or None, identifies the current slice (defaults to 0).
    kwargs : must contain 'lapFrac', 'batchID', and 'convThrLP'
        for a message to be written.

    Post Condition
    --------------
    Message written to LocalStepLogger.
    """
    if 'lapFrac' not in kwargs:
        return
    if 'batchID' not in kwargs:
        return

    if isinstance(sliceID, int):
        sliceID = '%d' % (sliceID)
    else:
        sliceID = '0'

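    # Summarize convergence statistics at fixed percentiles across docs,
    # from the best-behaved document (0) to the worst (100).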
    perc = [0, 1, 10, 50, 90, 99, 100]
    siter = ' '.join(
        ['%d:%d' % (p, np.percentile(AI['iter'], p)) for p in perc])
    sdiff = ' '.join(
        ['%d:%.4f' % (p, np.percentile(AI['maxDiff'], p)) for p in perc])
    nConverged = np.sum(AI['maxDiff'] <= kwargs['convThrLP'])
    msg = 'lap %4.2f batch %d slice %s' % (kwargs['lapFrac'],
                                           kwargs['batchID'], sliceID)

    msg += ' nConverged %4d/%d' % (nConverged, AI['maxDiff'].size)
    worstDocID = np.argmax(AI['maxDiff'])
    msg += " worstDocID %4d \n" % (worstDocID)

    msg += ' iter prctiles %s\n' % (siter)
    msg += ' diff prctiles %s\n' % (sdiff)

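    # Count topics with non-negligible mass (> 0.01) in each document.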
    KactivePerDoc = np.sum(LP['DocTopicCount'] > .01, axis=1)
    sKactive = ' '.join(
        ['%d:%d' % (p, np.percentile(KactivePerDoc, p)) for p in perc])
    msg += ' Kact prctiles %s\n' % (sKactive)

    if 'nRestartsAccepted' in AI and AI['nRestartsAccepted'] is not None:
        msg += " nRestarts %4d/%4d\n" % (AI['nRestartsAccepted'],
                                         AI['nRestartsTried'])
    LocalStepLogger.log(msg)
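
# A minimal usage sketch for writeLogMessageForManyDocs, with hypothetical
# inputs. The AI and LP contents below are illustrative assumptions, not
# values from a real bnpy run, and LocalStepLogger must already be
# configured (see the configure calls in Example #2) for output to appear.
if __name__ == '__main__':
    nDoc, K = 100, 10
    AI = dict(
        iter=np.random.randint(1, 50, size=nDoc),  # iterations per doc
        maxDiff=np.random.rand(nDoc) * 1e-3,       # final change per doc
        nRestartsAccepted=None,
    )
    LP = dict(DocTopicCount=np.random.rand(nDoc, K))
    writeLogMessageForManyDocs(
        Data=None, AI=AI, LP=LP, sliceID=0,
        lapFrac=1.0, batchID=0, convThrLP=1e-4)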
Example #2
def _run_task_internal(jobname, taskid, nTask, ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)
        Executes learning for a particular job and particular taskid.

        Returns
        -------
        hmodel : bnpy HModel, fit to the data
        RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of input dicts, so that top-level modifications
    # made here do not affect the caller (nested values are still shared).
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

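    # The algorithm seed varies with the jobname, while the data-order seed
    # uses an empty jobname, so every job with the same taskid visits the
    # data in the same order.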
    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

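    # Online (streaming) algorithms take their lap budget from the
    # data-iteration preferences rather than the per-algorithm defaults.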
    if algName in OnlineDataAlgSet:
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(dataName, ReqArgs,
                                                      KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs, dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)
        if algName in OnlineDataAlgSet:
            OnlineDataArgs = KwArgs['OnlineDataPrefs']
            OnlineDataArgs['dataorderseed'] = dataorderseed

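            # Build iterator kwargs, then wrap the in-memory dataset in a
            # batch iterator that honors the fixed data-order seed.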
            DataArgs = getKwArgsForLoadData(Data, UnkArgs)
            OnlineDataArgs.update(DataArgs)  # add custom args
            Data = Data.to_iterator(**OnlineDataArgs)
    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name
    if doSaveToDisk:
        task_output_path = make_task_output_path(ReqArgs,
                                                 KwArgs,
                                                 taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(task_output_path, taskid,
                                          doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)

        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(Data,
                              hmodel,
                              ReqArgs,
                              KwArgs,
                              algseed=algseed,
                              task_output_path=task_output_path)
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if 'TopicModel' in str(type(hmodel.allocModel)):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(task_output_path, doSaveToDisk,
                                  doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(task_output_path, KwArgs['MoveNames'],
                                doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d' %
        (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
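
# A small runnable demonstration of the shallow-copy idiom used at the top
# of _run_task_internal. dict(**d) gives the function its own top-level
# dict, so rebinding keys inside it does not affect the caller; nested
# mutable values are still shared. The helper below is purely illustrative
# and not part of bnpy.
def _demo_shallow_copy(KwArgs):
    KwArgs = dict(**KwArgs)
    KwArgs['algseed'] = 12345  # visible only in the local copy
    return KwArgs

caller_args = {'nLap': 10}
callee_args = _demo_shallow_copy(caller_args)
assert 'algseed' not in caller_args
assert callee_args['algseed'] == 12345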