def birth_select_targets_for_next_lap(self, hmodel, SS, BirthResults):
    ''' Create plans for next lap's birth moves

        Increments the self.LapsSinceLastBirth counter of every current
        component, then tries to select one target component for each of
        the algParams['birth']['birthPerLap'] planned moves.

        Args
        -------
        hmodel : model whose allocModel.K gives the current component count
        SS : sufficient statistics, or None.
             When provided, SS.K must equal hmodel.allocModel.K.
        BirthResults : results of earlier birth moves, forwarded to
                       self.birth_get_all_new_comps to build the exclude list

        Returns
        -------
        BirthPlans : list of dicts,
                     each entry represents the plan for one future birth move.
                     A successful selection gives dict(ktarget=<id>, Data=None).
                     A failed selection gives ktarget=None, Data=None and a
                     msg field holding the text of the BirthProposalError.
    '''
    if SS is not None:
        assert hmodel.allocModel.K == SS.K
    K = hmodel.allocModel.K

    # Update counter for which components haven't been updated in a while
    for kk in range(K):
        self.LapsSinceLastBirth[kk] += 1

    # Ignore components that have just been added to the model.
    excludeList = self.birth_get_all_new_comps(BirthResults)

    # For each birth move, create a "plan"
    BirthPlans = list()
    for _ in range(self.algParams['birth']['birthPerLap']):
        try:
            ktarget = BirthMove.select_birth_component(
                SS, K=K,
                randstate=self.PRNG,
                excludeList=excludeList, doVerbose=False,
                lapsSinceLastBirth=self.LapsSinceLastBirth,
                **self.algParams['birth'])
            # A chosen component restarts its counter and is excluded from
            # the remaining selections of this lap.
            self.LapsSinceLastBirth[ktarget] = 0
            excludeList.append(ktarget)
            Plan = dict(ktarget=ktarget, Data=None)
        except BirthMove.BirthProposalError as e:
            # Keep a placeholder plan so the failure reason can be reported.
            Plan = dict(ktarget=None, Data=None, msg=str(e))

        BirthPlans.append(Plan)

    return BirthPlans
示例#2
0
    def birth_collect_target_subsample(self, Dchunk, LPchunk, BirthPlans):
        ''' Grow each plan's targeted subsample with data drawn from Dchunk.

        The subsample stored in a plan's 'Data' field is built up across
        many batches. A plan is left alone when its component selection
        failed (ktarget is None) or when it already holds enough data:
        nDoc >= maxTargetSize for data exposing nDoc, otherwise
        nObs >= maxTargetObs.

        Returns
        -------
        BirthPlans : the same list that was passed in, with each plan's
                     'Data' and 'msg' fields updated in place where needed
        '''
        import BirthMove

        for Plan in BirthPlans:
            if Plan['ktarget'] is None:
                # Component selection failed, nothing to collect for
                continue

            # Private copy so the size budget can be shrunk for this plan
            sampleArgs = dict(**self.algParams['birth'])
            gathered = Plan['Data']
            if gathered is not None:
                if hasattr(gathered, 'nDoc'):
                    if gathered.nDoc >= self.algParams['birth'][
                            'maxTargetSize']:
                        continue
                    # Request only as many docs as are still missing
                    sampleArgs['maxTargetSize'] -= gathered.nDoc
                elif gathered.nObs >= self.algParams['birth'][
                        'maxTargetObs']:
                    continue

            fresh = BirthMove.subsample_data(
                Dchunk, LPchunk, Plan['ktarget'],
                randstate=self.PRNG, **sampleArgs)

            if fresh is None:
                # Only report a miss when nothing was gathered before
                if gathered is None:
                    Plan[
                        'msg'] = "TargetData: No samples for target comp found."
                continue

            if gathered is None:
                Plan['Data'] = fresh
            else:
                gathered.add_data(fresh)
            Plan['msg'] = "TargetData: nObs %d" % (Plan['Data'].nObs)

        return BirthPlans
示例#3
0
  def run_birth_move(self, hmodel, Data, SS, LP, lap):
    ''' Attempt one birth move on hmodel at the given lap.

        self.BirthLog is reset to an empty list on every call. When
        self.do_birth_at_lap(lap) is false nothing else happens and the
        inputs hmodel and LP are handed back as they were given.

        Returns
        -------
        hmodel : model returned by BirthMove.run_birth_move
                 (the input model when no birth was attempted)
        LP : None after a birth attempt, otherwise the input LP
    '''
    import BirthMove # avoid circular import
    self.BirthLog = list()
    if self.do_birth_at_lap(lap):
      # Choose the component to target, then gather data for it
      targetComp = BirthMove.select_birth_component(
          SS, randstate=self.PRNG, **self.algParams['birth'])
      subset = BirthMove.subsample_data(
          Data, LP, targetComp,
          randstate=self.PRNG, **self.algParams['birth'])

      # The updated suff stats returned by the move are not used here
      hmodel, _, outcome = BirthMove.run_birth_move(
          hmodel, subset, SS, ktarget=targetComp, randstate=self.PRNG,
          **self.algParams['birth'])
      self.print_msg(outcome['msg'])
      self.BirthLog.extend(outcome['birthCompIDs'])
      # Local params are discarded after a birth attempt
      LP = None
    return hmodel, LP
  def birth_collect_target_subsample(self, Dchunk, LPchunk, BirthPlans):
    ''' Add data from the batch Dchunk to each plan's targeted subsample.

        Each plan accumulates its 'Data' field over many batches. Plans
        whose component selection failed (ktarget is None) are skipped,
        as are plans that already hold enough data: nDoc >= maxTargetSize
        when the data exposes nDoc, otherwise nObs >= maxTargetObs.

        Returns
        -------
        BirthPlans : the input list, whose plans have had their 'Data'
                     and 'msg' fields updated in place where needed
    '''
    import BirthMove

    for Plan in BirthPlans:
      if Plan['ktarget'] is None:
        # Selection failed for this move, so there is no target to sample
        continue

      # Work on a copy: the remaining budget is adjusted per plan
      sampleArgs = dict(**self.algParams['birth'])
      soFar = Plan['Data']
      if soFar is not None:
        if hasattr(soFar, 'nDoc'):
          if soFar.nDoc >= self.algParams['birth']['maxTargetSize']:
            continue
          # Ask only for the number of docs still missing
          sampleArgs['maxTargetSize'] -= soFar.nDoc
        elif soFar.nObs >= self.algParams['birth']['maxTargetObs']:
          continue

      newSample = BirthMove.subsample_data(
          Dchunk, LPchunk, Plan['ktarget'],
          randstate=self.PRNG, **sampleArgs)

      if newSample is None:
        # A miss is only reported while the plan is still empty
        if soFar is None:
          Plan['msg'] = "TargetData: No samples for target comp found."
        continue

      if soFar is None:
        Plan['Data'] = newSample
      else:
        soFar.add_data(newSample)
      Plan['msg'] = "TargetData: nObs %d" % (Plan['Data'].nObs)

    return BirthPlans
示例#5
0
    def birth_select_targets_for_next_lap(self, hmodel, SS, BirthResults):
        ''' Create plans for next lap's birth moves

        Increments the self.LapsSinceLastBirth counter of every current
        component, then tries to select one target component for each of
        the algParams['birth']['birthPerLap'] planned moves.

        Args
        -------
        hmodel : model whose allocModel.K gives the current component count
        SS : sufficient statistics, or None.
             When provided, SS.K must equal hmodel.allocModel.K.
        BirthResults : results of earlier birth moves, forwarded to
                       self.birth_get_all_new_comps to build the exclude list

        Returns
        -------
        BirthPlans : list of dicts,
                     each entry represents the plan for one future birth move.
                     A successful selection gives dict(ktarget=<id>, Data=None).
                     A failed selection gives ktarget=None, Data=None and a
                     msg field holding the text of the BirthProposalError.
        '''
        if SS is not None:
            assert hmodel.allocModel.K == SS.K
        K = hmodel.allocModel.K

        # Update counter for which components haven't been updated in a while
        for kk in range(K):
            self.LapsSinceLastBirth[kk] += 1

        # Ignore components that have just been added to the model.
        excludeList = self.birth_get_all_new_comps(BirthResults)

        # For each birth move, create a "plan"
        BirthPlans = list()
        for _ in range(self.algParams['birth']['birthPerLap']):
            try:
                ktarget = BirthMove.select_birth_component(
                    SS,
                    K=K,
                    randstate=self.PRNG,
                    excludeList=excludeList,
                    doVerbose=False,
                    lapsSinceLastBirth=self.LapsSinceLastBirth,
                    **self.algParams['birth'])
                # A chosen component restarts its counter and is excluded
                # from the remaining selections of this lap.
                self.LapsSinceLastBirth[ktarget] = 0
                excludeList.append(ktarget)
                Plan = dict(ktarget=ktarget, Data=None)
            except BirthMove.BirthProposalError as e:
                # Keep a placeholder plan so the failure can be reported.
                Plan = dict(ktarget=None, Data=None, msg=str(e))

            BirthPlans.append(Plan)

        return BirthPlans
示例#6
0
    def birth_create_new_comps(self, hmodel, SS, BirthPlans=None, Data=None):
        ''' Create new components

        Runs one birth move per plan and prints a one-line status message
        for each via self.print_msg.

        Args
        -------
        hmodel : bnpy HModel
        SS : bnpy SuffStatBag
        BirthPlans : list of dicts with fields 'ktarget' and 'Data', plus
                     'msg' when either of those is None.
                     None (the default) means no planned moves.
        Data : optional dataset. When provided, BirthPlans is ignored and
               replaced by a single plan with ktarget=-1 whose data is a
               random sample of Data.

        Returns
        -------
        hmodel : bnpy HModel, with (possibly) new components
        SS : bnpy SuffStatBag, with (possibly) new components
        BirthResults : list of MoveInfo dicts, one entry for each birth move
                        whose MoveInfo['didAddNew'] was true
        '''
        # None stands in for "no plans"; avoids a shared mutable default.
        if BirthPlans is None:
            BirthPlans = []

        birthArgs = None
        if Data is not None:
            birthArgs = self.algParams['birth']
            if hasattr(Data, 'nDoc'):
                wordPerDocThr = birthArgs['birthWordsPerDocThr']
                if wordPerDocThr > 0:
                    # Restrict sampling to docs with enough words
                    nWordPerDoc = np.asarray(
                        Data.to_sparse_docword_matrix().sum(axis=1))
                    candidates = np.flatnonzero(nWordPerDoc >= wordPerDocThr)
                else:
                    candidates = None
                targetData = Data.get_random_sample(
                    birthArgs['maxTargetSize'],
                    randstate=self.PRNG,
                    candidates=candidates)
            else:
                targetData = Data.get_random_sample(
                    birthArgs['maxTargetObs'],
                    randstate=self.PRNG)

            # A batch-wide sample has no specific target component
            BirthPlans = [dict(Data=targetData, ktarget=-1)]

        nMoves = len(BirthPlans)
        msgFormat = "%d/%d %s" if Data is None else "%d/%d BATCH %s"
        BirthResults = list()
        for moveID, Plan in enumerate(BirthPlans):
            # Unpack data for current move
            ktarget = Plan['ktarget']
            targetData = Plan['Data']

            if ktarget is None or targetData is None:
                # Planning or data collection failed; report the recorded
                # reason, or a generic one if the plan carries none.
                msg = Plan.get('msg', "BIRTH skipped. No target data.")

            elif targetData.nObs < self.algParams['birth']['minTargetObs']:
                # Verify targetData large enough that birth would be productive
                msg = "BIRTH skipped. Target data too small (size %d)"
                msg = msg % (targetData.nObs)
            elif hasattr(targetData, 'nDoc') \
                    and targetData.nDoc < self.algParams['birth']['minTargetSize']:
                msg = "BIRTH skipped. Target data too small (size %d)"
                msg = msg % (targetData.nDoc)

            else:
                hmodel, SS, MoveInfo = BirthMove.run_birth_move(
                    hmodel,
                    targetData,
                    SS,
                    randstate=self.PRNG,
                    ktarget=ktarget,
                    **self.algParams['birth'])
                msg = MoveInfo['msg']
                if MoveInfo['didAddNew']:
                    BirthResults.append(MoveInfo)

                    # Mark brand-new comps with -1
                    # NOTE(review): presumably so a later +1 increment
                    # brings the counter to 0 -- confirm against caller
                    for kk in MoveInfo['birthCompIDs']:
                        self.LapsSinceLastBirth[kk] = -1

            self.print_msg(msgFormat % (moveID + 1, nMoves, msg))

        return hmodel, SS, BirthResults
  def birth_create_new_comps(self, hmodel, SS, BirthPlans=None, Data=None):
    ''' Create new components

        Runs one birth move per plan and prints a one-line status message
        for each via self.print_msg.

        Args
        -------
        hmodel : bnpy HModel
        SS : bnpy SuffStatBag
        BirthPlans : list of dicts with fields 'ktarget' and 'Data', plus
                     'msg' when either of those is None.
                     None (the default) means no planned moves.
        Data : optional dataset. When provided, BirthPlans is ignored and
               replaced by a single plan with ktarget=-1 whose data is a
               random sample of Data.

        Returns
        -------
        hmodel : bnpy HModel, with (possibly) new components
        SS : bnpy SuffStatBag, with (possibly) new components
        BirthResults : list of MoveInfo dicts, one entry for each birth move
                        whose MoveInfo['didAddNew'] was true
    '''
    # None stands in for "no plans"; avoids a shared mutable default.
    if BirthPlans is None:
      BirthPlans = []

    if Data is not None:
      if hasattr(Data, 'nDoc'):
        wordPerDocThr = self.algParams['birth']['birthWordsPerDocThr']
        if wordPerDocThr > 0:
          # Restrict sampling to docs with enough words
          nWordPerDoc = np.asarray(Data.to_sparse_docword_matrix().sum(axis=1))
          candidates = np.flatnonzero(nWordPerDoc >= wordPerDocThr)
        else:
          candidates = None
        targetData = Data.get_random_sample(
                                self.algParams['birth']['maxTargetSize'],
                                randstate=self.PRNG, candidates=candidates)
      else:
        targetData = Data.get_random_sample(
                                self.algParams['birth']['maxTargetObs'],
                                randstate=self.PRNG)

      # A batch-wide sample has no specific target component
      BirthPlans = [dict(Data=targetData, ktarget=-1)]

    nMoves = len(BirthPlans)
    msgFormat = "%d/%d %s" if Data is None else "%d/%d BATCH %s"
    BirthResults = list()
    for moveID, Plan in enumerate(BirthPlans):
      # Unpack data for current move
      ktarget = Plan['ktarget']
      targetData = Plan['Data']

      if ktarget is None or targetData is None:
        # Planning or data collection failed; report the recorded
        # reason, or a generic one if the plan carries none.
        msg = Plan.get('msg', "BIRTH skipped. No target data.")

      elif targetData.nObs < self.algParams['birth']['minTargetObs']:
        # Verify targetData large enough that birth would be productive
        msg = "BIRTH skipped. Target data too small (size %d)"
        msg = msg % (targetData.nObs)
      elif hasattr(targetData, 'nDoc') \
           and targetData.nDoc < self.algParams['birth']['minTargetSize']:
        msg = "BIRTH skipped. Target data too small (size %d)"
        msg = msg % (targetData.nDoc)

      else:
        hmodel, SS, MoveInfo = BirthMove.run_birth_move(
                 hmodel, targetData, SS, randstate=self.PRNG,
                 ktarget=ktarget, **self.algParams['birth'])
        msg = MoveInfo['msg']
        if MoveInfo['didAddNew']:
          BirthResults.append(MoveInfo)

          # Mark brand-new comps with -1
          # NOTE(review): presumably so a later +1 increment
          # brings the counter to 0 -- confirm against caller
          for kk in MoveInfo['birthCompIDs']:
            self.LapsSinceLastBirth[kk] = -1

      self.print_msg(msgFormat % (moveID + 1, nMoves, msg))

    return hmodel, SS, BirthResults