# Exemplo n.º 1
class nStepPerDecisionTDPrediction(nStepTDPredictionAgent):
    """n-step TD prediction with per-decision importance sampling.

    Off-policy state-value prediction: returns generated under a behaviour
    policy are corrected step by step with per-decision importance-sampling
    ratios against the agent's own (target) policy.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 valueInit="zeros",
                 policyUpdateMethod="greedy",
                 epsilon=0.0,
                 tieBreakingMethod="consistent"):
        # Base class sets up the value table and the n-step experience buffer.
        super().__init__(nStates, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step Per-Decision TD Prediction"
        self.nActions = nActions
        # Target policy whose state values are being estimated.
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        """Update V for every buffered time step in [tau_start, tau_stop).

        For each update step, the per-decision importance-sampled return is
        accumulated backwards from the horizon `min(T + 1, t + 1)` down to
        that step, then applied as a TD update to the value table.
        """
        # Horizon index does not depend on the update step, so hoist it.
        horizon = min(T + 1, t + 1)
        for update_idx in range(tau_start, tau_stop):
            # Bootstrap from the current value estimate of the horizon state.
            G = self.valueTable[self.bufferExperience[horizon]['state']]
            for step in range(horizon - 1, update_idx - 1, -1):
                s = self.bufferExperience[step]['state']
                a = self.bufferExperience[step]['action']
                r = self.bufferExperience[step + 1]['reward']
                # Per-decision importance-sampling ratio pi(a|s) / b(a|s).
                rho = (self.policy.getProbability(s, a) /
                       behaviour_policy.getProbability(s, a))
                # Blend the corrected return with the current estimate; when
                # rho == 0 this falls back entirely on V(s) (control variate).
                G = rho * (r + self.gamma * G) + (1.0 - rho) * self.valueTable[s]
            s_tau = self.bufferExperience[update_idx]['state']
            self.valueTable[s_tau] = self.valueTable[s_tau] + self.alpha * (
                G - self.valueTable[s_tau])

    def reset(self):
        """Reset the inherited state (value table, buffer) and the policy."""
        super().reset()
        self.policy.reset()
# Exemplo n.º 2
class MCControlAgent:
  """Generic tabular Monte Carlo control agent.

  Maintains a state-action value table Q[s, a] and an epsilon-soft/greedy
  stochastic policy derived from it.
  """

  def __init__(self, nStates, nActions, gamma, policyUpdateMethod="greedy", epsilon=0.0, tieBreakingMethod="arbitrary"):
    self.name = "Generic Monte Carlo Control Agent"
    self.nStates = nStates
    self.nActions = nActions
    self.gamma = gamma
    # Tabular Q estimates: one row per state, one column per action.
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy = StochasticPolicy(self.nStates, self.nActions, policyUpdateMethod=policyUpdateMethod,
      epsilon=epsilon, tieBreakingMethod=tieBreakingMethod)

  def selectAction(self, state, actionsAvailable=None):
    """Sample an action from the current policy (optionally restricted)."""
    return self.policy.sampleAction(state, actionsAvailable)

  def getGreedyAction(self, state, actionsAvailable=None):
    """Return the greedy action w.r.t. Q, restricted to actionsAvailable if given."""
    if actionsAvailable is None:
      actionValues = self.actionValueTable[state, :]
      actionList = np.array(range(self.nActions))
    else:
      actionValues = self.actionValueTable[state, actionsAvailable]
      actionList = np.array(actionsAvailable)
    actionIdx = selectAction_greedy(actionValues)
    return actionList[actionIdx]

  def getValue(self, state):
    """Return V(state) = sum_a pi(a|state) * Q(state, a)."""
    return np.dot(self.policy.getProbability(state), self.actionValueTable[state, :])

  def getActionValue(self, state, action):
    """Return the tabular estimate Q(state, action)."""
    return self.actionValueTable[state, action]

  def getName(self):
    """Return the human-readable agent name."""
    return self.name

  def reset(self):
    """Zero the action-value table and reset the policy.

    BUGFIX: the original used ``dtype=np.float``, an alias deprecated in
    NumPy 1.20 and removed in 1.24, which made reset() raise AttributeError
    on modern NumPy; the builtin ``float`` matches __init__.
    """
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy.reset()