def runPolicyIteration(self):
    # initialize
    self.initialize()
    iterCount = 0
    while True:
        # policy evaluation: sweep Bellman backups under the current policy
        oldValues = util.Counter()
        while True:
            # pre-store the state values of the last sweep
            for state, value in self.values.items():
                oldValues[state] = value
            delta = 0.0
            for state in self.mdp.getStates():
                oldValue = self.values[state]
                action = self.policy[state]
                successors = self.mdp.getTransitionStatesAndProbs(state, action)
                if len(successors) == 0:
                    # terminal state, if any
                    continue
                saValue = 0
                for nextState, prob in successors:
                    saValue += prob * (self.mdp.getReward(state, action, nextState)
                                       + self.gamma * oldValues[nextState])
                self.values[state] = saValue
                delta = max(delta, abs(self.values[state] - oldValue))
            if delta <= self.theta:
                break

        # policy improvement: act greedily with respect to the evaluated values
        policyStable = True
        for state in self.mdp.getStates():
            oldAction = self.policy[state]
            actions = self.mdp.getPossibleActions(state)
            if len(actions) == 0:
                # terminal state, if any
                self.policy[state] = None
                continue
            actionValues = util.Counter()
            for action in actions:
                successors = self.mdp.getTransitionStatesAndProbs(state, action)
                saValue = 0
                for nextState, prob in successors:
                    saValue += prob * (self.mdp.getReward(state, action, nextState)
                                       + self.gamma * self.values[nextState])
                actionValues[action] = saValue
            bestAction = actionValues.argMax()
            self.policy[state] = bestAction
            if oldAction != self.policy[state]:
                policyStable = False

        # if the policy is stable, we have converged
        iterCount += 1
        print("Iteration: ", iterCount)
        if policyStable or iterCount >= self.iterations:
            break
def runValueIteration(self):
    iterCount = 0
    oldValues = util.Counter()
    while True:
        # pre-store the state values of the last sweep
        for state, value in self.values.items():
            oldValues[state] = value
        delta = 0.0
        for state in self.mdp.getStates():
            actions = self.mdp.getPossibleActions(state)
            if len(actions) == 0:
                # terminal state, if any
                continue
            actionValues = util.Counter()
            for action in actions:
                successors = self.mdp.getTransitionStatesAndProbs(state, action)
                saValue = 0
                for nextState, prob in successors:
                    saValue += prob * (self.mdp.getReward(state, action, nextState)
                                       + self.gamma * oldValues[nextState])
                actionValues[action] = saValue
            maxValue = actionValues[actionValues.argMax()]
            self.values[state] = maxValue
            delta = max(delta, abs(self.values[state] - oldValues[state]))
        iterCount += 1
        print("Iteration: ", iterCount)
        if iterCount >= self.iterations or delta <= self.theta:
            break
    print("Value Iteration Converged!")
    print("delta: ", delta)
def __init__(self, mdp, gamma=1.0, iterations=500, theta=0.01):
    self.mdp = mdp                    # Markov decision process to be solved
    self.gamma = gamma                # discount factor
    self.iterations = iterations      # max iterations
    self.theta = theta                # small convergence threshold
    self.values = util.Counter()      # state values
    self.policy = util.Counter()      # policy
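# A minimal, self-contained sketch (not part of the original file) of the same
# Bellman backup that runValueIteration performs, run on a hypothetical two-state
# MDP written with plain dicts so it can be executed on its own. All names here
# (states, transitions, rewards) are made up purely for illustration.
def valueIterationSketch(gamma=0.9, theta=0.01, iterations=500):
    states = ['A', 'B']
    actions = {'A': ['stay', 'go'], 'B': []}              # 'B' is terminal
    # transitions[state][action] = list of (nextState, probability)
    transitions = {'A': {'stay': [('A', 1.0)], 'go': [('B', 1.0)]}}
    # rewards[(state, action, nextState)]
    rewards = {('A', 'stay', 'A'): 0.0, ('A', 'go', 'B'): 1.0}

    values = {s: 0.0 for s in states}
    for _ in range(iterations):
        old = dict(values)                                 # values from the previous sweep
        delta = 0.0
        for s in states:
            if not actions[s]:
                continue                                   # skip terminal states
            best = max(
                sum(p * (rewards[(s, a, ns)] + gamma * old[ns])
                    for ns, p in transitions[s][a])
                for a in actions[s])
            delta = max(delta, abs(best - old[s]))
            values[s] = best
        if delta <= theta:
            break
    return values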
def getPolicy(self, state, legalActions):
    possibleStateQValues = util.Counter()
    for action in legalActions:
        possibleStateQValues[action] = self.getQValue(state, action)
    return possibleStateQValues.argMax()
def computeActionFromValues(self, state):
    actions = self.mdp.getPossibleActions(state)
    if len(actions) == 0:
        # terminal state
        return None
    actionValues = util.Counter()
    for action in actions:
        successors = self.mdp.getTransitionStatesAndProbs(state, action)
        saValue = 0.0
        for nextState, prob in successors:
            saValue += prob * (self.mdp.getReward(state, action, nextState)
                               + self.gamma * self.values[nextState])
        actionValues[action] = saValue
    # ties, if any, could be broken at random; argMax simply takes the first maximum
    return actionValues.argMax()
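# Hedged variant (not part of the original file): break ties uniformly at random
# among equally-valued actions instead of taking util.Counter.argMax, as the
# comment above suggests. `actionValues` is assumed to be any dict-like mapping
# of action -> expected value, such as the Counter built in computeActionFromValues.
import random

def chooseActionBreakTies(actionValues):
    maxValue = max(actionValues.values())
    bestActions = [a for a, v in actionValues.items() if v == maxValue]
    return random.choice(bestActions)        # uniform choice among the best actions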
def __init__(self, **args):
    "You can initialize Q-values here..."
    ReinforcementAgent.__init__(self, **args)

    "*** YOUR CODE HERE ***"
    self.qvals = util.Counter()
def getValue(self, state):
    possibleStateQValues = util.Counter()
    for action in self.getLegalActions(state):
        possibleStateQValues[action] = self.getQValue(state, action)
    # for a state with no legal actions this falls back to the Counter default of 0.0
    return possibleStateQValues[possibleStateQValues.argMax()]
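# Hedged sketch (not part of the original file): the epsilon-greedy action
# selection that commonly accompanies getPolicy/getValue in a Q-learning agent.
# It is written as a standalone helper so it assumes nothing beyond a callable
# `getPolicy(state, legalActions)` like the one defined above; `epsilon` is the
# exploration probability.
import random

def epsilonGreedyAction(state, legalActions, getPolicy, epsilon):
    if not legalActions:
        return None                          # terminal state: no action
    if random.random() < epsilon:
        return random.choice(legalActions)   # explore: random legal action
    return getPolicy(state, legalActions)    # exploit: greedy action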
def __init__(self, config):
    super(SarsaApproxAgent, self).__init__(config)
    self.featureExtractor = getattr(featureExtractor, config['featureExtractor'])
    self.weights = util.Counter()
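# Hedged sketch (not part of the original file): how the weights initialized above
# are typically combined with extracted features in an approximate SARSA agent,
# i.e. Q(s, a) = sum over features f of w[f] * f(s, a). The feature-extractor
# interface (returning a dict of feature name -> feature value) is an assumption,
# not taken from this file.
def approximateQValue(weights, features):
    # dot product of the weight vector and the feature vector; missing weights
    # default to 0.0 so newly seen features contribute nothing initially
    return sum(value * weights.get(f, 0.0) for f, value in features.items())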