    def runPolicyIteration(self):
        # initialize
        self.initialize()

        iterCount = 0
        while True:
            # policy evaluation: apply Bellman expectation backups under the current policy until the values converge
            oldValues = util.Counter()
            while True:
                # pre-store the state values of last iteration
                for state, value in self.values.items():
                    oldValues[state] = value

                delta = 0.0
                for state in self.mdp.getStates():
                    oldValue = self.values[state]
                    action = self.policy[state]
                    successors = self.mdp.getTransitionStatesAndProbs(
                        state, action)
                    if len(successors) == 0:  # for terminal state if any
                        continue
                    saValue = 0
                    for nextState, prob in successors:
                        saValue += prob * (
                            self.mdp.getReward(state, action, nextState) +
                            self.gamma * oldValues[nextState])
                    self.values[state] = saValue
                    delta = max(delta, abs(self.values[state] - oldValue))
                if delta <= self.theta:
                    break

            # policy improvement: extract the greedy policy from the freshly evaluated values
            policyStable = True
            for state in self.mdp.getStates():
                oldAction = self.policy[state]
                actions = self.mdp.getPossibleActions(state)
                if len(actions) == 0:  # for terminal state, if any
                    self.policy[state] = None
                    continue
                actionValues = util.Counter()
                for action in actions:
                    successors = self.mdp.getTransitionStatesAndProbs(
                        state, action)
                    saValue = 0
                    for nextState, prob in successors:
                        saValue += prob * (
                            self.mdp.getReward(state, action, nextState) +
                            self.gamma * self.values[nextState])
                    actionValues[action] = saValue
                bestAction = actionValues.argMax()
                self.policy[state] = bestAction
                if oldAction != self.policy[state]:
                    policyStable = False

            # if policy stable, converge!
            iterCount += 1
            print("Iteration: ", iterCount)
            if policyStable or iterCount >= self.iterations:
                break
    def runValueIteration(self):
        iterCount = 0
        oldValues = util.Counter()
        while True:
            # pre-store the state values of last iteration
            for state,value in self.values.items():
                oldValues[state] = value

            delta = 0.0
            for state in self.mdp.getStates():
                actions = self.mdp.getPossibleActions(state)
                if len(actions) == 0:
                    continue

                actionValues = util.Counter()
                for action in actions:
                    successors = self.mdp.getTransitionStatesAndProbs(state, action)
                    saValue = 0
                    for nextState,prob in successors:
                        saValue += prob * (self.mdp.getReward(state, action, nextState) +
                                           self.gamma * oldValues[nextState])
                    actionValues[action] = saValue
                maxValue = actionValues[actionValues.argMax()]
                self.values[state] = maxValue
                delta = max(delta, abs(self.values[state] - oldValues[state]))

            iterCount += 1
            print("Iteration: ", iterCount)
            if iterCount >= self.iterations or delta <= self.theta:
                break
        print("Value Iteration Converged!")
        print("delta: ", delta)

    def __init__(self, mdp, gamma=1.0, iterations=500, theta=0.01):
        self.mdp = mdp  # markov decision process to be solved
        self.gamma = gamma  # discount factor
        self.iterations = iterations  # max number of iterations
        self.theta = theta  # convergence threshold on the per-sweep value change
        self.values = util.Counter()  # state values V(s)
        self.policy = util.Counter()  # policy: state -> action
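
runValueIteration above applies the Bellman optimality backup V(s) <- max_a sum_s' P(s'|s,a) [R(s,a,s') + gamma * V(s')] directly, while runPolicyIteration alternates evaluation under the current policy (the same sum without the max) with greedy improvement. A self-contained sketch of one such sweep on a hypothetical two-state MDP exposing the same interface; the TwoStateMDP class and valueIterationSweep helper are illustrative, not part of the snippet above:

class TwoStateMDP:
    """Toy MDP with the same interface the solvers above expect."""
    def getStates(self):
        return ['A', 'B']
    def getPossibleActions(self, state):
        return [] if state == 'B' else ['go', 'stay']  # 'B' is terminal
    def getTransitionStatesAndProbs(self, state, action):
        if action == 'go':
            return [('B', 1.0)]
        return [('A', 0.9), ('B', 0.1)]
    def getReward(self, state, action, nextState):
        return 10.0 if nextState == 'B' else 0.0

def valueIterationSweep(mdp, values, gamma=0.9):
    """One synchronous sweep of V(s) <- max_a sum_s' P(s'|s,a)[R(s,a,s') + gamma*V(s')]."""
    newValues = dict(values)
    for state in mdp.getStates():
        actions = mdp.getPossibleActions(state)
        if not actions:  # terminal state keeps its value
            continue
        newValues[state] = max(
            sum(prob * (mdp.getReward(state, action, nextState) +
                        gamma * values.get(nextState, 0.0))
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action))
            for action in actions)
    return newValues

values = {}
for _ in range(50):
    values = valueIterationSweep(TwoStateMDP(), values)
print(values)  # V('A') approaches 10.0; V('B') stays 0.0 (terminal, never updated)
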
Example #4
    def getPolicy(self, state, legalActions):
        possibleStateQValues = util.Counter()

        for action in legalActions:
            possibleStateQValues[action] = self.getQValue(state, action)

        return possibleStateQValues.argMax()
    def computeActionFromValues(self, state):
        actions = self.mdp.getPossibleActions(state)
        if len(actions) == 0:   # terminal state
            return None

        actionValues = util.Counter()
        for action in actions:
            successors = self.mdp.getTransitionStatesAndProbs(state, action)
            saValue = 0.0
            for nextState, prob in successors:
                saValue += prob * (self.mdp.getReward(state, action, nextState) +
                                   self.gamma * self.values[nextState])
            actionValues[action] = saValue
        #maxValue = actionValues[actionValues.argMax()]
        #bestActions = [action for action,value in actionValues.items() if value == maxValue]
        #return random.choice(bestActions)   # break ties if any
        bestAction = actionValues.argMax()  # don't care about ties
        return bestAction
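
The commented-out lines above sketch the alternative of breaking ties between equally good actions at random, whereas util.Counter.argMax simply returns the first maximizer it encounters. A standalone version of that tie-breaking variant (the helper name is illustrative):

import random

def argMaxWithRandomTieBreak(actionValues):
    """Return one of the maximizing actions, chosen uniformly at random."""
    maxValue = max(actionValues.values())
    bestActions = [a for a, v in actionValues.items() if v == maxValue]
    return random.choice(bestActions)

# argMaxWithRandomTieBreak({'north': 1.0, 'south': 1.0, 'east': 0.5})
# returns 'north' or 'south' with equal probability.
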
Example #6
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)

        "*** YOUR CODE HERE ***"
        self.qvals = util.Counter()
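
The Counter created here typically stores Q-values keyed by (state, action) pairs. A minimal sketch of the tabular Q-learning update such an agent would perform; the update signature and the self.alpha / self.discount attributes (learning rate and discount factor from ReinforcementAgent) are assumptions for illustration, not shown in the snippet:

    def update(self, state, action, nextState, reward):
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        nextQValues = [self.getQValue(nextState, a) for a in self.getLegalActions(nextState)]
        sample = reward + self.discount * (max(nextQValues) if nextQValues else 0.0)
        self.qvals[(state, action)] += self.alpha * (sample - self.qvals[(state, action)])
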
Example #7
    def getValue(self, state):
        possibleStateQValues = util.Counter()
        for action in self.getLegalActions(state):
            possibleStateQValues[action] = self.getQValue(state, action)

        return possibleStateQValues[possibleStateQValues.argMax()]
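
getValue above returns the maximum Q-value over the legal actions; when there are none (a terminal state), the empty Counter makes argMax return None and the lookup fall back to 0.0, the conventional value of a terminal state. The getQValue it relies on is, for the tabular agent initialized in Example #6, typically just a Counter lookup; a minimal sketch (assumed, not shown in the snippet):

    def getQValue(self, state, action):
        """Tabular Q-value lookup; unseen (state, action) pairs default to 0.0."""
        return self.qvals[(state, action)]
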
Example #8
    def __init__(self, config):
        super(SarsaApproxAgent, self).__init__(config)
        self.featureExtractor = getattr(featureExtractor, config['featureExtractor'])
        self.weights = util.Counter()  # weights of the linear Q-value approximation
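
With a feature extractor and a weight Counter, Q-values are represented as a linear combination of features, Q(s,a) = sum_i w_i * f_i(s,a). A minimal sketch of the approximate SARSA update such an agent would perform; it assumes self.featureExtractor(state, action) returns a dict of feature values and that self.alpha and self.gamma are set elsewhere in the class (none of which is shown in the snippet):

    def getQValue(self, state, action):
        # Q(s, a) = sum_i w_i * f_i(s, a)
        features = self.featureExtractor(state, action)
        return sum(self.weights[f] * value for f, value in features.items())

    def update(self, state, action, reward, nextState, nextAction):
        # SARSA target uses the action actually taken next, not the greedy one.
        target = reward + self.gamma * self.getQValue(nextState, nextAction)
        difference = target - self.getQValue(state, action)
        for f, value in self.featureExtractor(state, action).items():
            self.weights[f] += self.alpha * difference * value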