Example #1

# Standard-library imports used by the agent below; GridWorld is assumed to be
# provided by a project-local module (the module name here is a guess).
import random
from collections import defaultdict
from queue import PriorityQueue

from gridworld import GridWorld
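
# The agent relies on a project-specific Counter helper (this is not
# collections.Counter). Below is a minimal sketch of the assumed interface,
# inferred from how it is called in GridAgent; the real class ships with the
# project and may differ.
class Counter:
    """Maps keys (actions) to numeric values (Q-values)."""

    def __init__(self):
        self._values = {}

    def keyValue(self, key, value):
        # store/overwrite the value for key
        self._values[key] = value

    def at(self, key):
        # value stored for key (0 if unseen)
        return self._values.get(key, 0)

    def max(self):
        # largest stored value (0 if empty)
        return max(self._values.values()) if self._values else 0

    def argmax(self):
        # (key, value) pair with the largest value
        if not self._values:
            return None, 0
        key = max(self._values, key=self._values.get)
        return key, self._values[key]

    def getKeys(self):
        return self._values.keys()

    def getValues(self):
        return self._values.values()

    def getDict(self):
        return dict(self._values)
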
class GridAgent:
    def __init__(self,
                 epsilon=0.01,
                 greedy=False,
                 alpha=0.1,
                 gamma=0.95,
                 visual=True,
                 goal=(10, 8),
                 agentPose=(1, 1, 'up'),
                 showTrial=True,
                 randomReset=False,
                 epsilonStrat=1,
                 epsilonFactor=500):
        """
        gridWorld: GridWorld object
        epsilon: value used for epsilon greedy search
        alpha: step size
        gamma: discount favtor
        """
        self.actionValues = Counter()
        self.epsilonFactor = epsilonFactor
        self.randomReset = randomReset
        self.epsilon = epsilon
        self.greedy = greedy
        self.epsilonStrat = epsilonStrat
        self.goal = goal
        self.Q = dict()
        self.gridWorld = GridWorld(goal,
                                   agentPose,
                                   visual=visual,
                                   showTrial=showTrial,
                                   randomReset=randomReset)
        self.actions = self.gridWorld.getActions()
        self.Model = dict()
        self.alpha = alpha
        self.PriorityQueue = PriorityQueue()
        self.gamma = gamma
        self.exp = []
        self.rewards = dict()
        self.rewardNums = dict()
        self.predecessors = defaultdict(set)
        self.initQValues()

    def initQValues(self):
        """
        Output: initializes both model and Q values for all states provided by GridWorld
        """
        states = self.gridWorld.getStateSpace()
        for s in states:
            self.Q[s] = Counter()
            self.rewards[s] = 0
            self.rewardNums[s] = 0
            for a in self.actions:
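                # deterministic tabular model: (state, action) -> (reward, next state);
                # actions that would leave the grid keep the agent in place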
                news = self.gridWorld.getPoseFromAction(s, a)
                if self.gridWorld.isOutsideBounds(news): news = s
                self.Model[(s, a)] = (0, news)
                self.Q[s].keyValue(a, 0)

    def printQValues(self, pose):
        print('actionValues: ', self.Q[pose].getDict())

    def getBestAction(self, pose):
        """
        Pose: (x,y,orientation) tuple representing a pose in grid world
        returns: best action to take dependent on the current Q values
        """
        return self.Q[pose].argmax()

    def getAction(self, pose):
        """
        pose: (x,y,pose) tuple representing 
        returns: action selected from epsilon greedy strategy
        """
        # epsilon schedule: 1 = constant, 2 = decay with total steps, 3 = linear decay
        if self.epsilonStrat == 1:
            pass  # keep epsilon constant
        elif self.epsilonStrat == 2:
            self.epsilon = self.epsilonFactor / (
                self.gridWorld.getTotalSteps() + 1)
        elif self.epsilonStrat == 3:
            if self.epsilon > .0001:
                self.epsilon -= .0001
        if self.greedy:
            action, _ = self.getBestAction(pose)
            return action
        else:
            # exploit with probability 1 - epsilon, otherwise explore uniformly
            p = random.uniform(0, 1)
            if p > self.epsilon:
                action, _ = self.getBestAction(pose)
            else:
                action = random.choice(list(self.Q[pose].getKeys()))
            return action

    def getRandomSFromExp(self):
        """
        returns: a random previously observed state
        """
        s, a = random.choice(self.exp)
        return s

    def getRandomAFromRandomS(self, S):
        """
        S: pose (x,y,orientation) tuple representing a pose in grid world
        returns: an action which was previously taken from state S
        """
        aList = []
        for s, a in self.exp:
            if S == s:
                aList += [a]
        return random.choice(aList)

    def getSA(self):
        """
        returns a random state with a random action. Both must have previously happened 
        """
        S = self.getRandomSFromExp()
        A = self.getRandomAFromRandomS(S)
        return S, A

    def updateExp(self, S, A, maxElements):
        """
        args:
            S - tuple of (x,y,orientation)
            A - action
            maxElements - integer, max number of elements to store in experience
        output:
            updates experience; if the number of stored elements exceeds
            maxElements, a random element is removed before appending.
        """
        if len(self.exp) > maxElements:
            randomInt = random.randint(0, len(self.exp) - 1)
            removedElement = self.exp.pop(randomInt)
        self.exp.append((S, A))

    def tabDynaQ(
            self,
            ndirect,  # number of iterations for direct learning
            nplanning,  # number of iterations for planning
            maxExp,  # maximum number of elements to keep in experience
    ):
        """
        learns the Q values using Dyna-Q planning with tabular (deterministic) model
        """
        for i in range(ndirect):
            S = self.gridWorld.getAgentPose()
            A = self.getAction(S)
            self.updateExp(S, A, maxExp)
            R, Su = self.gridWorld.takeStep(A)
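            # direct RL: one-step Q-learning update from the real transition,
            # Q(S,A) += alpha * (R + gamma * max_a Q(Su,a) - Q(S,A))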
            Q = self.Q

            delta = R + self.gamma * Q[Su].max() - Q[S].at(A)
            self.Q[S].keyValue(A, Q[S].at(A) + self.alpha * delta)
            self.Model[(S, A)] = (R, Su)
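            # planning: replay nplanning simulated transitions drawn from
            # previously observed (state, action) pairs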
            for j in range(nplanning):
                S, A = self.getSA()
                R, Su = self.Model[(S, A)]
                delta = R + self.gamma * Q[Su].max() - Q[S].at(A)
                self.Q[S].keyValue(A, Q[S].at(A) + self.alpha * delta)

    def updateRewards(self, state, R):
        self.rewards[state] += R
        self.rewardNums[state] += 1

    def getRewardAvg(self, state):
        if self.rewardNums[state] == 0: return 0
        else: return self.rewards[state] / self.rewardNums[state]

    def updatePredecessors(self, S, A, Su):
        self.predecessors[Su].add((S, A))

    def PrioritySweep(self, ndirect, nplanning, theta=0.1):
        """
        learns the Q values using prioritized sweeping with a tabular (deterministic) model
        """
        for i in range(ndirect):
            S = self.gridWorld.getAgentPose()
            A = self.getAction(S)
            R, Su = self.gridWorld.takeStep(A)
            self.updatePredecessors(S, A, Su)
            self.updateRewards(Su, R)
            self.Model[(S, A)] = (R, Su)
            Q = self.Q

            P = abs(R + self.gamma * Q[Su].max() - Q[S].at(A))

            if P > theta: self.PriorityQueue.put((-P, (S, A)))
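            # sweep: pop the (S, A) pair with the largest TD error (priorities are
            # negated because PriorityQueue is a min-heap), update it, then queue
            # any predecessor whose own TD error now exceeds theta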
            iterCount = 0
            while iterCount < nplanning and not self.PriorityQueue.empty():

                P, item = self.PriorityQueue.get()

                S, A = item
                R, Su = self.Model[(S, A)]
                delta = R + self.gamma * Q[Su].max() - Q[S].at(A)
                self.Q[S].keyValue(A, Q[S].at(A) + self.alpha * delta)
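                # propagate backwards: updating Q[S] may change the TD error of
                # every (SB, AB) that leads into S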
                predStates = self.gridWorld.getPredFromPose(S)
                for SB, AB in predStates:

                    RB, _ = self.Model[(SB, AB)]
                    P = abs(RB + self.gamma * Q[S].max() - Q[SB].at(AB))
                    if P > theta:
                        self.PriorityQueue.put((-P, (SB, AB)))
                iterCount += 1

    def tabDynaQ_Traj(
            self,
            ndirect,  # number of iterations for direct learning
            ntrajectories,  # number of simulated trajectories per direct step
            nplanning,  # number of planning steps per trajectory
            numMaxExp  # maximum number of elements to keep in experience
    ):
        """
        args:
            ndirect - number of iterations for direct learning
            ntrajectories - number of individual trajectories to simulate per direct step
            nplanning - number of steps to take for each trajectory
            numMaxExp - maximum number of elements to keep in experience
        learns the Q values using Dyna-Q planning with a tabular (deterministic)
        model and trajectory sampling
        """
        for i in range(ndirect):
            S = self.gridWorld.getAgentPose()
            A = self.getAction(S)
            self.updateExp(S, A, numMaxExp)
            R, Su = self.gridWorld.takeStep(A)
            Q = self.Q

            delta = R + self.gamma * Q[Su].max() - Q[S].at(A)
            self.Q[S].keyValue(A, Q[S].at(A) + self.alpha * delta)
            self.Model[(S, A)] = (R, Su)
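            # planning by trajectory sampling: from a remembered start state,
            # follow the greedy policy through the learned model for nplanning steps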
            for t in range(ntrajectories):
                initState = self.getRandomSFromExp()
                for j in range(nplanning):
                    if j == 0: S = initState
                    A, _ = self.getBestAction(S)

                    R, Su = self.Model[(S, A)]
                    delta = R + self.gamma * Q[Su].max() - Q[S].at(A)
                    self.Q[S].keyValue(A, Q[S].at(A) + self.alpha * delta)
                    S = Su

    def learn(self, ndirect, nplanning, theta, learner, ntrajectories,
              maxNumExp):
        """
        learner: 1 = tabular Dyna-Q, 2 = prioritized sweeping,
                 3 = Dyna-Q with trajectory sampling
        """
        if learner == 1:
            self.tabDynaQ(ndirect, nplanning, maxNumExp)
        elif learner == 2:
            self.PrioritySweep(ndirect, nplanning, theta)
        elif learner == 3:
            self.tabDynaQ_Traj(ndirect, ntrajectories, nplanning, maxNumExp)
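
# A minimal usage sketch (illustrative, untuned numbers). It assumes GridWorld
# handles its own rendering/logging when visual/showTrial are enabled.
if __name__ == '__main__':
    agent = GridAgent(epsilon=0.05, visual=False, showTrial=False)
    # learner: 1 = tabular Dyna-Q, 2 = prioritized sweeping, 3 = trajectory sampling
    agent.learn(ndirect=5000, nplanning=10, theta=0.1,
                learner=2, ntrajectories=5, maxNumExp=1000)
    agent.printQValues(agent.gridWorld.getAgentPose())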