import json
import os
import random
from collections import defaultdict

import gym

# FlappyBirdAgent (the base agent class) and FlappyBirdNormal (a wrapper that
# discretizes the game state) are project-local modules and must be imported
# from wherever the repository defines them.


class QLearningAgent(FlappyBirdAgent):
    ''' Q-Learning Agent. '''
    def __init__(self, actions, probFlap=0.5, rounding=None):
        '''
        Initializes the agent.
        
        Args:
            actions (list): Possible action values.
            probFlap (float): The probability of flapping when choosing
                              the next action randomly.
            rounding (int): The level of discretization.
        '''
        super().__init__(actions)
        self.probFlap = probFlap
        self.qValues = defaultdict(float)
        # Act greedily by default until train() or test() sets epsilon.
        self.epsilon = 0.0
        self.env = FlappyBirdNormal(gym.make('FlappyBird-v0'),
                                    rounding=rounding)

    def act(self, state):
        '''
        Returns the next action for the current state.
        
        Args:
            state (str): The current state.
            
        Returns:
            int: 0 or 1.
        '''
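        # Epsilon-greedy selection: with probability epsilon take a random,
        # flap-biased action; otherwise act greedily on the learned Q-values,
        # breaking ties randomly.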
        def randomAct():
            if random.random() < self.probFlap:
                return 0
            return 1

        if random.random() < self.epsilon:
            return randomAct()

        qValues = [
            self.qValues.get((state, action), 0) for action in self.actions
        ]

        if qValues[0] < qValues[1]:
            return 1
        elif qValues[0] > qValues[1]:
            return 0
        else:
            return randomAct()

    def saveQValues(self):
        ''' Saves the Q-values. '''
        toSave = {
            key[0] + ' action ' + str(key[1]): self.qValues[key]
            for key in self.qValues
        }
        with open('qValues.json', 'w') as fp:
            json.dump(toSave, fp)

    def loadQValues(self):
        ''' Loads the Q-values. '''
        def parseKey(key):
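            # Saved keys have the form '<state> action <digit>', so strip the
            # trailing 9 characters (' action N') to recover the state.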
            state = key[:-9]
            action = int(key[-1])
            return (state, action)

        with open('qValues.json') as fp:
            toLoad = json.load(fp)
            self.qValues = {parseKey(key): toLoad[key] for key in toLoad}

    def train(self,
              order='forward',
              numIters=20000,
              epsilon=0.1,
              discount=1,
              eta=0.9,
              epsilonDecay=False,
              etaDecay=False,
              evalPerIters=250,
              numItersEval=1000):
        '''
        Trains the agent.
        
        Args:
            order (str): The order of updates, 'forward' or 'backward'.
            numIters (int): The number of training iterations.
            epsilon (float): The exploration rate for epsilon-greedy
                             action selection.
            discount (float): The discount factor.
            eta (float): The learning rate.
            epsilonDecay (bool): Whether to use epsilon decay.
            etaDecay (bool): Whether to use eta decay.
            evalPerIters (int): The number of iterations between two evaluation calls.
            numItersEval (int): The number of evaluation iterations.
        '''
        self.epsilon = epsilon
        self.initialEpsilon = epsilon
        self.discount = discount
        self.eta = eta
        self.epsilonDecay = epsilonDecay
        self.etaDecay = etaDecay
        self.evalPerIters = evalPerIters
        self.numItersEval = numItersEval
        self.env.seed(random.randint(0, 100))

        done = False
        maxScore = 0
        maxReward = 0
        for i in range(numIters):
            if i % 50 == 0 or i == numIters - 1:
                print("Iter: ", i)

            self.epsilon = self.initialEpsilon / (i + 1) if self.epsilonDecay \
                           else self.initialEpsilon
            score = 0
            totalReward = 0
            ob = self.env.reset()
            gameIter = []
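            # Roll out one episode, collecting (state, action, reward,
            # nextState) transitions for the post-episode Q-updates.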
            state = self.env.getGameState()

            while True:
                action = self.act(state)
                nextState, reward, done, _ = self.env.step(action)
                gameIter.append((state, action, reward, nextState))
                state = nextState
                # self.env.render()  # Uncomment to display graphics.
                totalReward += reward
                if reward >= 1:
                    score += 1
                if done:
                    break

            maxScore = max(maxScore, score)
            maxReward = max(maxReward, totalReward)

            # 'backward' replays the episode from terminal state to start so
            # the end-of-episode reward propagates faster through the Q-table.
            if order == 'forward':
                for (state, action, reward, nextState) in gameIter:
                    self.updateQ(state, action, reward, nextState)
            else:
                for (state, action, reward, nextState) in gameIter[::-1]:
                    self.updateQ(state, action, reward, nextState)

            if self.etaDecay:
                self.eta *= (i + 1) / (i + 2)

            if (i + 1) % self.evalPerIters == 0:
                # Render a short evaluation at a few fixed checkpoints;
                # otherwise run the full evaluation without graphics.
                if (i + 1) in (2500, 5000):
                    output = self.test(numIters=60)
                elif i + 1 == 10000:
                    output = self.test(numIters=self.numItersEval)
                else:
                    output = self.test(numIters=self.numItersEval,
                                       render=False)
                self.saveOutput(output, i + 1)
                self.saveQValues()

        self.env.close()
        print("Max Score Train: ", maxScore)
        print("Max Reward Train: ", maxReward)
        print()

    def test(self, numIters=20000, render=True):
        '''
        Evaluates the agent.
        
        Args:
            numIters (int): The number of evaluation iterations.
            render (bool): Whether to display graphics during evaluation.
        
        Returns:
            dict: A set of scores.
        '''
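        # Evaluate greedily (epsilon = 0) with a fixed seed so that runs are
        # comparable across checkpoints.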
        self.epsilon = 0
        self.env.seed(0)

        done = False
        maxScore = 0
        maxReward = 0
        output = defaultdict(int)

        for i in range(numIters):
            score = 0
            totalReward = 0
            ob = self.env.reset()
            state = self.env.getGameState()

            while True:
                action = self.act(state)
                state, reward, done, _ = self.env.step(action)
                if render:
                    self.env.render()  # Display the game while evaluating.
                totalReward += reward
                if reward >= 1:
                    score += 1
                if done:
                    break

            output[score] += 1
            maxScore = max(maxScore, score)
            maxReward = max(maxReward, totalReward)

        self.env.close()
        print("Max Score Test: ", maxScore)
        print("Max Reward Test: ", maxReward)
        print()
        return output

    def updateQ(self, state, action, reward, nextState):
        '''
        Updates the Q-values based on an observation.
        
        Args:
            state, nextState (str): The current and successor states.
            action (int): 0 or 1.
            reward (float): The observed reward.
        '''
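        # Tabular Q-learning update:
        #   Q(s, a) <- (1 - eta) * Q(s, a)
        #              + eta * (reward + discount * max_a' Q(s', a'))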
        nextQValues = [
            self.qValues.get((nextState, nextAction), 0)
            for nextAction in self.actions
        ]
        nextValue = max(nextQValues)
        self.qValues[(state, action)] = (1 - self.eta) * self.qValues.get((state, action), 0) \
                                        + self.eta * (reward + self.discount * nextValue)

    def saveOutput(self, output, iteration):
        '''
        Saves the scores.
        
        Args:
            output (dict): A set of scores.
            iteration (int): The current training iteration.
        '''
        if not os.path.isdir('scores'):
            os.mkdir('scores')
        with open('./scores/scores_{}.json'.format(iteration), 'w') as fp:
            json.dump(output, fp)
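
# A minimal usage sketch for QLearningAgent (illustrative, untuned values;
# assumes the 'FlappyBird-v0' environment is registered and the project
# wrappers are importable):
#
#   agent = QLearningAgent(actions=[0, 1], probFlap=0.1, rounding=10)
#   agent.train(order='backward', numIters=10000, epsilon=0.1, discount=1,
#               eta=0.9, evalPerIters=250, numItersEval=1000)
#   scores = agent.test(numIters=1000, render=False)
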
class BaselineAgent(FlappyBirdAgent):
    ''' Baseline Agent with a random policy. '''
    def __init__(self, actions, probFlap=0.5):
        '''
        Initializes the agent.
        
        Args:
            actions (list): Possible action values.
            probFlap (float): The probability of flapping when choosing
                              the next action randomly.
        '''
        super().__init__(actions)
        self.probFlap = probFlap
        self.env = FlappyBirdNormal(gym.make('FlappyBird-v0'))

    def act(self, state):
        '''
        Returns the next action for the current state.
        
        Args:
            state (list): The current state.
            
        Returns:
            int: 0 or 1.
        '''
        if random.random() < self.probFlap:
            return 0
        return 1

    def train(self, numIters=20000, evalPerIters=250, numItersEval=1000):
        '''
        Trains the agent.
        
        Args:
            numIters (int): The number of training iterations.
            evalPerIters (int): The number of iterations between two evaluation calls.
            numItersEval (int): The number of evaluation iterations.
        '''
        print("No training needed!")

        self.evalPerIters = evalPerIters
        self.numItersEval = numItersEval
        for i in range(numIters):
            if i % 50 == 0 or i == numIters - 1:
                print("Iter: ", i)

            if (i + 1) % self.evalPerIters == 0:
                output = self.test(numIters=self.numItersEval)
                self.saveOutput(output, i + 1)

    def test(self, numIters=2000):
        '''
        Evaluates the agent.
        
        Args:
            numIters (int): The number of evaluation iterations.
        
        Returns:
            dict: A set of scores.
        '''
        self.env.seed(0)

        done = False
        maxScore = 0
        maxReward = 0
        output = defaultdict(int)

        for i in range(numIters):
            score = 0
            totalReward = 0
            ob = self.env.reset()
            state = self.env.getGameState()

            while True:
                action = self.act(state)
                state, reward, done, _ = self.env.step(action)
                # self.env.render()  # Uncomment to display graphics.
                totalReward += reward
                if reward >= 1:
                    score += 1
                if done:
                    break

            output[score] += 1
            maxScore = max(maxScore, score)
            maxReward = max(maxReward, totalReward)

        self.env.close()
        print("Max Score Test: ", maxScore)
        print("Max Reward Test: ", maxReward)
        print()
        return output

    def saveOutput(self, output, iteration):
        '''
        Saves the scores.
        
        Args:
            output (dict): A set of scores.
            iteration (int): The current training iteration.
        '''
        if not os.path.isdir('scores'):
            os.mkdir('scores')
        with open('./scores/scores_{}.json'.format(iteration), 'w') as fp:
            json.dump(output, fp)
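
# A minimal sketch of running the random baseline for comparison (same
# environment assumptions as the QLearningAgent sketch above):
if __name__ == '__main__':
    baseline = BaselineAgent(actions=[0, 1], probFlap=0.1)
    baselineScores = baseline.test(numIters=100)
    print("Baseline score distribution:", dict(baselineScores))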