Example #1
    def __init__(self):
        self.env = FrozenLakeEnv(map_name='4x4')

        self.states = np.identity(16)
        self.x = tf.placeholder(shape=[1, 16], dtype=tf.float32)

        self.W = tf.Variable(tf.random_uniform([16, 4], 0, 0.1))

        self.Q = tf.matmul(self.x, self.W)
        self.Q_hat = tf.placeholder(shape=[1, 4], dtype=tf.float32)
        self.loss = tf.reduce_sum(tf.square(self.Q_hat - self.Q))
        self.train = tf.train.GradientDescentOptimizer(learning_rate=0.1)\
            .minimize(self.loss)

        self.gamma = 0.9
        self.epsilon = 1
        self.decay_rate = 0.001

        self.num_episodes = 500_000
        self.avg_rewards = []

        init = tf.global_variables_initializer()
        self.session = tf.Session()
        self.session.run(init)
Example #2
    def __init__(self,
                 num_episodes=1000,
                 max_steps=200,
                 learning_rate=0.1,
                 gamma=0.9,
                 epsilon=0.9,
                 decay_rate=0.1,
                 env=FrozenLakeEnv(map_name="10x10")):
        self.env = env
        state_size = self.env.observation_space.n
        actions_num = self.env.action_space.n
        self.q_table = np.zeros((state_size, actions_num))
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay_rate = decay_rate
        self.avg_rewards = []
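
The constructor above only prepares the Q-table and hyperparameters; the train() method that Example #3 later calls via agent.train() is not part of this excerpt. The following is a minimal sketch of what a plain epsilon-greedy tabular Q-learning loop over these attributes could look like; it is illustrative only, and the original method may differ (the epsilon decay schedule here is borrowed from Example #6).

# Illustrative sketch only -- a standard epsilon-greedy tabular Q-learning
# loop over the attributes defined above. It is NOT the original train()
# method, which is not included in this excerpt.
import numpy as np

def train(self):  # intended as a method of the agent class above
    for episode in range(self.num_episodes):
        state = self.env.reset()
        total_reward = 0
        for _ in range(self.max_steps):
            # Explore with probability epsilon, otherwise act greedily.
            if np.random.random() < self.epsilon:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.q_table[state])

            new_state, reward, done, _ = self.env.step(action)

            # Q-learning update:
            # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
            self.q_table[state, action] += self.learning_rate * (
                reward + self.gamma * np.max(self.q_table[new_state])
                - self.q_table[state, action])

            state = new_state
            total_reward += reward
            if done:
                break

        # Decay exploration, borrowing the schedule from Example #6.
        self.epsilon = 0.1 + 0.9 * np.exp(-self.decay_rate * episode)
        # Record the raw episode reward; averaging/plotting is assumed
        # to happen elsewhere.
        self.avg_rewards.append(total_reward)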
Example #3
        done = False
        print('failed ', episode + 1)

        time.sleep(1.5)

        steps = 0
        while not done:
            clear_output(wait=True)
            agent.env.render()
            time.sleep(0.3)

            action = np.argmax(agent.q_table[state])
            state, reward, done, _ = agent.env.step(action)
            steps += 1

        clear_output(wait=True)
        agent.env.render()

        if reward == 1:
            print(f'Congratulations! 🏆 Found the frisbee in {steps} steps.')
            time.sleep(2)
        else:
            print('Sorry, you fell through a 🕳, try again!')
            time.sleep(2)
        clear_output(wait=True)


agent = QFrozenLakeAgent(env=FrozenLakeEnv(map_name="10x10"))
agent.train()
# test(agent, num_episodes=1)
agent.plot()
Example #4
    def __init__(self, env=FrozenLakeEnv(map_name="4x4")):
        self.env = env
        self.no_states = self.env.observation_space.n
        self.no_actions = self.env.action_space.n
        self.Q = np.zeros((self.no_states, self.no_actions))
        self.returns = []
Example #5
            while not done:
                clear_output(wait=True)
                self.env.render()
                time.sleep(0.3)

                action = np.argmax(self.Q[state])
                state, reward, done, _ = self.env.step(action)
                steps += 1

            clear_output(wait=True)
            self.env.render()

            if reward == 1:
                print(f'Reached the goal in {steps} steps.')
                time.sleep(2)
            else:
                print('Sorry, try again!')
                time.sleep(2)
            clear_output(wait=True)


if __name__ == '__main__':
    agent = SARSA(env=FrozenLakeEnv(map_name="10x10"))
    agent.train(episodes=1000,
                gamma=1.0,
                alpha=0.1,
                epsilon=1.0,
                decay_rate=0.1,
                eligibility_decay=0.3)
    # agent.test(episodes_test=1)
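
The call to agent.train() above passes an eligibility_decay argument, which suggests a SARSA(λ) update with eligibility traces; the method itself is not part of this excerpt. Below is a minimal sketch of what one episode of such an update could look like. choose_action is a hypothetical epsilon-greedy helper, and the real implementation may differ.

# Illustrative sketch only -- one episode of SARSA(lambda) with accumulating
# eligibility traces, matching the parameters passed to agent.train() above.
# choose_action() is a hypothetical epsilon-greedy helper; the original
# method is not included in this excerpt.
import numpy as np

def run_episode(self, gamma, alpha, epsilon, eligibility_decay):
    E = np.zeros_like(self.Q)                    # eligibility traces
    state = self.env.reset()
    action = self.choose_action(state, epsilon)  # hypothetical helper
    done = False
    while not done:
        new_state, reward, done, _ = self.env.step(action)
        new_action = self.choose_action(new_state, epsilon)

        # TD error for the (s, a, r, s', a') transition.
        delta = reward + gamma * self.Q[new_state, new_action] - self.Q[state, action]

        # Bump the trace for the visited pair, update every entry in
        # proportion to its trace, then let the traces decay.
        E[state, action] += 1
        self.Q += alpha * delta * E
        E *= gamma * eligibility_decay

        state, action = new_state, new_action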
Example #6
class DQNFrozenLakeAgent:
    def __init__(self):
        self.env = FrozenLakeEnv(map_name='4x4')

        self.states = np.identity(16)
        self.x = tf.placeholder(shape=[1, 16], dtype=tf.float32)

        self.W = tf.Variable(tf.random_uniform([16, 4], 0, 0.1))

        self.Q = tf.matmul(self.x, self.W)
        self.Q_hat = tf.placeholder(shape=[1, 4], dtype=tf.float32)
        self.loss = tf.reduce_sum(tf.square(self.Q_hat - self.Q))
        self.train = tf.train.GradientDescentOptimizer(learning_rate=0.1)\
            .minimize(self.loss)

        self.gamma = 0.9
        self.epsilon = 1
        self.decay_rate = 0.001

        self.num_episodes = 500_000
        self.avg_rewards = []

        init = tf.global_variables_initializer()
        self.session = tf.Session()
        self.session.run(init)

    def test_matrix(self, Q, episode):
        total_reward = 0
        for i in range(100):
            state = self.env.reset()
            done = False
            while not done:
                Q_ = self.session.run(
                    Q, feed_dict={self.x: self.states[state:state + 1]})
                a = np.argmax(Q_, 1)[0]

                state, r, done, _ = self.env.step(a)
                total_reward += r

        result = total_reward / 100
        print('Episode: {:,}, Average reward: {}'.format(episode, result))
        return result

    def epsilon_greedy(self, Q_pred):
        """
        Returns the next action by exploration with probability epsilon and
        exploitation with probability 1-epsilon.
        """
        if np.random.random() <= self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(Q_pred, 1)[0]

    def decay_epsilon(self, episode):
        """
        Decaying exploration with the number of episodes.
        """
        self.epsilon = 0.1 + 0.9 * np.exp(-self.decay_rate * episode)

    def run_training(self):
        """Training the agent to find the frisbee on the frozen lake"""

        self.avg_rewards = []
        self.episode_len = np.zeros(self.num_episodes)

        for episode in range(self.num_episodes):
            state = self.env.reset()
            done = False

            while not done:
                # Predicted Q
                Q_pred = self.session.run(
                    self.Q, {self.x: self.states[state:state + 1]})
                action = self.epsilon_greedy(Q_pred)
                new_state, reward, done, _ = self.env.step(action)

                # Actual Q after performing an action
                Q_true = reward + self.gamma * np.max(
                    self.session.run(
                        self.Q,
                        {self.x: self.states[new_state:new_state + 1]}))

                Q_pred[0, action] = Q_true

                # Calculate loss and train the agent
                self.session.run(self.train,
                                 feed_dict={
                                     self.x: self.states[state:state + 1],
                                     self.Q_hat: Q_pred
                                 })

                state = new_state

                self.episode_len[episode] += 1

            self.decay_epsilon(episode)

            if episode % 5000 == 0:
                avg_reward = self.test_matrix(self.Q, episode)
                self.avg_rewards.append(avg_reward)
                if avg_reward > 0.8:
                    print("Frozen Lake solved 🏆🏆🏆")
                    break

    def plot(self):
        """Plot the episode length and average rewards per episode"""

        fig = plt.figure(figsize=(20, 5))

        episode_len = [i for i in self.episode_len if i != 0]

        rolling_len = pd.DataFrame(episode_len).rolling(100, min_periods=100)
        mean_len = rolling_len.mean()
        std_len = rolling_len.std()

        plt.plot(mean_len, color='red')
        plt.fill_between(x=std_len.index,
                         y1=(mean_len - std_len)[0],
                         y2=(mean_len + std_len)[0],
                         color='red',
                         alpha=.2)

        plt.ylabel('Episode length')
        plt.xlabel('Episode')
        plt.title(
            'Frozen Lake - Length of episodes (mean over window size 100)')
        plt.show()

        fig = plt.figure(figsize=(20, 5))

        plt.plot(self.avg_rewards, color='red')
        plt.gca().set_xticklabels(
            [i + i * 4999 for i in range(len(self.avg_rewards))])

        plt.ylabel('Average Reward')
        plt.xlabel('Episode')
        plt.title('Frozen Lake - Average rewards per episode')
        plt.show()
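
For completeness, one way to drive the class above, analogous to the Q-learning driver in Example #3:

if __name__ == '__main__':
    agent = DQNFrozenLakeAgent()
    agent.run_training()   # trains until the 100-episode average reward exceeds 0.8
    agent.plot()           # episode lengths and average rewards per test run
    agent.session.close()  # release the TensorFlow session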
Example #7
    values = np.zeros((map_size, map_size))
    for i in range(map_size):
        for j in range(map_size):
            values[i, j] = V[i * map_size + j]

    fig, ax = plt.subplots()
    im = ax.imshow(values)
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(),
             rotation=45,
             ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    for i in range(map_size):
        for j in range(map_size):
            text = ax.text(j,
                           i,
                           values[i, j],
                           ha="center",
                           va="center",
                           color="w")

    ax.set_title("MAP VALUES")
    fig.tight_layout()
    plt.show()
    print(V)


main(env=FrozenLakeEnv(map_name="4x4", is_slippery=True),
     map_size=4,
     episodes=2000)
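
The excerpt above plots a state-value array V over the map but does not show how main() produces it. As one possibility, V could come from value iteration over the environment's transition table env.P (available on the classic toy-text FrozenLakeEnv); the sketch below is illustrative only, and since main() also takes an episodes argument, the original may well estimate V from sampled episodes instead.

# Illustrative sketch only -- one way the missing main() could obtain the
# state values V plotted above: value iteration over env.P. Treat this as
# an assumption; the original implementation is not included here.
import numpy as np

def value_iteration(env, gamma=0.99, theta=1e-8):
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    V = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            # Bellman optimality backup:
            # V(s) = max_a sum_{s'} p(s'|s,a) * (r + gamma * V(s'))
            q_values = [
                sum(p * (r + gamma * V[s2] * (not done))
                    for p, s2, r, done in env.P[s][a])
                for a in range(n_actions)
            ]
            best = max(q_values)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            break
    return V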