Example #1
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build new instances of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
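                # Last element is a non-terminal flag: 1 while the episode
                # continues, 0 at termination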
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and \
                    i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-training episode", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # Allow episodes to grow longer as training progresses
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    a = self.network.act(s)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
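A minimal usage sketch for this example, assuming the surrounding project provides GUI, Displayer and Saver classes with the interfaces used above (the constructors shown here are hypothetical, not the project's confirmed API):

import tensorflow as tf

with tf.Session() as sess:
    gui = GUI()              # hypothetical: run-control widget used by Agent
    displayer = Displayer()  # hypothetical: reward tracker/plotter
    saver = Saver(sess)      # hypothetical: checkpoint manager
    agent = Agent(sess, gui, displayer, saver)
    agent.run()              # fills the buffer with random play, then trains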
Example #2
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.bounds = self.env.get_bounds()

        print("Creation of the actor-critic network")
        self.network = Network(self.sess, self.state_size, self.action_size,
                               self.bounds)

        self.critic_lr = settings.CRITIC_LEARNING_RATE
        self.actor_lr = settings.ACTOR_LEARNING_RATE

        self.delta_critic_lr = self.critic_lr / settings.TRAINING_EPS
        self.delta_actor_lr = self.actor_lr / settings.TRAINING_EPS

        self.sess.run(tf.global_variables_initializer())

    def predict_action(self, s, plot_distrib=False):
        if plot_distrib:
            action, distrib, value = self.sess.run(
                [self.network.actions,
                 self.network.Q_distrib_suggested_actions,
                 self.network.Q_values_suggested_actions],
                feed_dict={self.network.state_ph: s[None]})
            action, distrib, value = action[0], distrib[0], value[0]
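            # Plot the predicted return distribution and its expected value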
            fig = plt.figure(2)
            fig.clf()
            plt.bar(self.z, distrib, self.delta_z)
            plt.axvline(value, color='red', linewidth=0.7)
            plt.show(block=False)
            plt.pause(0.001)
            return action

        return self.sess.run(self.network.actions,
                             feed_dict={self.network.state_ph: s[None]})[0]

    def run(self):

        self.total_steps = 1
        self.sess.run(self.network.target_init)
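        # z: support atoms of the critic's return distribution;
        # delta_z: their spacing (distributional critic)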
        self.z = self.sess.run(self.network.z)
        self.delta_z = self.network.delta_z

        ep = 1
        while ep < settings.TRAINING_EPS + 1 and not GUI.STOP:

            s = self.env.reset()
            episode_reward = 0
            episode_step = 0
            done = False
            memory = deque()

            # Initialize exploration noise process
            noise_scale = settings.NOISE_SCALE * settings.NOISE_DECAY**ep

            # Initial state
            self.env.set_render(GUI.render.get(ep))
            self.env.set_gif(GUI.gif.get(ep))
            plot_distrib = GUI.plot_distrib.get(ep)

            # Allow longer episodes as training progresses
            max_steps = settings.MAX_EPISODE_STEPS + (ep // 50)

            while episode_step < max_steps and not done:

                noise = np.random.normal(size=self.action_size)
                scaled_noise = noise_scale * noise

                a = np.clip(
                    self.predict_action(s, plot_distrib) + scaled_noise,
                    *self.bounds)

                s_, r, done, info = self.env.act(a)

                episode_reward += r

                memory.append((s, a, r, s_, 0 if done else 1))

                if len(memory) >= settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_r, _, _ = memory.popleft()
                    for i, (_, _, ri, _, _) in enumerate(memory):
                        discount_r += ri * settings.DISCOUNT**(i + 1)
                    BUFFER.add(s_mem, a_mem, discount_r, s_, 0 if done else 1)

                if (len(BUFFER) > 0
                        and self.total_steps % settings.TRAINING_FREQ == 0):
                    self.network.train(BUFFER.sample(), self.critic_lr,
                                       self.actor_lr)

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.critic_lr -= self.delta_critic_lr
            self.actor_lr -= self.delta_actor_lr

            # Plot reward
            plot = GUI.plot.get(ep)
            DISPLAYER.add_reward(episode_reward, plot)

            # Print episode reward
            if GUI.ep_reward.get(ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f, Critic LR: %f, Actor LR: %f'
                    % (ep, episode_reward, episode_step, noise_scale,
                       self.critic_lr, self.actor_lr))

            # Save the model
            if GUI.save.get(ep):
                SAVER.save(ep)

            ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        self.env.set_render(settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:

                    a = self.predict_action(s)

                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
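Example #2 folds the last N_STEP_RETURN rewards into a single transition before it reaches the replay buffer. A standalone sketch of that bookkeeping, with n and gamma standing in for settings.N_STEP_RETURN and settings.DISCOUNT:

from collections import deque

def n_step_transitions(transitions, n, gamma):
    """Turn 1-step transitions (s, a, r, s_, not_done) into n-step ones."""
    memory = deque()
    for s, a, r, s_, not_done in transitions:
        memory.append((s, a, r, s_, not_done))
        if len(memory) >= n:
            # The oldest transition provides the start state and first reward
            s0, a0, G, _, _ = memory.popleft()
            # Add the discounted rewards of the n-1 transitions that follow
            for i, (_, _, ri, _, _) in enumerate(memory):
                G += ri * gamma ** (i + 1)
            yield s0, a0, G, s_, not_done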
Example #3
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build new instances of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, noise_summary, steps_summary])

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and \
                    i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-training episode", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()
        self.gui_thread.start()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # Allow episodes to grow longer as training progresses
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.noise_ph: noise_scale[0],
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                a = self.network.act(s)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif:
                self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
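Examples #1 and #3 perturb the deterministic action with temporally correlated noise. The per-step update they apply, isolated as a standalone function (the parameter names mirror the Settings constants used above; nothing beyond the source's own formula is added):

import numpy as np

def noise_step(noise, theta, mu, sigma, action_size):
    """One update of the exploration noise exactly as written in run():
    x <- theta * (mu - x) + sigma * N(0, 1), scaled by noise_scale before
    being added to the action."""
    return theta * (mu - noise) + sigma * np.random.randn(action_size)

# Per episode: noise = np.zeros(action_size); then each step:
#   noise = noise_step(noise, EXPLO_THETA, EXPLO_MU, EXPLO_SIGMA, action_size)
#   a = network.act(s) + noise_scale * noise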