Example #1
def dl_progress(count, block_size, total_size):
    # reporthook-style callback: create the progress bar on the first call,
    # then advance it by the number of bytes retrieved so far
    if ProgressTracker.progbar is None:
        if total_size == -1:
            total_size = None
        ProgressTracker.progbar = Progbar(total_size)
    else:
        ProgressTracker.progbar.update(count * block_size)
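
The (count, block_size, total_size) signature matches the reporthook callback of urllib.request.urlretrieve, so the helper above can be plugged straight into a download call. The sketch below is a minimal illustration under two assumptions: Progbar is the Keras progress bar (keras.utils.Progbar) and ProgressTracker is a small holder class defined next to the function; the URL is only a placeholder.

import urllib.request

from keras.utils import Progbar  # assumed source of the Progbar class


class ProgressTracker(object):
    # shared holder so every invocation of the hook reuses the same bar
    progbar = None


# dl_progress is the helper shown above; urlretrieve calls it as
# (block_number, block_size, total_size), with total_size == -1 when the
# server does not report a Content-Length.
urllib.request.urlretrieve("https://example.com/archive.tar.gz",
                           "archive.tar.gz", reporthook=dl_progress)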
Example #2
def __init__(self, env, config):
    Dueling.__init__(self, env, config)
    self.logger = get_logger(config.log_path)
    self.avg_reward = 0
    self.progress = Progbar(target=self.config.nsteps_train)
Example #3
import sys
from collections import deque

import gym
import numpy as np
import tensorflow as tf

# Project-local dependencies (Dueling, ReplayBuffer, Progbar, get_logger,
# export_plot) are assumed to be importable from the surrounding codebase.


class train_Dueling(Dueling):
    def __init__(self, env, config):
        Dueling.__init__(self, env, config)
        self.logger = get_logger(config.log_path)
        self.avg_reward = 0
        self.progress = Progbar(target=self.config.nsteps_train)

    def get_log(self, exp_schedule, lr_schedule, t, loss_eval, max_q_values,
                rewards):
        if ((t > self.config.learning_start)
                and (t % self.config.log_freq == 0)
                and (t % self.config.learning_freq == 0)):
            self.avg_reward = np.mean(rewards)
            max_q = np.mean(max_q_values)
            exp_schedule.update(t)
            lr_schedule.update(t)
            if len(rewards) > 0:
                self.progress.update(t + 1,
                                     values=[("Loss", loss_eval),
                                             ("Avg_R", self.avg_reward),
                                             ("Max_R", np.max(rewards)),
                                             ("eps", exp_schedule.epsilon),
                                             ("Max_Q", max_q),
                                             ("lr", lr_schedule.epsilon)])

        elif (t < self.config.learning_start) and (t % self.config.log_freq
                                                   == 0):
            sys.stdout.write("\rLearning has not started yet: {}/{}...".format(
                t, self.config.learning_start))
            sys.stdout.flush()

    def train_step(self, t, replay_buffer, lr):
        loss_eval = 0

        if (t > self.config.learning_start
                and t % self.config.learning_freq == 0):
            s_batch, a_batch, r_batch, sp_batch, done_mask_batch = replay_buffer.sample(
                self.config.batch_size)
            model_spec = {
                self.s: s_batch,
                self.a: a_batch,
                self.r: r_batch,
                self.sp: sp_batch,
                self.done_mask: done_mask_batch,
                self.lr: lr,
                self.avg_reward_placeholder: self.avg_reward,
            }
            loss_eval, summary, _ = self.sess.run(
                [self.loss, self.all_summary, self.train_op],
                feed_dict=model_spec)

            self.file_writer.add_summary(summary, t)

        if t % self.config.target_update_freq == 0:
            self.sess.run(self.update_target_op)

        if (t % self.config.saving_freq == 0):
            self.saver.save(self.sess,
                            self.config.model_output2,
                            global_step=t)

        return loss_eval

    def train(self, exp_schedule, lr_schedule):
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)

        t = last_eval = last_record = 0
        scores_eval = []  # scores for plot
        scores_eval += [self.evaluate()]

        while t < self.config.nsteps_train:
            sum_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1

                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose an action from the current Q-network, with exploration
                action_values = self.sess.run(self.q,
                                              feed_dict={self.s: [q_input]})[0]
                best_action = np.argmax(action_values)
                action = exp_schedule.get_action(best_action)

                # track recent Q-values for logging
                max_q_values.append(max(action_values))
                q_values += list(action_values)
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                loss_eval = self.train_step(t, replay_buffer,
                                            lr_schedule.epsilon)
                self.get_log(exp_schedule, lr_schedule, t, loss_eval,
                             max_q_values, rewards)
                sum_reward += reward
                if done or t >= self.config.nsteps_train: break

            rewards.append(sum_reward)

            if t > self.config.learning_start:
                if last_eval > self.config.eval_freq:
                    last_eval = 0
                    scores_eval += [self.evaluate()]

                elif self.config.record and (last_record >
                                             self.config.record_freq):
                    self.logger.info("Recording...")
                    last_record = 0
                    self.record()

        self.logger.info("*** Training is done.")
        self.saver.save(self.sess, self.config.model_output2, global_step=t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)

    def evaluate(self, env=None, num_episodes=None):
        if env is None: env = self.env
        if num_episodes is None:
            self.logger.info("Evaluating...")
            num_episodes = self.config.num_episodes_test

        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []
        for i in range(num_episodes):
            sum_reward = 0
            state = env.reset()
            while True:
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()
                action = self.env.action_space.sample()
                if self.config.soft_epsilon < np.random.random():
                    action = np.argmax(
                        self.sess.run(self.q, feed_dict={self.s:
                                                         [q_input]})[0])
                new_state, reward, done, info = env.step(action)
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                sum_reward += reward
                if done: break
            rewards.append(sum_reward)

        avg_reward = np.mean(rewards)
        if num_episodes > 1:
            self.logger.info("Average reward: {:04.2f}".format(avg_reward))
        return avg_reward

    def record(self):
        record_env = gym.wrappers.Monitor(self.env,
                                          self.config.record_path,
                                          video_callable=lambda x: True,
                                          resume=True)
        self.evaluate(record_env, 1)

    def run(self, exp_schedule, lr_schedule):
        self.sess = tf.Session()
        self.all_summary = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.config.output_path,
                                                 self.sess.graph)

        init = tf.global_variables_initializer()
        self.sess.run(init)
        # sync the target network with the online network before training starts
        self.sess.run(self.update_target_op)

        self.saver = tf.train.Saver(max_to_keep=2)
        # restore pretrained weights; cnn_saver is assumed to be built by Dueling
        self.cnn_saver.restore(self.sess, self.config.model_output)

        # model
        self.train(exp_schedule, lr_schedule)

        if self.config.record:
            self.record()
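
For completeness, a driver for train_Dueling would build the environment, the configuration object, and the exploration and learning-rate schedules before calling run. The sketch below is hypothetical: Config, LinearExploration, LinearSchedule, and the config fields they consume are assumptions about the surrounding project, not part of the example above.

import gym

# Hypothetical wiring; Config, LinearExploration and LinearSchedule are assumed
# to come from the surrounding project, as are the config fields used here.
if __name__ == "__main__":
    config = Config()
    env = gym.make(config.env_name)

    # epsilon-greedy exploration schedule and learning-rate schedule
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    model = train_Dueling(env, config)
    model.run(exp_schedule, lr_schedule)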