Пример #1
0
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # unpack
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                                 self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                true_reward.reshape((self.n_envs, self.n_steps)),
                                                masks.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)


                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
Пример #2
0
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefinied at every call
            with self.graph.as_default():
                with tf.variable_scope("kfac_apply", reuse=self.trained,
                                       custom_getter=tf_util.outer_scope_getter("kfac_apply")):
                    # Some of the variables are not in a scope when they are create
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

                    self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                                              if not f and v not in old_uninitialized_vars]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
            else:
                enqueue_threads = []

            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()

                # pytype:disable=bad-unpacking
                # true_reward is the reward without discount
                if isinstance(self.runner, PPO2Runner):
                    # We are using GAE
                    rollout = self.runner.run(callback)
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout
                else:
                    rollout = self.runner.run(callback)
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout
                # pytype:enable=bad-unpacking

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values,
                                                                           self.num_timesteps // (self.n_batch + 1),
                                                                           writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                true_reward.reshape((self.n_envs, self.n_steps)),
                                                masks.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)

        callback.on_training_end()
        return self
Пример #3
0
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            episode_stats = EpisodeStats(self.n_steps, self.n_envs)

            if self.replay_ratio > 0:
                buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size)
            else:
                buffer = None

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            # n_batch samples, 1 on_policy call and multiple off-policy calls
            for steps in range(0, total_timesteps, self.n_batch):

                callback.on_rollout_start()

                enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run(callback)
                callback.update_locals(locals())
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                episode_stats.feed(rewards, dones)

                if buffer is not None:
                    buffer.put(enc_obs, actions, rewards, mus, dones, masks)

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                rewards.reshape((self.n_envs, self.n_steps)),
                                                dones.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)

                # reshape stuff correctly
                obs = obs.reshape(self.runner.batch_ob_shape)
                actions = actions.reshape([self.n_batch])
                rewards = rewards.reshape([self.n_batch])
                mus = mus.reshape([self.n_batch, self.n_act])
                dones = dones.reshape([self.n_batch])
                masks = masks.reshape([self.runner.batch_ob_shape[0]])

                names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
                                                         self.num_timesteps, writer)

                if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0):
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", int(steps / (time.time() - t_start)))
                    # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
                    # not just at the terminal state. Thus, this is mean until end of life, not end of episode.
                    # For true episode rewards, see the monitor files in the log folder.
                    logger.record_tabular("mean_episode_length", episode_stats.mean_length())
                    logger.record_tabular("mean_episode_reward", episode_stats.mean_reward())
                    for name, val in zip(names_ops, values_ops):
                        logger.record_tabular(name, float(val))
                    logger.dump_tabular()

                if (self.replay_ratio > 0 and
                    buffer is not None and
                    buffer.has_atleast(self.replay_start)):
                    samples_number = np.random.poisson(self.replay_ratio)
                    for _ in range(samples_number):
                        # get obs, actions, rewards, mus, dones from buffer.
                        obs, actions, rewards, mus, dones, masks = buffer.get()

                        # reshape stuff correctly
                        obs = obs.reshape(self.runner.batch_ob_shape)
                        actions = actions.reshape([self.n_batch])
                        rewards = rewards.reshape([self.n_batch])
                        mus = mus.reshape([self.n_batch, self.n_act])
                        dones = dones.reshape([self.n_batch])
                        masks = masks.reshape([self.runner.batch_ob_shape[0]])

                        self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
                                         self.num_timesteps)

        callback.on_training_end()

        return self