Example #1
    def learn(self, total_timesteps, save_dir, render, load_path=None, callback=None, seed=None, log_interval=1,
              tb_log_name="PPO2", reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            episode_stats = EpisodeStats(self.n_steps, self.n_envs)

            #if load_path is not None:
            #  loaded_model = self.load(load_path)
            #  runner = Runner(env=self.env, model=loaded_model, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            #  print("loaded model {}".format(loaded_model))
            #else:
            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(render=render)
                episode_stats.feed(true_reward, masks)
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                prev_num_timesteps = self.num_timesteps
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                            self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                            self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                                 writer=writer, states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, self.num_timesteps)

                # Computed every update so the checkpoint block below can log it too
                explained_var = explained_variance(values, returns)
                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv("mean_episode_length", episode_stats.mean_length())
                    logger.logkv("mean_episode_reward", episode_stats.mean_reward())
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                # save checkpoint
                # check if save_dir exists, otherwise make new directory
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                model_path = save_dir + str(self.num_timesteps) + "model.ckpt"
                self.save(model_path)
                print("Checkpoint {} saved".format(model_path))

                # Also save explained variance to a txt file
                fname = save_dir + "explained-var.txt"
                with open(fname, "a+") as fid:
                    fid.write(str(explained_var) + "\n")

                # look for previously saved checkpoint, and delete it
                #prev_checkpoint_num = prev_num_timesteps
                #prev_checkpoint_file = save_dir + str(prev_checkpoint_num) + "model.ckpt"
                #if os.path.exists(prev_checkpoint_file):
                #  os.remove(prev_checkpoint_file)
                #  print("Prev checkpoint file {} removed".format(prev_checkpoint_file))

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
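A minimal usage sketch for the checkpointing learn() variant above, assuming a PPO2 subclass that defines this method (the name CheckpointingPPO2 is made up for illustration) and the usual stable_baselines vectorized-env setup; note that save_dir is concatenated directly with the checkpoint file name, so it should end with a slash.

import gym
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])     # single-process vec env
model = CheckpointingPPO2("MlpPolicy", env, verbose=1)   # hypothetical subclass defining learn() above
model.learn(total_timesteps=100000, save_dir="./checkpoints/", render=False)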
Example #2
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
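For reference, the frac passed to self.learning_rate(frac) and self.cliprange(frac) in Examples #1 and #2 is the fraction of training remaining (1.0 on the first update, approaching 0 on the last). A sketch of a linear schedule compatible with that convention, assuming get_schedule_fn simply wraps constants into callables of frac and passes callables through unchanged:

def linear_schedule(initial_value):
    # Returns a callable mapping remaining-progress frac in (0, 1] to a current value.
    def schedule(frac):
        return initial_value * frac
    return schedule

lr_fn = linear_schedule(2.5e-4)
lr_fn(1.0)   # 0.00025 at the start of training
lr_fn(0.5)   # 0.000125 halfway through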
Example #3
    def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="Dual",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate  = get_schedule_fn(self.learning_rate)
        self.cliprange      = get_schedule_fn(self.cliprange)
        cliprange_vf        = get_schedule_fn(self.cliprange_vf)

        new_tb_log   = self._init_num_timesteps(reset_num_timesteps)
        top_callback = SaveOnTopRewardCallback(check_freq=self.n_steps, logdir=self.tensorboard_log, models_num=self.models_num)
        callback.append(top_callback)
        callback     = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.models[0].graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:

            for model in self.models:
                model._setup_learn(self)

            t_first_start = time.time()
            n_updates     = total_timesteps // (self.n_envs * self.n_steps)

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert (self.n_envs * self.n_steps) % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) is not a factor of the total number of samples collected per rollout (`n_batch`), some samples won't be used.")

                batch_size       = (self.n_envs * self.n_steps) // self.nminibatches
                t_start          = time.time()
                frac             = 1.0 - (update - 1.0) / n_updates
                lr_now           = self.learning_rate(frac)
                cliprange_now    = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()

                rollouts = self.runner.run(callback) #execute episode

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break


                # Unpack
                i = 0
                steps_used = rollouts[-1]
                for rollout in rollouts[0]:
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, success_stages = rollout
                    model = self.models[i]
                    # calc = len(true_reward)
                    # model.n_batch = calc

                    if model.n_batch == 0:
                        pass  # no samples were collected for this model on this rollout
                    else:
                        self.ep_info_buf.extend(ep_infos)   
                        mb_loss_vals = []
                        if states is None:  # nonrecurrent version
                            update_fac = max(model.n_batch // self.nminibatches // self.noptepochs, 1)
                            inds = np.arange(len(obs))#np.arange(model.n_batch)
                            for epoch_num in range(self.noptepochs):
                                np.random.shuffle(inds)
                                for start in range(0, model.n_batch, batch_size):
                                    timestep = self.num_timesteps // update_fac + ((epoch_num * model.n_batch + start) // batch_size)
                                    end      = start + batch_size
                                    mbinds   = inds[start:end]
                                    if len(obs) > 1:
                                        slices   = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, model=self.models[i], writer=writer, update=timestep, cliprange_vf=cliprange_vf_now))
                                    else:
                                        mb_loss_vals.append((0,0,0,0,0))
                            i += 1
                        else:
                            raise NotImplementedError("recurrent version is not supported")

                        loss_vals = np.mean(mb_loss_vals, axis=0)
                        t_now     = time.time()
                        fps       = int(model.n_batch / (t_now - t_start))

                        if writer is not None:
                            n_steps = model.n_batch
                            try:
                                total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, n_steps)), masks.reshape((self.n_envs, n_steps)), writer, self.num_timesteps)
                            except Exception:
                                print("Failed to log episode reward of shape {}".format(true_reward.shape))
                            summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Successful stages',
                                                                         simple_value=success_stages)])
                            writer.add_summary(summary, self.num_timesteps)
                            #@TODO plot in one graph:
                            # Use a separate loop variable so the model index `i` above is not clobbered
                            for net_idx, val in enumerate(steps_used):
                                summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Used steps net {}'.format(net_idx),
                                                                             simple_value=val)])
                                writer.add_summary(summary, self.num_timesteps)

                        if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                            explained_var = explained_variance(values, returns)
                            logger.logkv("serial_timesteps", update * self.n_steps)
                            logger.logkv("n_updates", update)
                            logger.logkv("total_timesteps", self.num_timesteps)
                            logger.logkv("fps", fps)
                            logger.logkv("Steps", steps_used)
                            logger.logkv("explained_variance", float(explained_var))
                            if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                                logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                                logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                            logger.logkv('time_elapsed', t_start - t_first_start)
                            for (loss_val, loss_name) in zip(loss_vals, model.loss_names):
                                logger.logkv(loss_name, loss_val)
                            logger.dumpkvs()

            callback.on_training_end()
            return self
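Example #3 drives training through the object-style callback API (on_training_start, on_rollout_start, on_rollout_end, on_training_end). A minimal sketch of a callback that could be appended to the callback list alongside SaveOnTopRewardCallback, assuming the stable_baselines.common.callbacks.BaseCallback interface; returning False from _on_step requests early stopping.

from stable_baselines.common.callbacks import BaseCallback

class StopAfterTimestepsCallback(BaseCallback):
    """Illustrative callback: stop training once a timestep budget is reached."""

    def __init__(self, max_timesteps, verbose=0):
        super(StopAfterTimestepsCallback, self).__init__(verbose)
        self.max_timesteps = max_timesteps

    def _on_step(self):
        # self.num_timesteps is maintained by the base class during rollouts.
        return self.num_timesteps < self.max_timesteps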
Example #4
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2"):
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                n_batch_train = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update / (nupdates + 1))
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, n_batch_train):
                            timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                                        n_batch_train)
                            end = start + n_batch_train
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                 update=timestep))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    envinds = np.arange(self.n_envs)
                    flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envsperbatch = n_batch_train // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(envinds)
                        for start in range(0, self.n_envs, envsperbatch):
                            timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                                        envsperbatch)
                            end = start + envsperbatch
                            mb_env_inds = envinds[start:end]
                            mb_flat_inds = flatinds[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                                 writer=writer, states=mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, update * (self.n_batch + 1))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and ((update + 1) % log_interval//100 == 0 or update == 0):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
                    logger.logkv("nupdates", (update + 1))
                    logger.logkv("total_timesteps", (update + 1) * self.n_batch)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            return self
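The explained_variance helper logged throughout these examples measures how well the value predictions explain the empirical returns: 1 means perfect prediction, 0 means no better than predicting the mean, and negative values mean worse. A minimal NumPy sketch of that formula (not necessarily the library's exact implementation):

import numpy as np

def explained_variance_sketch(y_pred, y_true):
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y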
Example #5
    def multi_task_learn_for_one_episode(self, task: str,
                                         runner: MultiTaskA2CRunner,
                                         writer: TensorboardWriter):
        """
        Trains until game over.

        :param task: (str) name of the game
        :param runner: (MultiTaskA2CRunner)
        :param writer: (TensorboardWriter)
        :return: (episode_score, policy_loss, value_loss, episode_training_updates)
        """
        print(
            "-----------------------------------------------------------------"
        )
        print(
            "---------------------------{}---------------------------".format(
                task))

        mask = [False] * self.n_envs_per_task
        episode_scores = np.zeros(self.n_envs_per_task)
        policy_loss = value_loss = None
        episode_training_updates = 0
        episode_timesteps = 0
        all_process_ended = False
        tmp_scores = np.zeros(self.n_batch)

        while not all_process_ended:
            t_start = time.time()
            # self.updates = self.num_timesteps // self.n_batch + 1
            self.total_train_steps += 1
            # true_reward is the reward without discount
            obs, states, rewards, masks, actions, values, true_rewards, dones = runner.run(
            )
            policy_loss, value_loss, policy_entropy = self._train_step(
                task, obs, states, rewards, masks, actions, values, writer)
            n_seconds = time.time() - t_start
            fps = int(self.n_batch / n_seconds)
            train_step_per_sec = int(1 / n_seconds)
            tmp_ep_scores = self.episode_reward.get_reward(
                task, true_rewards.reshape(
                    (self.n_envs_per_task, self.n_steps)),
                masks.reshape((self.n_envs_per_task, self.n_steps)),
                self.total_train_steps)
            tmp_scores += true_rewards
            masks_reshaped = masks.reshape(
                (self.n_envs_per_task, self.n_steps))
            assert masks_reshaped.shape[
                0] == self.n_envs_per_task, "dones.shape[0] must be n_envs_per_task"
            for i in range(masks_reshaped.shape[0]):
                if True in masks_reshaped[i, :]:
                    mask[i] = True
                    episode_scores[i] = tmp_ep_scores[i]
            all_process_ended = all(mask)

            self.num_timesteps += self.n_batch
            episode_timesteps += self.n_steps
            episode_training_updates += 1

            if self.verbose >= 1 and (
                (self.total_train_steps %
                 config.stdout_logging_frequency_in_train_steps == 0)
                    or all_process_ended):
                explained_var = explained_variance(values, rewards)
                logger.record_tabular("training_updates",
                                      self.total_train_steps)
                logger.record_tabular("total_timesteps", self.num_timesteps)
                logger.record_tabular("train_step_per_sec", train_step_per_sec)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("explained_variance",
                                      float(explained_var))
                logger.dump_tabular()

        print("Game over: {}".format(all_process_ended))

        episode_score = np.mean(episode_scores)

        return episode_score, policy_loss, value_loss, episode_training_updates
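The episode-end bookkeeping in Example #5 reshapes the flat mask vector to (n_envs_per_task, n_steps) and marks an environment as finished if any step in its row is a done; training for the episode stops once every row has one. The same check in isolation, with illustrative numbers:

import numpy as np

masks = np.array([0, 0, 1, 0, 0, 0, 0, 1], dtype=bool)   # n_envs_per_task=2, n_steps=4
finished_per_env = masks.reshape(2, 4).any(axis=1)        # array([ True,  True])
all_process_ended = bool(finished_per_env.all())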
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                n_batch_train = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos = runner.run(
                )  # pylint: disable=E0632
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for _ in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, n_batch_train):
                            end = start + n_batch_train
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now, cliprangenow,
                                                 *slices))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    envinds = np.arange(self.n_envs)
                    flatinds = np.arange(self.n_envs * self.n_steps).reshape(
                        self.n_envs, self.n_steps)
                    envsperbatch = n_batch_train // self.n_steps
                    for _ in range(self.noptepochs):
                        np.random.shuffle(envinds)
                        for start in range(0, self.n_envs, envsperbatch):
                            end = start + envsperbatch
                            mb_env_inds = envinds[start:end]
                            mb_flat_inds = flatinds[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now, cliprangenow, *slices,
                                                 mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_interval // 100 == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", update * self.n_batch)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            return self
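safe_mean, used for the ep_rewmean / eplenmean diagnostics above, is a mean that tolerates an empty episode-info buffer early in training. A sketch of the expected behaviour (the library's own version may differ in detail):

import numpy as np

def safe_mean_sketch(values):
    # Avoid the NumPy warning and nan surprises of np.mean([]) on an empty buffer.
    return np.nan if len(values) == 0 else float(np.mean(values))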
Example #7
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = self.runner.run(
                )
                self.ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                self.num_timesteps += self.n_batch

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

        return self
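Examples #1, #2 and #7 use the old functional callback convention: the callback receives locals() and globals(), and training stops only if it returns False (returning None keeps training, for backwards compatibility). A sketch of such a callback, with the update budget chosen arbitrarily for illustration:

def stop_after_updates(local_vars, global_vars, max_updates=500):
    # `update` is the loop variable inside learn(); keep training while below the budget.
    return local_vars["update"] < max_updates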
Example #8
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="PPO1",
              reset_num_timesteps=True,
              save_path=None,
              save_iters=20):
        is_root = (MPI.COMM_WORLD.Get_rank() == 0)
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            with self.sess.as_default():
                self.adam.sync()
                callback.on_training_start(locals(), globals())

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi,
                                                 self.env,
                                                 self.timesteps_per_actorbatch,
                                                 callback=callback)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                len_buffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                reward_buffer = deque(maxlen=100)

                while True:
                    t_episode = time.time()
                    if timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    if is_root:
                        logger.log("********** Iteration %i ************" %
                                   iters_so_far)

                    seg = seg_gen.__next__()

                    # Stop training early (triggered by the callback)
                    if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                        break

                    add_vtarg_and_adv(seg, self.num_robot, self.gamma,
                                      self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    observations, actions = seg["observations"], seg["actions"]
                    atarg, tdlamret = seg["adv"], seg["tdlamret"]

                    # true_rew is the reward without discount
                    # if writer is not None:
                    #     total_episode_reward_logger(self.episode_reward,
                    #                                 seg["true_rewards"].reshape((self.n_envs, -1)),
                    #                                 writer, self.num_timesteps, int(self.timesteps_per_actorbatch/100)) # step write reward sum

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    # atarg = (atarg - atarg.mean()) / atarg.std()
                    temp_atarg = [[] for _ in range(self.num_robot)]
                    for i in range(
                            int(self.timesteps_per_actorbatch /
                                self.num_robot)):
                        for j in range(self.num_robot):
                            temp_atarg[j].append(atarg[i * self.num_robot + j])
                    for i in range(self.num_robot):
                        temp_atarg[i] = np.array(temp_atarg[i])
                        temp_atarg[i] = (temp_atarg[i] - temp_atarg[i].mean()
                                         ) / temp_atarg[i].std()
                    for i in range(
                            int(self.timesteps_per_actorbatch /
                                self.num_robot)):
                        for j in range(self.num_robot):
                            atarg[i * self.num_robot + j] = temp_atarg[j][i]

                    dataset = Dataset(dict(ob=observations,
                                           ac=actions,
                                           atarg=atarg,
                                           vtarg=tdlamret),
                                      shuffle=not self.policy.recurrent)
                    optim_batchsize = self.optim_batchsize or observations.shape[
                        0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)

                    if is_root:
                        logger.log("Optimizing...")
                        logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(
                                dataset.iterate_once(optim_batchsize)):
                            steps = (
                                self.num_timesteps + k * optim_batchsize +
                                int(i *
                                    (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if self.full_tensorboard_log and (1 +
                                                                  k) % 10 == 0:
                                    run_options = tf.RunOptions(
                                        trace_level=tf.RunOptions.FULL_TRACE)
                                    run_metadata = tf.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess,
                                        options=run_options,
                                        run_metadata=run_metadata)
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(
                                    batch["ob"],
                                    batch["ob"],
                                    batch["ac"],
                                    batch["atarg"],
                                    batch["vtarg"],
                                    cur_lrmult,
                                    sess=self.sess)

                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)

                        if is_root:
                            logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    if is_root:
                        logger.log("Evaluating losses...")

                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)

                    if is_root:
                        logger.log(fmt_row(13, mean_losses))

                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    if writer is not None:
                        for i in range(len(rews)):
                            summary = tf.Summary(value=[
                                tf.Summary.Value(tag="episode_reward",
                                                 simple_value=rews[i])
                            ])
                            writer.add_summary(summary, self.num_timesteps + i)
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)
                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps

                    if is_root and (save_path
                                    is not None) and (iters_so_far % save_iters
                                                      == 0):
                        self.save(save_path)

                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    logger.record_tabular("TimePerEpisode",
                                          time.time() - t_episode)
                    if self.verbose >= 1 and is_root:
                        logger.dump_tabular()
        callback.on_training_end()

        if is_root and save_path is not None:
            self.save(save_path)

        return self
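add_vtarg_and_adv in Example #8 fills seg["adv"] and seg["tdlamret"] with GAE(lambda) advantages and the corresponding value targets. A simplified single-trajectory sketch of that computation (the repository's multi-robot version interleaves robots, which is omitted here):

import numpy as np

def gae_sketch(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    values = np.asarray(values, dtype=np.float64)
    adv = np.zeros(len(rewards))
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = last_value if t == len(rewards) - 1 else values[t + 1]
        non_terminal = 1.0 - float(dones[t])           # no bootstrap past a terminal step
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        last_gae = delta + gamma * lam * non_terminal * last_gae
        adv[t] = last_gae
    return adv, adv + values                           # advantages, tdlamret-style value targets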
Example #9
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env,
                              self.nsteps_per_seg) + sh[2:])

        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_ret, resh(self.buf_rets)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        ph_buf.extend([(self.dynamics.last_ob,
                        self.rollout.buf_obs_last.reshape([
                            self.nenvs * self.nsegs_per_env, 1,
                            *self.ob_space.shape
                        ]))])
        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({
                    self.ph_lr: self.lr,
                    self.ph_cliprange: self.cliprange
                })
                mblossvals.append(getsess().run(self._losses + (self._train, ),
                                                fd)[:-1])

        mblossvals = [mblossvals[0]]
        info.update(
            zip(['opt_' + ln for ln in self.loss_names],
                np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = MPI.COMM_WORLD.Get_size(
        ) * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow

        return info
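The normadv branch in Example #9 standardises the advantage buffer before the PPO update; get_mean_and_std is presumably an MPI-aware mean/std, but on a single worker the operation reduces to the following:

import numpy as np

buf_advs = np.random.randn(8, 128).astype(np.float32)   # (nenvs, nsteps), illustrative shape
mean, std = buf_advs.mean(), buf_advs.std()
buf_advs = (buf_advs - mean) / (std + 1e-7)              # epsilon guards against zero variance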
Example #10
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name=None,
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        tb_log_name = self.model_name

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            n_checkpoints = self.checkpoint_inc // self.n_batch
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                for ep_info in ep_infos:
                    self.plot_l.append(ep_info['l'])
                    self.plot_r.append(ep_info['r'])
                    self.plot_t.append(ep_info['t'])

                if update % n_checkpoints == 0:
                    checkpoint = 'model-' + str(self.num_timesteps)
                    with open(self.checkpoint_log, mode='a') as employee_file:
                        employee_writer = csv.writer(employee_file)
                        employee_writer.writerow([checkpoint])
                    graph_name_csv = self.graph_name + '-' + str(
                        self.num_timesteps) + '.csv'
                    graph_name_sb = self.graph_name + '-' + str(
                        self.num_timesteps) + '.pkl'
                    self.save(graph_name_sb)
                    with open(graph_name_csv, mode='w') as employee_file:
                        employee_writer = csv.writer(employee_file,
                                                     delimiter=',',
                                                     quotechar='"',
                                                     quoting=csv.QUOTE_MINIMAL)
                        employee_writer.writerow(['r', 'l', 't'])
                        for i in range(len(self.plot_l)):
                            employee_writer.writerow([
                                self.plot_r[i], self.plot_l[i], self.plot_t[i]
                            ])
                    pydrive_util.upload_file(self.drive, graph_name_sb)
                    pydrive_util.upload_file(self.drive, graph_name_csv)
            return self
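
The checkpointing branch above dumps per-episode (r, l, t) statistics to a CSV next to the saved model. A minimal sketch of that file layout, factored into a standalone helper (the function name and signature are assumptions, only the column order mirrors the snippet):

import csv


def write_episode_stats_csv(path, rewards, lengths, times):
    """Write one row per episode with the same 'r', 'l', 't' columns as the
    checkpoint CSVs produced above; illustrative only."""
    with open(path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['r', 'l', 't'])
        for row in zip(rewards, lengths, times):
            writer.writerow(list(row))
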
Example #11
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            with self.sess.as_default():
                self.adam.sync()

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env,
                                                 self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                while True:
                    if callback:
                        callback(locals(), globals())
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = (
                        seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"])

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(
                        dict(ob=obs_ph,
                             ac=action_ph,
                             atarg=atarg,
                             vtarg=tdlamret),
                        shuffle=not issubclass(self.policy, LstmPolicy))
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for _ in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for batch in dataset.iterate_once(optim_batchsize):
                            *newlosses, grad = self.lossandgrad(batch["ob"],
                                                                batch["ob"],
                                                                batch["ac"],
                                                                batch["atarg"],
                                                                batch["vtarg"],
                                                                cur_lrmult,
                                                                sess=self.sess)
                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += seg["total_timestep"]
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

        return self
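
add_vtarg_and_adv is assumed here to compute GAE(lambda) advantages and TD(lambda) value targets for the sampled segment. A minimal numpy sketch of that computation follows; the key names ("rewards", "dones", "vpred", "nextvpred") follow the baselines convention and are assumptions, not taken from this snippet.

import numpy as np


def add_vtarg_and_adv_sketch(seg, gamma, lam):
    """Illustrative GAE(lambda) pass over one trajectory segment, i.e. what
    add_vtarg_and_adv above is assumed to compute; not the library code."""
    dones = np.append(seg["dones"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    rewards = seg["rewards"]
    horizon = len(rewards)
    advantages = np.zeros(horizon, dtype="float32")
    last_gae_lam = 0.0
    for step in reversed(range(horizon)):
        # do not bootstrap across episode boundaries
        nonterminal = 1.0 - dones[step + 1]
        delta = rewards[step] + gamma * vpred[step + 1] * nonterminal - vpred[step]
        advantages[step] = last_gae_lam = (
            delta + gamma * lam * nonterminal * last_gae_lam)
    seg["adv"] = advantages
    seg["tdlamret"] = advantages + seg["vpred"]  # TD(lambda) value targets
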
Example #12
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="TRPO",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name,
                new_tb_log) as writer:
            self._setup_learn()

            with self.sess.as_default():
                callback.on_training_start(locals(), globals())

                seg_gen = traj_segment_generator(self.policy_pi,
                                                 self.env,
                                                 self.timesteps_per_batch,
                                                 callback=callback)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(maxlen=40)  # rolling buffer for episode rewards

                while True:
                    if timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()

                        # Stop training early (triggered by the callback)
                        if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                            break

                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action = seg["observations"], seg["actions"]
                        atarg, tdlamret = seg["adv"], seg["tdlamret"]

                        vpredbefore = seg["vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / (
                            atarg.std() + 1e-8
                        )  # standardized advantage function estimate

                        print('advantages: ', np.min(atarg), np.max(atarg),
                              np.mean(atarg))
                        # true_rew is the reward without discount
                        if writer is not None:
                            total_episode_reward_logger(
                                self.episode_reward,
                                seg["true_rewards"].reshape(
                                    (self.n_envs, -1)), seg["dones"].reshape(
                                        (self.n_envs, -1)), writer,
                                self.num_timesteps)

                        args = (seg["observations"], seg["observations"],
                                seg["actions"], atarg)
                        # Subsampling: see p40-42 of John Schulman thesis
                        # http://joschu.net/docs/thesis.pdf
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = (tf.RunMetadata()
                                            if self.full_tensorboard_log else None)

                            _, grad, *lossbefore = self.compute_lossandgrad(
                                *args,
                                tdlamret,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)

                        print('losses before', lossbefore)
                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("conjugate_gradient"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                            for (loss_name,
                                 loss_val) in zip(self.loss_names,
                                                  mean_losses):
                                logger.record_tabular(loss_name, loss_val)

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["observations"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128,
                                        shuffle=True):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    # Stop training early (triggered by the callback)
                    if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                        break

                    logger.record_tabular(
                        "explained_variance_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # lr: lengths and rewards
                    lr_local = (seg["ep_lens"], seg["ep_rets"])  # local values
                    list_lr_pairs = MPI.COMM_WORLD.allgather(
                        lr_local)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*list_lr_pairs))

                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))

                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        callback.on_training_end()
        return self
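
conjugate_gradient above solves F x = g using only Fisher-vector products, so the step direction never requires forming the Fisher matrix explicitly. A textbook sketch of that routine is shown below; it is illustrative of what the helper used above is expected to do, not its actual implementation.

import numpy as np


def conjugate_gradient_sketch(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only the matrix-vector product f_Ax (here the
    Fisher-vector product); standard conjugate gradient, sketch only."""
    x = np.zeros_like(b)
    r = b.copy()   # residual b - A x, with x = 0
    p = r.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x
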
Example #13
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="MDPO",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)
        print("got seed {}, sgd_steps {}".format(seed, self.sgd_steps))

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            with self.sess.as_default():
                callback.on_training_start(locals(), globals())

                seg_gen = traj_segment_generator(self.old_policy, self.env, self.timesteps_per_batch,
                                                     reward_giver=self.reward_giver,
                                                     gail=self.using_gail, mdal=self.using_mdal, neural=self.neural,
                                                     action_space=self.action_space, gamma=self.gamma, callback=callback)


                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(maxlen=40)  # rolling buffer for episode rewards

                self.episode_reward = np.zeros((self.n_envs,))
                self.outer_learning_rate = get_schedule_fn(3e-4)
                self.cliprange_vf = get_schedule_fn(0.2)

                true_reward_buffer = None
                if self.using_gail or self.using_mdal:
                    true_reward_buffer = deque(maxlen=40)

                    # Initialize dataloader
                    batchsize = self.timesteps_per_batch // self.d_step
                    self.expert_dataset.init_dataloader(batchsize)

                    #  Stats not used for now
                    # TODO: replace with normal tb logging
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                while True:
                    # if callback is not None:
                    #     # Only stop training if return value is False, not when it is None. This is for backwards
                    #     # compatibility with callbacks that have no return statement.
                    #     if callback(locals(), globals()) is False:
                    #         break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" % iters_so_far)

                    #def fisher_vector_product(vec):
                    #    return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    # logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                            break

                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        if self.using_mdal:
                            policy_successor_features = add_successor_features(seg, self.gamma,
                                                                           is_action_features=self.is_action_features)
                        else:
                            policy_successor_features = add_successor_features(seg, self.gamma)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action = seg["observations"], seg["actions"]
                        atarg, tdlamret = seg["adv"], seg["tdlamret"]


                        vpredbefore = seg["vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                              seg["true_rewards"].reshape(
                                                                                  (self.n_envs, -1)),
                                                                              seg["dones"].reshape((self.n_envs, -1)),
                                                                              writer, self.num_timesteps)

                        n_updates = int(total_timesteps / self.timesteps_per_batch)
                        lr_now = np.float32(1.0 - (iters_so_far - 1.0) / n_updates)
                        outer_lr_now = self.outer_learning_rate(1.0 - (iters_so_far - 1.0) / n_updates)
                        clip_now = self.cliprange_vf(1.0 - (iters_so_far - 1.0) / n_updates)
                        args = seg["observations"], seg["observations"], seg["actions"], atarg
                        # Subsampling: see p40-42 of John Schulman thesis
                        # http://joschu.net/docs/thesis.pdf
                        #fvpargs = [arr[::5] for arr in args]

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret,
                                                                                      lr_now, seg["vpred"],
                                                                                      seg["observations"],
                                                                                      sess=self.sess,
                                                                                      options=run_options,
                                                                                      run_metadata=run_metadata)
                                if self.full_tensorboard_log:
                                    writer.add_run_metadata(run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret,
                                                                                lr_now, seg["vpred"],
                                                                                seg["observations"],
                                                                                sess=self.sess,
                                                                                options=run_options,
                                                                                run_metadata=run_metadata)
                                td_map = {self.policy_pi.obs_ph: seg["observations"],
                                            self.old_policy.obs_ph: seg["observations"],
                                            self.closed_policy.obs_ph: seg["observations"],
                                            self.action: seg["actions"], self.atarg: atarg, self.ret: tdlamret,
                                            self.learning_rate_ph: lr_now, self.outer_learning_rate_ph: outer_lr_now,
                                            self.vtarg: seg["vpred"]}
                                for _ in range(int(self.sgd_steps)):
                                    _ = self.sess.run(self._train, td_map)
                                    #if self.method == "closed-KL":
                                    #    _ = self.sess.run(self._train_policy, td_map)

                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            for _ in range(1):
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(self.compute_losses(*args, lr_now, seg["vpred"], sess=self.sess)))

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret, mbval) in dataset.iterbatches((seg["observations"], seg["tdlamret"], seg["vpred"]),
                                                                         include_final_partial_batch=False,
                                                                         batch_size=128,
                                                                         shuffle=True):
                                    grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret, mbval, clip_now, sess=self.sess))
                                    self.vfadam.update(grad, outer_lr_now) #self.vf_stepsize)

                        if iters_so_far % 1 == 0:
                            # print("updating theta now")
                            self.assign_old_eq_new(sess=self.sess)

                    for (loss_name, loss_val) in zip(self.loss_names, mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular("explained_variance_tdlam_before",
                                          explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        assert len(observation) == self.timesteps_per_batch
                        batch_size = self.timesteps_per_batch // self.d_step

                        # NOTE: uses only the last g step for observation
                        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                        # NOTE: for recurrent policies, use shuffle=False?
                        for ob_batch, ac_batch in dataset.iterbatches((observation, action),
                                                                      include_final_partial_batch=False,
                                                                      batch_size=batch_size,
                                                                      shuffle=True):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch()
                            # update running mean/std for reward_giver
                            if self.reward_giver.normalize:
                                self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))

                            # Reshape actions if needed when using discrete actions
                            if isinstance(self.action_space, gym.spaces.Discrete):
                                if len(ac_batch.shape) == 2:
                                    ac_batch = ac_batch[:, 0]
                                if len(ac_expert.shape) == 2:
                                    ac_expert = ac_expert[:, 0]
                            *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad), self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                    elif self.using_mdal:
                        batch_sampling = True

                        if self.neural:

                            if batch_sampling:
                                batch_size = self.timesteps_per_batch // self.d_step

                                # NOTE: uses only the last g step for observation
                                d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                                # NOTE: for recurrent policies, use shuffle=False?
                                for ob_batch, ac_batch in dataset.iterbatches((observation, action),
                                                                              include_final_partial_batch=False,
                                                                              batch_size=batch_size,
                                                                              shuffle=True):
                                # ob_batch, ac_batch, gamma_batch = np.array(batch_buffer['obs']), np.array(
                                #     batch_buffer['acs']), np.array(batch_buffer['gammas'])
                                    gamma_batch = np.ones((ob_batch.shape[0]))
                                    ob_expert, ac_expert = self.expert_dataset.get_next_batch()
                                    gamma_expert = np.ones((ob_expert.shape[0]))
                                    # ob_expert, ac_expert, gamma_expert = np.concatenate(self.expert_dataset.ep_obs),\
                                    #                                      np.concatenate(self.expert_dataset.ep_acs),\
                                    #                                      np.concatenate(self.expert_dataset.ep_gammas)

                                    # update running mean/std for reward_giver
                                    if self.reward_giver.normalize:
                                        self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))

                                    # Reshape actions if needed when using discrete actions
                                    if isinstance(self.action_space, gym.spaces.Discrete):
                                        if len(ac_batch.shape) == 2:
                                            ac_batch = ac_batch[:, 0]
                                        if len(ac_expert.shape) == 2:
                                            ac_expert = ac_expert[:, 0]

                                    ob_reg_expert, ac_reg_expert = np.array(ob_expert), np.array(ac_expert)

                                    # while True:
                                    #     if ob_reg_expert.shape[0] == ob_batch.shape[0] and ac_reg_expert.shape[0] == \
                                    #             ac_batch.shape[0]:
                                    #         break
                                    #     ob_reg_expert, ac_reg_expert = self.expert_dataset.get_next_batch()
                                    #     ob_reg_expert, ac_reg_expert = np.array(ob_reg_expert), np.array(ac_reg_expert)


                                    alpha = np.random.uniform(0.0, 1.0, size=(ob_reg_expert.shape[0], 1))
                                    ob_mix_batch = alpha * ob_batch[:ob_reg_expert.shape[0]] + (1 - alpha) * ob_reg_expert
                                    ac_mix_batch = alpha * ac_batch[:ac_reg_expert.shape[0]] + (1 - alpha) * ac_reg_expert
                                    with self.sess.as_default():
                                        # self.reward_giver.train(ob_batch, ac_batch, np.expand_dims(gamma_batch, axis=1),
                                        #                         ob_expert, ac_expert, np.expand_dims(gamma_expert, axis=1))
                                        *newlosses, grad = self.reward_giver.lossandgrad(
                                                                ob_batch, ac_batch, np.expand_dims(gamma_batch, axis=1),
                                                                ob_expert, ac_expert, np.expand_dims(gamma_expert, axis=1),
                                                                ob_mix_batch, ac_mix_batch)
                                        self.d_adam.update(self.allmean(grad), self.d_stepsize)
                            else:
                                # assert len(observation) == self.timesteps_per_batch
                                # Comment out if you want only the latest rewards:
                                obs_batch, acs_batch, gammas_batch = seg['obs_batch'], seg['acs_batch'], seg['gammas_batch']
                                batch_successor_features = seg['successor_features_batch']


                                if self.reward_giver.normalize:
                                    ob_reg_batch, ac_reg_batch = observation, action
                                    ob_expert, _ = self.expert_dataset.get_next_batch()
                                    self.reward_giver.obs_rms.update(np.concatenate((ob_reg_batch, ob_expert), 0))
                                #     self.reward_giver.obs_rms.update(
                                #         np.array(batch_successor_features)[:, :self.observation_space.shape[0]])

                                for idx, (ob_batch, ac_batch, gamma_batch) in enumerate(
                                        zip(obs_batch, acs_batch, gammas_batch)):
                                    rand_traj = np.random.randint(self.expert_dataset.num_traj)
                                    ob_expert, ac_expert, gamma_expert = self.expert_dataset.ep_obs[rand_traj], \
                                                                         self.expert_dataset.ep_acs[rand_traj], \
                                                                         self.expert_dataset.ep_gammas[rand_traj]

                                    ob_batch, ac_batch, gamma_batch = np.array(ob_batch), np.array(ac_batch), np.array(
                                        gamma_batch)

                                    while True:
                                        ob_reg_expert, ac_reg_expert = self.expert_dataset.get_next_batch()
                                        ob_reg_expert, ac_reg_expert = np.array(ob_reg_expert), np.array(ac_reg_expert)

                                        if ob_reg_expert.shape[0] == ob_reg_batch.shape[0] and ac_reg_expert.shape[0] == \
                                                ac_reg_batch.shape[0]:
                                            break
                                    alpha = np.random.uniform(0.0, 1.0, size=(ob_reg_batch.shape[0], 1))
                                    ob_mix_batch = alpha * ob_reg_batch + (1 - alpha) * ob_reg_expert
                                    ac_mix_batch = alpha * ac_reg_batch + (1 - alpha) * ac_reg_expert

                                    with self.sess.as_default():
                                        *newlosses, grad = self.reward_giver.lossandgrad(
                                                                ob_batch, ac_batch, np.expand_dims(gamma_batch, axis=1),
                                                                ob_expert, ac_expert, np.expand_dims(gamma_expert, axis=1),
                                                                ob_mix_batch, ac_mix_batch)
                                        self.d_adam.update(self.allmean(grad), self.d_stepsize)
                                        # self.reward_giver.train(ob_batch, ac_batch, np.expand_dims(gamma_batch, axis=1),
                                        #                         ob_expert, ac_expert,
                                        #                         np.expand_dims(gamma_expert, axis=1),
                                        #                         ob_mix_batch, ac_mix_batch)

                    if self.using_gail or self.using_mdal:
                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs))
                        true_reward_buffer.extend(true_rets)
                    else:
                        lr_local = (seg["ep_lens"], seg["ep_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        if self.using_gail or self.using_mdal:
                            logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer))

                        logger.record_tabular("EpRewMean", np.mean(reward_buffer))
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))


                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    logger.record_tabular("Tsallis-q", self.tsallis_q)
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("seed", self.seed)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()
        callback.on_training_end()

        return self
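
The discriminator update above feeds reward_giver convex mixtures of agent and expert samples (the alpha-interpolated ob_mix_batch / ac_mix_batch), in the spirit of a gradient-penalty regularizer. A standalone sketch of that mixing step (the function name is an assumption; inputs are assumed to be numpy arrays with the expert batch no larger than the agent batch):

import numpy as np


def mix_agent_and_expert(ob_batch, ac_batch, ob_expert, ac_expert):
    """Convex interpolation between agent and expert samples, mirroring the
    alpha-mixing used for the reward_giver update above; illustrative only."""
    n = ob_expert.shape[0]
    alpha = np.random.uniform(0.0, 1.0, size=(n, 1))
    ob_mix = alpha * ob_batch[:n] + (1.0 - alpha) * ob_expert
    ac_mix = alpha * ac_batch[:n] + (1.0 - alpha) * ac_expert
    return ob_mix, ac_mix
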
Example #14
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            start_decay = n_updates
            pp_sr_buf = deque(maxlen=5)
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                if self.curriculum:
                    if 'FetchStack' in self.env.get_attr('spec')[0].id:
                        # Stacking
                        pp_sr = pp_eval_model(self.eval_env, self)
                        pp_sr_buf.append(pp_sr)
                        print('Pick-and-place success rate', np.mean(pp_sr_buf))
                        if start_decay == n_updates and np.mean(pp_sr_buf) > 0.8:
                            start_decay = update
                        _ratio = np.clip(0.7 - 0.8 * (update - start_decay) / 380, 0.3, 0.7)  # from 0.7 to 0.3
                    elif 'FetchPushWallObstacle' in self.env.get_attr('spec')[0].id:
                        _ratio = max(1.0 - (update - 1.0) / n_updates, 0.0)
                    elif 'MasspointMaze-v3' in self.env.get_attr('spec')[0].id:
                        _ratio = 1.0 - (update - 1.0) / n_updates
                    else:
                        raise NotImplementedError
                    self.env.env_method('set_random_ratio', _ratio)
                    print('Set random_ratio to', self.env.get_attr('random_ratio')[0])
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                            self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                                 update=timestep, cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                            self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                                 writer=writer, states=mb_states,
                                                                 cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_success_rate', safe_mean([ep_info['is_success'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
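
As in the other PPO2 variants, the learning rate and clip range are annealed through frac = 1.0 - (update - 1.0) / n_updates. The sketch below shows one way a linearly decaying schedule can be supplied to get_schedule_fn; the helper name is an assumption and this is not the stable-baselines implementation.

def linear_schedule(initial_value):
    """Return a callable mapping the remaining-progress fraction ``frac``
    (1.0 at the first update, approaching 0.0 at the last) to an annealed
    value; illustrative of how the schedules above behave."""
    def schedule(frac):
        return frac * initial_value
    return schedule


# e.g. self.learning_rate = linear_schedule(2.5e-4) would start at 2.5e-4
# and decay towards zero over the course of training.
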
Example #15
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # Unpack
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs,
                        1)
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_batch + start) //
                                batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs //
                        self.n_steps, 1)
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_envs + start) //
                                envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            callback.on_training_end()
            return self
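
The schedule handling above follows the stable-baselines convention: get_schedule_fn turns a constant into a callable, and each schedule is called with frac, the fraction of training remaining, which shrinks from 1 toward 0 as updates progress. A self-contained sketch of that convention; make_schedule is an illustrative helper name, not part of the library:

def make_schedule(value):
    # Floats become constant schedules; callables are used as-is and receive
    # frac, the remaining-progress fraction in (0, 1].
    if callable(value):
        return value
    return lambda _frac: value

n_updates = 10
learning_rate = make_schedule(lambda frac: 2.5e-4 * frac)  # linear decay toward 0
cliprange = make_schedule(0.2)                             # constant clip range
for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates
    print(update, learning_rate(frac), cliprange(frac))
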
Example #16
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()

                # pytype:disable=bad-unpacking
                # true_reward is the reward without discount
                if isinstance(self.runner, PPO2Runner):
                    # We are using GAE
                    rollout = self.runner.run(callback)
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout
                else:
                    rollout = self.runner.run(callback)
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout
                # pytype:enable=bad-unpacking

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, returns, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)

        callback.on_training_end()
        return self
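
The double scan over tf.global_variables() in the snippet above is a TF1 idiom for initializing only the variables that a lazily built op introduces (here, the slot variables created by the K-FAC apply_gradients call). A stripped-down sketch of the same pattern, assuming TensorFlow 1.x:

import tensorflow as tf

def init_only_new_variables(sess, build_op_fn):
    # Record which variables are still uninitialized before building the op.
    tf_vars = tf.global_variables()
    initialized = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    old_uninit = {v for v, done in zip(tf_vars, initialized) if not done}

    op = build_op_fn()  # e.g. optimizer.apply_gradients(...) creating new slot variables

    # Initialize only the variables the build step introduced.
    tf_vars = tf.global_variables()
    initialized = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    new_uninit = [v for v, done in zip(tf_vars, initialized)
                  if not done and v not in old_uninit]
    if new_uninit:
        sess.run(tf.variables_initializer(new_uninit))
    return op
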
Example #17
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            runner = A2CRunner(self.env,
                               self,
                               n_steps=self.n_steps,
                               gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

                self.num_timesteps += self.n_batch + 1

            coord.request_stop()
            coord.join(enqueue_threads)

        return self
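
The callback convention used here (and in several of the other snippets) only stops training when the callback explicitly returns False; returning None keeps training, for backwards compatibility with callbacks that have no return statement. A tiny compatible callback, with illustrative names:

def stop_after_n_updates(n_max):
    state = {"calls": 0}

    def callback(_locals, _globals):
        state["calls"] += 1
        # Returning False (not None) asks the training loop to break.
        return state["calls"] < n_max

    return callback

# e.g. model.learn(total_timesteps=1000000, callback=stop_after_n_updates(50))
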
Example #18
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
                rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

                true_rewbuffer = None
                if self.using_gail:
                    true_rewbuffer = deque(maxlen=40)
                    #  Stats not used for now
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                    # if provide pretrained weight
                    if self.pretrained_weight is not None:
                        tf_util.load_state(
                            self.pretrained_weight,
                            var_list=tf_util.get_globals_vars("pi"),
                            sess=self.sess)

                while True:
                    if callback:
                        callback(locals(), globals())
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for _ in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                        vpredbefore = seg["vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            *lossbefore, grad = self.compute_lossandgrad(
                                *args, sess=self.sess)
                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("cg"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                            len(observation))
                        batch_size = len(observation) // self.d_step
                        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                                len(ob_batch))
                            # update running mean/std for reward_giver
                            if hasattr(self.reward_giver, "obs_rms"):
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        lrlocal = (seg["ep_lens"], seg["ep_rets"],
                                   seg["ep_true_rets"])  # local values
                        listoflrpairs = MPI.COMM_WORLD.allgather(
                            lrlocal)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*listoflrpairs))
                        true_rewbuffer.extend(true_rets)
                    else:
                        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                        listoflrpairs = MPI.COMM_WORLD.allgather(
                            lrlocal)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)

                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += seg["total_timestep"]
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
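
The inner for/else loop in the TRPO update above is a backtracking line search: it tries the full natural-gradient step and halves the step size until the surrogate objective improves without violating the KL constraint, falling back to the old parameters when no step is accepted. A generic sketch of that pattern, not the library implementation; eval_fn and the NumPy-array parameters are assumptions:

def backtracking_line_search(theta_before, fullstep, eval_fn, max_kl, max_backtracks=10):
    # eval_fn(theta) -> (surrogate, kl); theta_before and fullstep are NumPy arrays.
    surr_before, _ = eval_fn(theta_before)
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta_before + stepsize * fullstep
        surr, kl = eval_fn(theta_new)
        if surr > surr_before and kl <= 1.5 * max_kl:
            return theta_new  # step accepted
        stepsize *= 0.5       # shrink and retry
    return theta_before       # no acceptable step found; keep the old parameters
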
Example #19
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # runner = DistributedRunner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)

            ctx = multiprocessing.get_context('spawn')
            q = ctx.Queue()
            p = ctx.Process(
                target=runDRunner, kwargs={'examples_queue': q}
            )  #, 'env' : self.env, 'model' : self, 'n_steps' : self.n_steps, 'gamma' : self.gamma, 'lam':self.lam})
            p.start()
            print("STarted up queue from master")

            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            print("about to run...", n_updates, "updates and batch size",
                  self.n_batch)
            for update in range(1, n_updates + 1):
                print("In loop.")
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount

                # pull from queue
                print("Pulling from quee...")
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = q.get(
                    block=True)
                print("Got something!")

                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []

                #non-recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (self.noptepochs * self.n_batch +
                             epoch_num * self.n_batch + start) // batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mb_loss_vals.append(
                            self._train_step(lr_now,
                                             cliprange_now,
                                             *slices,
                                             writer=writer,
                                             update=timestep,
                                             cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                ## BROADCAST WEIGHTS

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
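
The distributed variant above pulls rollouts from a multiprocessing queue filled by a separate runner process (runDRunner, which is not shown). A minimal, self-contained sketch of that producer/consumer layout, with placeholder payloads standing in for real rollouts:

import multiprocessing

def run_rollout_worker(examples_queue):
    # Placeholder for the rollout worker: push batches into the shared queue.
    for batch_id in range(3):
        examples_queue.put({"batch": batch_id})

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    queue = ctx.Queue()
    proc = ctx.Process(target=run_rollout_worker, kwargs={"examples_queue": queue})
    proc.start()
    for _ in range(3):
        print("got", queue.get(block=True))  # blocks until the worker produces a batch
    proc.join()
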
Example #20
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="SIL_A2C"):
        with SetVerbosity(self.verbose), \
             TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:  # type: tf.summary.FileWriter
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            runner = SelfImitationA2CRunner(self.env,
                                            self,
                                            n_steps=self.n_steps,
                                            gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, true_reward, raw_rewards = runner.run()
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values, update,
                    writer)
                sil_loss, sil_adv, sil_samples, sil_nlogp = self._train_sil()
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        raw_rewards.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        update * (self.n_batch + 1))
                    summary = tf.Summary(value=[
                        tf.Summary.Value(
                            tag="episode_reward/best_reward",
                            simple_value=self.sil.get_best_reward())
                    ])
                    writer.add_summary(summary, update * (self.n_batch + 1))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    logger.record_tabular("best_episode_reward",
                                          float(self.sil.get_best_reward()))
                    if self.sil_update > 0:
                        logger.record_tabular("sil_num_episodes",
                                              float(self.sil.num_episodes()))
                        logger.record_tabular("sil_valid_samples",
                                              float(sil_samples))
                        logger.record_tabular("sil_steps",
                                              float(self.sil.num_steps()))
                    logger.dump_tabular()

                if update % (log_interval * 20) == 0:
                    self.save(writer.get_logdir())

        return self
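
The tf.Summary constructed above is the standard TF1 way to log a scalar (here the best self-imitation reward) that does not come out of a graph op. A minimal helper in the same spirit, assuming TensorFlow 1.x and an existing FileWriter:

import tensorflow as tf

def log_scalar(writer, tag, value, step):
    # Build a one-off Summary proto and write it at the given global step.
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, step)
    writer.flush()

# e.g. log_scalar(writer, "episode_reward/best_reward", best_reward, num_timesteps)
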
Example #21
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR"):
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            # FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope("kfac_apply", reuse=self.trained,
                                       custom_getter=tf_util.outer_scope_getter("kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

                    self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                                              if not f and v not in old_uninitialized_vars]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs,))

            t_start = time.time()
            coord = tf.train.Coordinator()
            enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, true_reward = runner.run()
                policy_loss, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                                           update, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, update * (self.n_batch + 1))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)

        return self
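
The Coordinator handling above is TF1 queue-runner boilerplate: start the enqueue threads, run the training loop, then request a stop and join the threads. A reduced sketch of that lifecycle, assuming TensorFlow 1.x; train_fn is a placeholder for the update loop:

import tensorflow as tf

def run_with_queue_runner(sess, q_runner, train_fn):
    coord = tf.train.Coordinator()
    # Start the enqueue threads, if the optimizer actually created a QueueRunner.
    threads = q_runner.create_threads(sess, coord=coord, start=True) if q_runner else []
    try:
        train_fn()
    finally:
        # Always shut the threads down cleanly, even on errors or early stopping.
        coord.request_stop()
        coord.join(threads)
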
Example #22
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
              eval_every_n=5, reset_num_timesteps=True, record_video=False, log_dir=""):
        
        # Model saving variables; the current iteration corresponds to the update number
        _savediter = 0
        _counter = (200//eval_every_n)
        
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch

            for update in range(1, nupdates + 1):
                # Run the update below; on keyboard interrupt, save the model and exit.
                try:

                    if update % eval_every_n == 1:
                        print("[RAISIM_GYM] Visualizing in RaiSimOgre")
                        obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \
                            runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4")
                        print("Average rewards in this test episode ", ep_infos[0]['r'])
                        model_name = log_dir + "_Iteration_{}".format(update-1)
                        self.save(model_name)
                        print("Saving model " + model_name)
                        # tensorboard_log(logger, ep_infos, self.sess)

                    assert self.n_batch % self.nminibatches == 0
                    batch_size = self.n_batch // self.nminibatches
                    t_start = time.time()
                    frac = 1.0 - (update - 1.0) / nupdates
                    lr_now = self.learning_rate(frac)
                    cliprangenow = self.cliprange(frac)
                    # true_reward is the reward without discount
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                    ep_info_buf.extend(ep_infos)
                    mb_loss_vals = []
                    if states is None:  # nonrecurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                        inds = np.arange(self.n_batch)
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(inds)
                            for start in range(0, self.n_batch, batch_size):
                                timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                                self.n_batch + start) // batch_size)
                                end = start + batch_size
                                mbinds = inds[start:end]
                                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                    update=timestep))
                        self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac
                    else:  # recurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                        assert self.n_envs % self.nminibatches == 0
                        env_indices = np.arange(self.n_envs)
                        flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                        envs_per_batch = batch_size // self.n_steps
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(env_indices)
                            for start in range(0, self.n_envs, envs_per_batch):
                                timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                                self.n_envs + start) // envs_per_batch)
                                end = start + envs_per_batch
                                mb_env_inds = env_indices[start:end]
                                mb_flat_inds = flat_indices[mb_env_inds].ravel()
                                slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                mb_states = states[mb_env_inds]
                                mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                                    writer=writer, states=mb_states))
                        self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac

                    loss_vals = np.mean(mb_loss_vals, axis=0)
                    t_now = time.time()
                    fps = int(self.n_batch / (t_now - t_start))

                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                        true_reward.reshape((self.n_envs, self.n_steps)),
                                                                        masks.reshape((self.n_envs, self.n_steps)),
                                                                        writer, self.num_timesteps)

                    # Verbose >= 1 just means the logger output is printed to the terminal.
                    if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                        explained_var = explained_variance(values, returns)
                        logger.logkv("serial_timesteps", update * self.n_steps)
                        logger.logkv("nupdates", update)
                        logger.logkv("total_timesteps", self.num_timesteps)
                        logger.logkv("fps", fps)
                        logger.logkv("explained_variance", float(explained_var))
                        if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                            logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                            logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                        logger.logkv('time_elapsed', t_start - t_first_start)
                        for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                            logger.logkv(loss_name, loss_val)
                        logger.dumpkvs()

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                        
                except KeyboardInterrupt:
                    print("You have stopped the learning process by keyboard interrupt. Model Parameter is saved. \n")
                    # You can actually save files using the instance of self. save the model parameters. 
                    self.save(log_dir + "_Iteration_{}".format(update))
                    sys.exit()

            return self
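
The outer try/except in this snippet lets Ctrl-C save a checkpoint before exiting, while the update % eval_every_n == 1 check interleaves evaluation and model saving with training. A generic sketch of that control flow; evaluate, train_one_update and save are placeholders, not the RaiSim-specific calls:

import sys

def training_loop(n_updates, eval_every_n, evaluate, train_one_update, save):
    for update in range(1, n_updates + 1):
        try:
            if update % eval_every_n == 1:
                evaluate(update)                         # e.g. test rollout, record a video
                save("iteration_{}".format(update - 1))  # checkpoint before training on
            train_one_update(update)
        except KeyboardInterrupt:
            # Save once more so an interrupted run can be resumed later.
            save("iteration_{}".format(update))
            sys.exit()
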
Example #23
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1"):
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            with self.sess.as_default():
                self.adam.sync()

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                self.episode_reward = np.zeros((self.n_envs,))

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" % iters_so_far)

                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                          seg["true_rew"].reshape((self.n_envs, -1)),
                                                                          seg["dones"].reshape((self.n_envs, -1)),
                                                                          writer, timesteps_so_far)

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                      shuffle=not issubclass(self.policy, LstmPolicy))
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(dataset.iterate_once(optim_batchsize)):
                            steps = (timesteps_so_far +
                                     k * optim_batchsize +
                                     int(i * (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if (1 + k) % 10 == 0:
                                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                                    run_metadata = tf.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                                 batch["atarg"], batch["vtarg"],
                                                                                 cur_lrmult, sess=self.sess,
                                                                                 options=run_options,
                                                                                 run_metadata=run_metadata)
                                    writer.add_run_metadata(run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                                 batch["atarg"], batch["vtarg"],
                                                                                 cur_lrmult, sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                       batch["atarg"], batch["vtarg"], cur_lrmult,
                                                                       sess=self.sess)

                            self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                        batch["vtarg"], cur_lrmult, sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

        return self

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):

        # def decide_next_skip(prob_of_down, up, down):
        #     r = np.random.random_sample()
        #     if r > prob_of_down:
        #         current_skip_num = up
        #     else:
        #         current_skip_num = down
        #     return current_skip_num
        #
        # memory_size_threshold = 800000
        # current_non_skipped = 0
        total_num_dumped = 0

        # if total_timesteps > memory_size_threshold:
        #     down_sample_fraction = (total_timesteps - memory_size_threshold)/total_timesteps
        #     down = int(1 / down_sample_fraction)
        #     up = down + 1
        #     prob_of_down = (down_sample_fraction - 1 / up) * up * down
        #
        #
        #     current_skip_num = decide_next_skip(prob_of_down, up, down)

        ep_full_infos = []
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            if self.run_info is not None:
                vf_flat_params = self.get_vf_flat()
                pi_flat_params = self.get_pi_flat()
                self.dump(vf_flat_params, "vf_start")
                self.dump(pi_flat_params, "pi_start")

            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = int(total_timesteps // self.n_batch)
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()

                # d = {"obs":obs, "returns":returns, "masks":masks, "actions":actions,
                #      "values":values, "neglogpacs":neglogpacs, "states":states,
                #      "ep_infos":ep_infos, "true_reward":true_reward}
                # with open("a.test", "w") as fp:
                #     json.dump(d, fp)

                ep_info_buf.extend(ep_infos)
                ep_full_infos.extend(ep_infos)

                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            train_result = self._train_step(lr_now,
                                                            cliprangenow,
                                                            *slices,
                                                            writer=writer,
                                                            update=timestep)

                            mb_loss_vals.append(train_result[:-1])

                            if self.run_info is not None:
                                # if total_timesteps > memory_size_threshold and current_non_skipped >= current_skip_num:
                                #     current_skip_num = decide_next_skip(prob_of_down, up, down)
                                #     current_non_skipped = 0
                                # else:
                                grads = train_result[-1]

                                vf_flat_params = self.get_vf_flat()
                                pi_flat_params = self.get_pi_flat()
                                # self.dump(vf_flat_params, "vf_all_params")
                                # self.dump(pi_flat_params, "pi_all_params")

                                # self.dump(grads, "grads")
                                # current_non_skipped += 1
                                total_num_dumped += 1

                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                            if self.run_info is not None:
                                raise NotImplementedError()
                                flat_params = self.get_flat()
                                self.dump(flat_params, 0)

                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            if self.run_info is not None:
                vf_flat_params = self.get_vf_flat()
                pi_flat_params = self.get_pi_flat()
                self.dump(vf_flat_params, "vf_final")
                self.dump(pi_flat_params, "pi_final")

                self.dump([total_num_dumped], "total_num_dumped")
            return [ep_info['r'] for ep_info in ep_full_infos]
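The nonrecurrent branch above reshuffles the whole rollout once per optimization epoch and then walks it in contiguous minibatch slices. A minimal standalone sketch of that indexing (the function and variable names are illustrative, not taken from stable-baselines):

import numpy as np

def iterate_minibatches(arrays, n_epochs, n_minibatches, rng=None):
    """Yield shuffled minibatch slices of a flat rollout.

    `arrays` is a tuple of arrays sharing the same first dimension
    (the rollout length, i.e. n_batch above); the index permutation
    is redrawn once per optimization epoch.
    """
    rng = np.random if rng is None else rng
    n_batch = arrays[0].shape[0]
    assert n_batch % n_minibatches == 0
    batch_size = n_batch // n_minibatches
    inds = np.arange(n_batch)
    for _ in range(n_epochs):
        rng.shuffle(inds)
        for start in range(0, n_batch, batch_size):
            mb_inds = inds[start:start + batch_size]
            yield tuple(arr[mb_inds] for arr in arrays)

# Usage: each yielded tuple corresponds to one _train_step call.
obs = np.random.randn(8, 3)
returns = np.random.randn(8)
for mb_obs, mb_returns in iterate_minibatches((obs, returns), n_epochs=2, n_minibatches=4):
    pass  # the gradient step on (mb_obs, mb_returns) would go here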
Example #25
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # unpack
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout
                callback.update_locals(locals())
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
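Both the A2C and PPO snippets log explained_variance as a quick health check on the value function. A minimal re-implementation sketch following the usual definition, 1 - Var[y_true - y_pred] / Var[y_true] (the helper name here is illustrative, not the library's):

import numpy as np

def explained_variance_sketch(y_pred, y_true):
    """1 means the value function predicts the returns perfectly, 0 means it is
    no better than a constant, and negative values are worse than a constant."""
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y

values = np.array([0.9, 1.8, 3.2])
rewards = np.array([1.0, 2.0, 3.0])
print(explained_variance_sketch(values, rewards))  # close to 1.0 here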
Example #26
0
    def learn(self, total_timesteps, log_dir, logger,
              callback=None, log_interval=1, tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)

            n_updates = total_timesteps // self.n_batch
            
            for update in range(1, n_updates + 1):
                # Run the whole update inside a try block; on keyboard interrupt, save the model and exit.
                try:
                    assert self.n_batch % self.nminibatches == 0
                    batch_size = self.n_batch // self.nminibatches
                    frac = 1.0 - (update - 1.0) / n_updates
                    lr_now = self.learning_rate(frac)
                    cliprange_now = self.cliprange(frac)
                    cliprange_vf_now = cliprange_vf(frac)

                    t_start = time.time()
                    # Unpack
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                    # Added by Yunlong: compute FPS from the rollout time alone
                    t_now = time.time()
                    fps = int(self.n_batch / (t_now - t_start))

                    self.ep_info_buf.extend(ep_infos)
                    mb_loss_vals = []
                    if states is None:  # nonrecurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                        inds = np.arange(self.n_batch)
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(inds)
                            for start in range(0, self.n_batch, batch_size):
                                timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                                self.n_batch + start) // batch_size)
                                end = start + batch_size
                                mbinds = inds[start:end]
                                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                                    update=timestep, cliprange_vf=cliprange_vf_now))
                    else:  # recurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                        assert self.n_envs % self.nminibatches == 0
                        env_indices = np.arange(self.n_envs)
                        flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                        envs_per_batch = batch_size // self.n_steps
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(env_indices)
                            for start in range(0, self.n_envs, envs_per_batch):
                                timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                                self.n_envs + start) // envs_per_batch)
                                end = start + envs_per_batch
                                mb_env_inds = env_indices[start:end]
                                mb_flat_inds = flat_indices[mb_env_inds].ravel()
                                slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                mb_states = states[mb_env_inds]
                                mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                                    writer=writer, states=mb_states,
                                                                    cliprange_vf=cliprange_vf_now))

                    loss_vals = np.mean(mb_loss_vals, axis=0)
                    # Commented out by Yunlong: FPS is now computed right after the rollout above
                    # t_now = time.time()
                    # fps = int(self.n_batch / (t_now - t_start))

                    if writer is not None:
                        total_episode_reward_logger(self.episode_reward,
                            true_reward.reshape((self.n_envs, self.n_steps)),
                            masks.reshape((self.n_envs, self.n_steps)),
                            writer, self.num_timesteps)

                    if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                        explained_var = explained_variance(values, returns)
                        logger.logkv("serial_timesteps", update * self.n_steps)
                        logger.logkv("n_updates", update)
                        logger.logkv("total_timesteps", self.num_timesteps)
                        logger.logkv("fps", fps)
                        logger.logkv("explained_variance", float(explained_var))
                        if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                            logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                            logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                        logger.logkv('time_elapsed', t_start - t_first_start)
                        logger.logkv('true_reward', np.mean(true_reward))
                        for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                            logger.logkv(loss_name, loss_val)
                        logger.dumpkvs()
                    
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                except KeyboardInterrupt:
                    print("You have stopped the learning process by keyboard interrupt. Model Parameter is saved. \n")
                    # You can actually save files using the instance of self. save the model parameters. 
                    self.save(log_dir + "_Iteration_{}".format(update))
                    sys.exit()
            return self
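The PPO2 snippets drive both the learning rate and the clip range from frac = 1 - (update - 1) / n_updates, i.e. the fraction of training still remaining. A rough sketch of what get_schedule_fn provides (plain floats become constant schedules, callables are used as-is and receive frac); the linearly decaying learning rate below is only an example input, not a library default:

def get_schedule_fn_sketch(value):
    """Scalars become constant schedules; callables are returned unchanged and
    receive the remaining-progress fraction frac (1.0 at the start, ~0.0 at the end)."""
    if callable(value):
        return value
    return lambda _frac: value

learning_rate = get_schedule_fn_sketch(lambda frac: frac * 2.5e-4)  # linear decay
cliprange = get_schedule_fn_sketch(0.2)                             # constant clip range

n_updates = 10
for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates
    lr_now, clip_now = learning_rate(frac), cliprange(frac)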
Example #27
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="TRPO",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(
                    maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(
                    maxlen=40)  # rolling buffer for episode rewards
                self.episode_reward = np.zeros((self.n_envs, ))

                true_reward_buffer = None
                if self.using_gail:
                    true_reward_buffer = deque(maxlen=40)

                    # Initialize dataloader
                    batchsize = self.timesteps_per_batch // self.d_step
                    self.expert_dataset.init_dataloader(batchsize)

                    #  Stats not used for now
                    # TODO: replace with normal tb logging
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action, atarg, tdlamret = seg["ob"], seg[
                            "ac"], seg["adv"], seg["tdlamret"]
                        vpredbefore = seg[
                            "vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / atarg.std(
                        )  # standardized advantage function estimate

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, seg["true_rew"].reshape(
                                    (self.n_envs, -1)), seg["dones"].reshape(
                                        (self.n_envs, -1)), writer,
                                self.num_timesteps)

                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata(
                            ) if self.full_tensorboard_log else None
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)
                                if self.full_tensorboard_log:
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)

                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("conjugate_gradient"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128,
                                        shuffle=True):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "explained_variance_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        assert len(observation) == self.timesteps_per_batch
                        batch_size = self.timesteps_per_batch // self.d_step

                        # NOTE: uses only the last g step for observation
                        d_losses = [
                        ]  # list of tuples, each of which gives the loss for a minibatch
                        # NOTE: for recurrent policies, use shuffle=False?
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size,
                                shuffle=True):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                            )
                            # update running mean/std for reward_giver
                            if self.reward_giver.normalize:
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))

                            # Reshape actions if needed when using discrete actions
                            if isinstance(self.action_space,
                                          gym.spaces.Discrete):
                                if len(ac_batch.shape) == 2:
                                    ac_batch = ac_batch[:, 0]
                                if len(ac_expert.shape) == 2:
                                    ac_expert = ac_expert[:, 0]
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"],
                                    seg["ep_true_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*list_lr_pairs))
                        true_reward_buffer.extend(true_rets)
                    else:
                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"]
                                    )  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
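In the TRPO snippet, add_vtarg_and_adv fills seg["adv"] and seg["tdlamret"] with GAE(lambda) estimates before the advantage is standardized. A minimal standalone sketch of that backward recursion; here dones[t] is taken to mean "the episode ended at step t", which differs slightly from the segment-generator's convention, so treat it as illustrative rather than a drop-in replacement:

import numpy as np

def gae_sketch(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Return (advantages, td_lambda_returns) for one flat rollout."""
    values = np.asarray(values, dtype=np.float64)
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float64)
    last_gae = 0.0
    for t in reversed(range(n)):
        next_value = last_value if t == n - 1 else values[t + 1]
        nonterminal = 1.0 - float(dones[t])   # do not bootstrap across an episode end
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        advantages[t] = last_gae
    return advantages, advantages + values    # tdlamret = advantage + value baseline

adv, vtarg = gae_sketch(rewards=np.ones(5), values=np.zeros(5),
                        dones=[0, 0, 0, 0, 1], last_value=0.0)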
Example #28
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2"):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)
            # At this point self.n_steps = 256
            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)

            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()
            n_timesteps = 0
            # nupdates = total_timesteps // self.n_batch
            for timestep in range(1, total_timesteps + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - timestep / total_timesteps
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                self.n_steps = len(true_reward)
                n_timesteps += len(obs)
                #print("len(obs): {}, np.shape(obs): {}, np.shape(rewards): {}".format(len(obs), np.shape(obs), np.shape(returns)))
                #print("np.shape(actions): {}, np.shape(rewards): {}".format(np.shape(actions), np.shape(values)))
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):  # repeat for noptepochs epochs
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):  # split the n_batch samples into minibatches of size batch_size and pass each to _train_step
                            # timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                            #             batch_size)
                            end = start + batch_size
                            #print("batch_size: {}".format(batch_size))
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                 update=n_timesteps))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                            #             envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps,
                                                                 writer=writer, states=mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, n_timesteps)

                if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("total_timesteps", n_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                if n_timesteps > total_timesteps:
                    break

            return self
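The recurrent branches in these snippets cannot shuffle individual timesteps, because the LSTM states are stored per environment and each minibatch must keep whole, contiguous per-environment sequences; that is what the flat_indices reshape-and-ravel does. A small NumPy sketch of the same indexing, assuming the rollout arrays are stored env-major (as after the runner's swap-and-flatten step):

import numpy as np

n_envs, n_steps, envs_per_batch = 4, 3, 2
obs = np.arange(n_envs * n_steps)          # stands in for one flat rollout array

env_indices = np.arange(n_envs)
flat_indices = np.arange(n_envs * n_steps).reshape(n_envs, n_steps)

np.random.shuffle(env_indices)             # shuffle environments, not timesteps
for start in range(0, n_envs, envs_per_batch):
    mb_env_inds = env_indices[start:start + envs_per_batch]
    mb_flat_inds = flat_indices[mb_env_inds].ravel()   # whole sequences, kept in order
    mb_obs = obs[mb_flat_inds]             # plus states[mb_env_inds] in the real loop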