Example #1
    def fetch_metrics_from_workers(self):
        episode_rewards = []
        episode_lengths = []
        metric_lists = [
            a.get_completed_rollout_metrics.remote() for a in self.agents
        ]
        for metrics in metric_lists:
            for episode in ray.get(metrics):
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        res = TrainingResult(self.iteration, np.mean(episode_rewards),
                             np.mean(episode_lengths))
        return res
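The metrics objects consumed above only need the two attributes the driver reads, episode_length and episode_reward. The sketch below shows a hypothetical actor-side counterpart under that assumption; the CompletedRollout record and RolloutWorker class are illustrative, not the actual Ray source.

import collections

import ray

# Illustrative record carrying just the two fields the driver reads above.
CompletedRollout = collections.namedtuple(
    "CompletedRollout", ["episode_length", "episode_reward"])


@ray.remote
class RolloutWorker(object):
    # Hypothetical worker exposing get_completed_rollout_metrics.
    def __init__(self):
        self._completed = []

    def record_episode(self, length, reward):
        # Called by the worker's rollout loop when an episode finishes.
        self._completed.append(CompletedRollout(length, reward))

    def get_completed_rollout_metrics(self):
        # Return and clear the episodes finished since the last call.
        completed = self._completed
        self._completed = []
        return completed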
Example #2
    def fetch_metrics_from_workers(self):
        episode_rewards = []
        episode_lengths = []
        metric_lists = [
            a.get_completed_rollout_metrics.remote() for a in self.agents
        ]
        for metrics in metric_lists:
            for episode in ray.get(metrics):
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        avg_reward = np.mean(episode_rewards) if episode_rewards else None
        avg_length = np.mean(episode_lengths) if episode_lengths else None
        res = TrainingResult(self.experiment_id.hex, self.iteration,
                             avg_reward, avg_length, dict())
        return res
Example #3
File: a3c.py  Project: pchalasani/ray
    def _fetch_metrics_from_workers(self):
        episode_rewards = []
        episode_lengths = []
        metric_lists = [
            a.get_completed_rollout_metrics.remote() for a in self.agents
        ]
        for metrics in metric_lists:
            for episode in ray.get(metrics):
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)
        avg_reward = (np.mean(episode_rewards)
                      if episode_rewards else float('nan'))
        avg_length = (np.mean(episode_lengths)
                      if episode_lengths else float('nan'))
        timesteps = np.sum(episode_lengths) if episode_lengths else 0

        result = TrainingResult(episode_reward_mean=avg_reward,
                                episode_len_mean=avg_length,
                                timesteps_this_iter=timesteps,
                                info={})

        return result
Example #4
    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.dqn_graph.act(
                self.sess,
                np.array(self.obs)[None],
                self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = (
                        self.replay_buffer.sample(config["batch_size"]))
                    batch_idxes = None
                td_errors = self.dqn_graph.train(self.sess, obses_t, actions,
                                                 rewards, obses_tp1, dones,
                                                 np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = np.abs(td_errors) + (
                        config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if self.num_timesteps > config["learning_starts"] and (
                    self.num_timesteps % config["target_network_update_freq"]
                    == 0):
                # Update target network periodically.
                self.dqn_graph.update_target(self.sess)

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration":
            int(100 * self.exploration.value(self.num_timesteps))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("buffer_size", len(self.replay_buffer))
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(self.num_timesteps)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
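The self.exploration.value(self.num_timesteps) calls above assume an annealing schedule object with a value(t) method. A minimal sketch of such a schedule, assumed here for illustration rather than taken from this agent's code, is a linear interpolation:

class LinearSchedule(object):
    # Minimal sketch of a linearly annealed schedule, assumed for illustration.
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Interpolate from initial_p to final_p over schedule_timesteps steps,
        # then hold at final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

For example, LinearSchedule(10000, 0.02).value(5000) returns 0.51, i.e. roughly 51% exploration halfway through the schedule.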
Example #5
    def _train(self):
        config = self.config
        sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
        iter_init_timesteps = self.cur_timestep

        num_loop_iters = 0
        steps_per_iter = config["sample_batch_size"] * len(self.workers)
        while (self.cur_timestep - iter_init_timesteps <
               config["timesteps_per_iteration"]):
            dt = time.time()
            ray.get([
                w.do_steps.remote(config["sample_batch_size"],
                                  self.cur_timestep) for w in self.workers
            ])
            num_loop_iters += 1
            self.cur_timestep += steps_per_iter
            self.steps_since_update += steps_per_iter
            sample_time += time.time() - dt

            if self.cur_timestep > config["learning_starts"]:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                self._update_worker_weights()
                sync_time += (time.time() - dt)
                dt = time.time()
                gradients = ray.get([
                    w.get_gradient.remote(self.cur_timestep)
                    for w in self.workers
                ])
                learn_time += (time.time() - dt)
                dt = time.time()
                for grad in gradients:
                    self.actor.apply_gradients(grad)
                apply_time += (time.time() - dt)

            if (self.cur_timestep > config["learning_starts"]
                    and self.steps_since_update >
                    config["target_network_update_freq"]):
                self.actor.dqn_graph.update_target(self.actor.sess)
                # Update target network periodically.
                self._update_worker_weights()
                self.steps_since_update -= config["target_network_update_freq"]
                self.num_target_updates += 1

        mean_100ep_reward = 0.0
        mean_100ep_length = 0.0
        num_episodes = 0
        buffer_size_sum = 0
        for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
            mean_100ep_reward += mean_rew
            mean_100ep_length += mean_len
            num_episodes += episodes
            buffer_size_sum += buf_sz
        mean_100ep_reward /= len(self.workers)
        mean_100ep_length /= len(self.workers)

        info = [
            ("mean_100ep_reward", mean_100ep_reward),
            ("exploration_frac", exploration),
            ("steps", self.cur_timestep),
            ("episodes", num_episodes),
            ("buffer_sizes_sum", buffer_size_sum),
            ("target_updates", self.num_target_updates),
            ("sample_time", sample_time),
            ("weight_sync_time", sync_time),
            ("apply_time", apply_time),
            ("learn_time", learn_time),
            ("samples_per_s",
             num_loop_iters * np.float64(steps_per_iter) / sample_time),
            ("learn_samples_per_s",
             num_loop_iters * np.float64(config["train_batch_size"]) *
             np.float64(config["num_workers"]) / learn_time),
        ]

        for k, v in info:
            logger.record_tabular(k, v)
        logger.dump_tabular()

        result = TrainingResult(episode_reward_mean=mean_100ep_reward,
                                episode_len_mean=mean_100ep_length,
                                timesteps_this_iter=self.cur_timestep -
                                iter_init_timesteps,
                                info=info)

        return result
Example #6
File: es.py  Project: techscientist/ray-1
    def _train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_trainable_flat()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results = self._collect_results(theta_id, config["episodes_per_batch"],
                                        config["timesteps_per_batch"])

        curr_task_results = []
        ob_count_this_batch = 0
        # Loop over the results
        for result in results:
            assert result.eval_length is None, "We aren't doing eval rollouts."
            assert result.noise_inds_n.ndim == 1
            assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
            assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
            assert result.returns_n2.dtype == np.float32

            result_num_eps = result.lengths_n2.size
            result_num_timesteps = result.lengths_n2.sum()
            self.episodes_so_far += result_num_eps
            self.timesteps_so_far += result_num_timesteps

            curr_task_results.append(result)
            # Update ob stats.
            if self.policy.needs_ob_stat and result.ob_count > 0:
                self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                       result.ob_count)
                ob_count_this_batch += result.ob_count

        # Assemble the results.
        noise_inds_n = np.concatenate(
            [r.noise_inds_n for r in curr_task_results])
        returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
        lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
        assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
                lengths_n2.shape[0])
        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (self.noise.get(idx, self.policy.num_params)
             for idx in noise_inds_n),
            batch_size=500)
        g /= returns_n2.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

        # Update ob stat (we're never running the policy in the master, but we
        # might be snapshotting the policy).
        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

        step_tend = time.time()
        tlogger.record_tabular("EpRewMean", returns_n2.mean())
        tlogger.record_tabular("EpRewStd", returns_n2.std())
        tlogger.record_tabular("EpLenMean", lengths_n2.mean())

        tlogger.record_tabular(
            "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
        tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))

        tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
        tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
        tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

        tlogger.record_tabular("ObCount", ob_count_this_batch)

        tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
        tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": lengths_n2.size,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": lengths_n2.sum(),
            "timesteps_so_far": self.timesteps_so_far,
            "ob_count": ob_count_this_batch,
            "time_elapsed_this_iter": step_tend - step_tstart,
            "time_elapsed": step_tend - self.tstart
        }

        result = TrainingResult(episode_reward_mean=returns_n2.mean(),
                                episode_len_mean=lengths_n2.mean(),
                                timesteps_this_iter=lengths_n2.sum(),
                                info=info)

        return result
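Both ES examples (this one and Example #11 below) pass the raw returns through utils.compute_centered_ranks before forming the gradient estimate. A re-implementation sketch of that transform, written here for illustration rather than copied from Ray's utils module, looks like this:

import numpy as np

def centered_ranks_sketch(x):
    # Rank-transform the flattened array, then rescale the ranks to
    # [-0.5, 0.5] so the update is invariant to the scale of the raw returns.
    flat = x.ravel()
    ranks = np.empty(flat.size, dtype=np.float32)
    ranks[flat.argsort()] = np.arange(flat.size, dtype=np.float32)
    return (ranks / (flat.size - 1) - 0.5).reshape(x.shape)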
Example #7
    def _train(self):
        agents = self.agents
        config = self.config
        model = self.model

        print("===> iteration", self.iteration)

        iter_start = time.time()
        weights = ray.put(model.get_weights())
        [a.load_weights.remote(weights) for a in agents]
        trajectory, total_reward, traj_len_mean = collect_samples(
            agents, config, self.model.observation_filter,
            self.model.reward_filter)
        print("total reward is ", total_reward)
        print("trajectory length mean is ", traj_len_mean)
        print("timesteps:", trajectory["dones"].shape[0])
        if self.file_writer:
            traj_stats = tf.Summary(value=[
                tf.Summary.Value(tag="ppo/rollouts/mean_reward",
                                 simple_value=total_reward),
                tf.Summary.Value(tag="ppo/rollouts/traj_len_mean",
                                 simple_value=traj_len_mean)
            ])
            self.file_writer.add_summary(traj_stats, self.global_step)
        self.global_step += 1

        def standardized(value):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            return (value - value.mean()) / max(1e-4, value.std())

        if config["use_gae"]:
            trajectory["advantages"] = standardized(trajectory["advantages"])
        else:
            trajectory["returns"] = standardized(trajectory["returns"])

        rollouts_end = time.time()
        print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
              ", stepsize=" + str(config["sgd_stepsize"]) + "):")
        names = [
            "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"
        ]
        print(("{:>15}" * len(names)).format(*names))
        trajectory = shuffle(trajectory)
        shuffle_end = time.time()
        tuples_per_device = model.load_data(
            trajectory, self.iteration == 0 and config["full_trace_data_load"])
        load_end = time.time()
        rollouts_time = rollouts_end - iter_start
        shuffle_time = shuffle_end - rollouts_end
        load_time = load_end - shuffle_end
        sgd_time = 0
        for i in range(config["num_sgd_iter"]):
            sgd_start = time.time()
            batch_index = 0
            num_batches = (int(tuples_per_device) //
                           int(model.per_device_batch_size))
            loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
            permutation = np.random.permutation(num_batches)
            # Prepare to drop into the debugger
            if self.iteration == config["tf_debug_iteration"]:
                model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
            while batch_index < num_batches:
                full_trace = (i == 0 and self.iteration == 0 and batch_index
                              == config["full_trace_nth_sgd_batch"])
                batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                    batch_entropy = model.run_sgd_minibatch(
                        permutation[batch_index] * model.per_device_batch_size,
                        self.kl_coeff, full_trace,
                        self.file_writer)
                loss.append(batch_loss)
                policy_loss.append(batch_policy_loss)
                vf_loss.append(batch_vf_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
            policy_loss = np.mean(policy_loss)
            vf_loss = np.mean(vf_loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, policy_loss, vf_loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
                metric_prefix = "ppo/sgd/final_iter/"
                values.append(
                    tf.Summary.Value(tag=metric_prefix + "kl_coeff",
                                     simple_value=self.kl_coeff))
                values.extend([
                    tf.Summary.Value(tag=metric_prefix + "mean_entropy",
                                     simple_value=entropy),
                    tf.Summary.Value(tag=metric_prefix + "mean_loss",
                                     simple_value=loss),
                    tf.Summary.Value(tag=metric_prefix + "mean_kl",
                                     simple_value=kl)
                ])
                if self.file_writer:
                    sgd_stats = tf.Summary(value=values)
                    self.file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
            sgd_time += sgd_end - sgd_start
        if kl > 2.0 * config["kl_target"]:
            self.kl_coeff *= 1.5
        elif kl < 0.5 * config["kl_target"]:
            self.kl_coeff *= 0.5

        info = {
            "kl_divergence": kl,
            "kl_coefficient": self.kl_coeff,
            "rollouts_time": rollouts_time,
            "shuffle_time": shuffle_time,
            "load_time": load_time,
            "sgd_time": sgd_time,
            "sample_throughput": len(trajectory["observations"]) / sgd_time
        }

        print("kl div:", kl)
        print("kl coeff:", self.kl_coeff)
        print("rollouts time:", rollouts_time)
        print("shuffle time:", shuffle_time)
        print("load time:", load_time)
        print("sgd time:", sgd_time)
        print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
        print("total time so far:", time.time() - self.start_time)

        result = TrainingResult(
            episode_reward_mean=total_reward,
            episode_len_mean=traj_len_mean,
            timesteps_this_iter=trajectory["dones"].shape[0],
            info=info)

        return result
Example #8
    def train(self):
        agents = self.agents
        config = self.config
        model = self.model
        j = self.j
        self.j += 1

        print("===> iteration", self.j)

        saver = tf.train.Saver(max_to_keep=None)
        if "load_checkpoint" in config:
            saver.restore(model.sess, config["load_checkpoint"])

        # TF does not support writing logs to S3 at the moment
        write_tf_logs = config["write_logs"] and self.logdir.startswith("file")
        iter_start = time.time()
        if write_tf_logs:
            file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
            if config["model_checkpoint_file"]:
                checkpoint_path = saver.save(
                    model.sess,
                    os.path.join(self.logdir,
                                 config["model_checkpoint_file"] % j))
                print("Checkpoint saved in file: %s" % checkpoint_path)
        checkpointing_end = time.time()
        weights = ray.put(model.get_weights())
        [a.load_weights.remote(weights) for a in agents]
        trajectory, total_reward, traj_len_mean = collect_samples(
            agents, config)
        print("total reward is ", total_reward)
        print("trajectory length mean is ", traj_len_mean)
        print("timesteps:", trajectory["dones"].shape[0])
        if write_tf_logs:
            traj_stats = tf.Summary(value=[
                tf.Summary.Value(tag="policy_gradient/rollouts/mean_reward",
                                 simple_value=total_reward),
                tf.Summary.Value(tag="policy_gradient/rollouts/traj_len_mean",
                                 simple_value=traj_len_mean)
            ])
            file_writer.add_summary(traj_stats, self.global_step)
        self.global_step += 1
        trajectory["advantages"] = (
            (trajectory["advantages"] - trajectory["advantages"].mean()) /
            trajectory["advantages"].std())
        rollouts_end = time.time()
        print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
              ", stepsize=" + str(config["sgd_stepsize"]) + "):")
        names = ["iter", "loss", "kl", "entropy"]
        print(("{:>15}" * len(names)).format(*names))
        trajectory = shuffle(trajectory)
        shuffle_end = time.time()
        tuples_per_device = model.load_data(
            trajectory, j == 0 and config["full_trace_data_load"])
        load_end = time.time()
        checkpointing_time = checkpointing_end - iter_start
        rollouts_time = rollouts_end - checkpointing_end
        shuffle_time = shuffle_end - rollouts_end
        load_time = load_end - shuffle_end
        sgd_time = 0
        for i in range(config["num_sgd_iter"]):
            sgd_start = time.time()
            batch_index = 0
            num_batches = (int(tuples_per_device) //
                           int(model.per_device_batch_size))
            loss, kl, entropy = [], [], []
            permutation = np.random.permutation(num_batches)
            while batch_index < num_batches:
                full_trace = (i == 0 and j == 0 and batch_index
                              == config["full_trace_nth_sgd_batch"])
                batch_loss, batch_kl, batch_entropy = model.run_sgd_minibatch(
                    permutation[batch_index] * model.per_device_batch_size,
                    self.kl_coeff, full_trace,
                    file_writer if write_tf_logs else None)
                loss.append(batch_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
                metric_prefix = "policy_gradient/sgd/final_iter/"
                values.append(
                    tf.Summary.Value(tag=metric_prefix + "kl_coeff",
                                     simple_value=self.kl_coeff))
            else:
                metric_prefix = "policy_gradient/sgd/intermediate_iters/"
            values.extend([
                tf.Summary.Value(tag=metric_prefix + "mean_entropy",
                                 simple_value=entropy),
                tf.Summary.Value(tag=metric_prefix + "mean_loss",
                                 simple_value=loss),
                tf.Summary.Value(tag=metric_prefix + "mean_kl",
                                 simple_value=kl)
            ])
            if write_tf_logs:
                sgd_stats = tf.Summary(value=values)
                file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
            sgd_time += sgd_end - sgd_start
        if kl > 2.0 * config["kl_target"]:
            self.kl_coeff *= 1.5
        elif kl < 0.5 * config["kl_target"]:
            self.kl_coeff *= 0.5

        info = {
            "kl_divergence": kl,
            "kl_coefficient": self.kl_coeff,
            "checkpointing_time": checkpointing_time,
            "rollouts_time": rollouts_time,
            "shuffle_time": shuffle_time,
            "load_time": load_time,
            "sgd_time": sgd_time,
            "sample_throughput": len(trajectory["observations"]) / sgd_time
        }

        print("kl div:", kl)
        print("kl coeff:", self.kl_coeff)
        print("checkpointing time:", checkpointing_time)
        print("rollouts time:", rollouts_time)
        print("shuffle time:", shuffle_time)
        print("load time:", load_time)
        print("sgd time:", sgd_time)
        print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
        print("total time so far:", time.time() - self.start_time)

        result = TrainingResult(self.experiment_id.hex, j, total_reward,
                                traj_len_mean, info)

        return result
Example #9
  def train(self):
    agents = self.agents
    config = self.config
    model = self.model
    j = self.j
    self.j += 1

    saver = tf.train.Saver(max_to_keep=None)
    if "load_checkpoint" in config:
      saver.restore(model.sess, config["load_checkpoint"])

    file_writer = tf.summary.FileWriter(
        "{}/trpo_{}_{}".format(
            config["tensorboard_log_dir"], self.env_name,
            str(datetime.today()).replace(" ", "_")),
        model.sess.graph)
    iter_start = time.time()
    if config["model_checkpoint_file"]:
      checkpoint_path = saver.save(
          model.sess, config["model_checkpoint_file"] % j)
      print("Checkpoint saved in file: %s" % checkpoint_path)
    checkpointing_end = time.time()
    weights = ray.put(model.get_weights())
    [a.load_weights.remote(weights) for a in agents]
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    traj_stats = tf.Summary(value=[
        tf.Summary.Value(
            tag="policy_gradient/rollouts/mean_reward",
            simple_value=total_reward),
        tf.Summary.Value(
            tag="policy_gradient/rollouts/traj_len_mean",
            simple_value=traj_len_mean)])
    file_writer.add_summary(traj_stats, self.global_step)
    self.global_step += 1
    trajectory["advantages"] = ((trajectory["advantages"] -
                                 trajectory["advantages"].mean()) /
                                trajectory["advantages"].std())
    rollouts_end = time.time()
    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = ["iter", "loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, j == 0 and config["full_trace_data_load"])
    load_end = time.time()
    checkpointing_time = checkpointing_end - iter_start
    rollouts_time = rollouts_end - checkpointing_end
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end
    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
      sgd_start = time.time()
      batch_index = 0
      num_batches = int(tuples_per_device) // int(model.per_device_batch_size)
      loss, kl, entropy = [], [], []
      permutation = np.random.permutation(num_batches)
      while batch_index < num_batches:
        full_trace = (
            i == 0 and j == 0 and
            batch_index == config["full_trace_nth_sgd_batch"])
        batch_loss, batch_kl, batch_entropy = model.run_sgd_minibatch(
            permutation[batch_index] * model.per_device_batch_size,
            self.kl_coeff, full_trace, file_writer)
        loss.append(batch_loss)
        kl.append(batch_kl)
        entropy.append(batch_entropy)
        batch_index += 1
      loss = np.mean(loss)
      kl = np.mean(kl)
      entropy = np.mean(entropy)
      sgd_end = time.time()
      print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))

      values = []
      if i == config["num_sgd_iter"] - 1:
        metric_prefix = "policy_gradient/sgd/final_iter/"
        values.append(tf.Summary.Value(
            tag=metric_prefix + "kl_coeff",
            simple_value=self.kl_coeff))
      else:
        metric_prefix = "policy_gradient/sgd/intermediate_iters/"
      values.extend([
          tf.Summary.Value(
              tag=metric_prefix + "mean_entropy",
              simple_value=entropy),
          tf.Summary.Value(
              tag=metric_prefix + "mean_loss",
              simple_value=loss),
          tf.Summary.Value(
              tag=metric_prefix + "mean_kl",
              simple_value=kl)])
      sgd_stats = tf.Summary(value=values)
      file_writer.add_summary(sgd_stats, self.global_step)
      self.global_step += 1
      sgd_time += sgd_end - sgd_start
    if kl > 2.0 * config["kl_target"]:
      self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
      self.kl_coeff *= 0.5

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("checkpointing time:", checkpointing_time)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)

    return TrainingResult(j, total_reward, traj_len_mean)
Example #10
    def _train(self):
        return TrainingResult(
            episode_reward_mean=10, episode_len_mean=10,
            timesteps_this_iter=10, info={})
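The minimal example above shows the keyword arguments the other trainers also populate. As a rough stand-in for illustration (an assumption; the actual TrainingResult in Ray defines additional fields), it can be thought of as a simple record:

import collections

# Hypothetical stand-in for illustration only.
TrainingResult = collections.namedtuple(
    "TrainingResult",
    ["episode_reward_mean", "episode_len_mean", "timesteps_this_iter", "info"])

result = TrainingResult(
    episode_reward_mean=10, episode_len_mean=10,
    timesteps_this_iter=10, info={})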
Example #11
    def train(self):
        config = self.config

        step_tstart = time.time()
        theta = self.policy.get_trainable_flat()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the policy
        # weights.
        rollout_ids = [
            worker.do_rollouts.remote(
                theta_id,
                self.ob_stat.mean if self.policy.needs_ob_stat else None,
                self.ob_stat.std if self.policy.needs_ob_stat else None)
            for worker in self.workers
        ]

        # Get the results of the rollouts.
        results = ray.get(rollout_ids)

        curr_task_results = []
        ob_count_this_batch = 0
        # Loop over the results
        for result in results:
            assert result.eval_length is None, "We aren't doing eval rollouts."
            assert result.noise_inds_n.ndim == 1
            assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
            assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
            assert result.returns_n2.dtype == np.float32

            result_num_eps = result.lengths_n2.size
            result_num_timesteps = result.lengths_n2.sum()
            self.episodes_so_far += result_num_eps
            self.timesteps_so_far += result_num_timesteps

            curr_task_results.append(result)
            # Update ob stats.
            if self.policy.needs_ob_stat and result.ob_count > 0:
                self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                       result.ob_count)
                ob_count_this_batch += result.ob_count

        # Assemble the results.
        noise_inds_n = np.concatenate(
            [r.noise_inds_n for r in curr_task_results])
        returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
        lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
        assert noise_inds_n.shape[0] == returns_n2.shape[
            0] == lengths_n2.shape[0]
        # Process the returns.
        if config.return_proc_mode == "centered_rank":
            proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
        else:
            raise NotImplementedError(config.return_proc_mode)

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (self.noise.get(idx, self.policy.num_params)
             for idx in noise_inds_n),
            batch_size=500)
        g /= returns_n2.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        update_ratio = self.optimizer.update(-g + config.l2coeff * theta)

        # Update ob stat (we're never running the policy in the master, but we
        # might be snapshotting the policy).
        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

        step_tend = time.time()
        tlogger.record_tabular("EpRewMean", returns_n2.mean())
        tlogger.record_tabular("EpRewStd", returns_n2.std())
        tlogger.record_tabular("EpLenMean", lengths_n2.mean())

        tlogger.record_tabular(
            "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
        tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
        tlogger.record_tabular("UpdateRatio", float(update_ratio))

        tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
        tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
        tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

        tlogger.record_tabular("ObCount", ob_count_this_batch)

        tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
        tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
        tlogger.dump_tabular()

        if (config.snapshot_freq != 0
                and self.iteration % config.snapshot_freq == 0):
            filename = os.path.join(
                "/tmp", "snapshot_iter{:05d}.h5".format(self.iteration))
            assert not os.path.exists(filename)
            self.policy.save(filename)
            tlogger.log("Saved snapshot {}".format(filename))

        res = TrainingResult(self.iteration, returns_n2.mean(),
                             lengths_n2.mean())
        self.iteration += 1
        return res