示例#1
0
文件: ars.py 项目: nskh/ray
    def _train(self):
        """Run one training iteration and report evaluation statistics.

        Calls ``self.train_step()`` once (printing how long it took),
        advances the running episode/timestep counters using
        ``info_dict['steps']``, evaluates the current parameters with
        ``self.config['eval_rollouts']`` rollouts, logs summary statistics
        through ``tlogger`` and packs the headline numbers into a result.

        Returns:
            ray.tune.result.TrainingResult: holds the mean evaluation
            reward plus the mean and the sum of ``info_dict['steps']``.
        """
        # Perform the training step and time it.
        step_started = time.time()
        g_hat, info_dict = self.train_step()
        step_finished = time.time()
        print('total time of one step', step_finished - step_started)

        # Each entry of `steps` is counted as one episode; the entries are
        # summed to get the number of timesteps.
        steps = info_dict['steps']
        self.episodes_so_far += len(steps)
        self.timesteps_so_far += np.sum(steps)

        # Evaluate the reward with the unperturbed params.
        eval_rewards = self.aggregate_rollouts(
            num_rollouts=self.config['eval_rollouts'], evaluate=True)
        weights = ray.get(self.workers[0].get_weights.remote())

        reward_mean = np.mean(eval_rewards)

        record = tlogger.record_tabular
        record("AverageReward", reward_mean)
        record("StdRewards", np.std(eval_rewards))
        # NOTE: both "norm" entries are sums of squares; no square root is
        # taken.
        record("WeightNorm", float(np.square(weights).sum()))
        record("WeightStd", float(np.std(weights)))
        record("GradNorm", float(np.square(g_hat).sum()))
        record("MaxRewardRollout", np.max(eval_rewards))
        record("MinRewardRollout", np.min(eval_rewards))
        tlogger.dump_tabular()

        return ray.tune.result.TrainingResult(
            episode_reward_mean=reward_mean,
            episode_len_mean=np.mean(steps),
            timesteps_this_iter=np.sum(steps))
示例#2
0
文件: es.py 项目: ml-squad/ray
    def _train(self):
        """Run one training iteration: collect rollouts, update, and log.

        The current policy weights are placed in the Ray object store and
        their ID is passed to ``self._collect_results``. The collected
        noisy returns are processed with ``utils.compute_centered_ranks``,
        combined with the entries fetched from ``self.noise`` into ``g``,
        and ``-g`` plus an L2 term is handed to ``self.optimizer.update``.
        The weights returned by the optimizer are written back to
        ``self.policy``.

        Returns:
            TrainingResult: the mean evaluation return, the mean evaluation
            length, the summed training episode lengths of this iteration
            and an ``info`` dict with further diagnostics.

        Raises:
            NotImplementedError: if ``config["return_proc_mode"]`` is not
                ``"centered_rank"``.
        """
        cfg = self.config

        iter_started = time.time()
        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store; the rollout
        # collection is given the ID rather than the weights themselves.
        theta_id = ray.put(theta)
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, cfg["episodes_per_batch"], cfg["timesteps_per_batch"])

        # Flatten the individual results into one list per quantity.
        eval_return_list = []
        eval_length_list = []
        noise_index_list = []
        train_return_list = []
        train_length_list = []
        for batch_result in results:
            eval_return_list.extend(batch_result.eval_returns)
            eval_length_list.extend(batch_result.eval_lengths)

            noise_index_list.extend(batch_result.noise_indices)
            train_return_list.extend(batch_result.noisy_returns)
            train_length_list.extend(batch_result.noisy_lengths)

        assert len(eval_return_list) == len(eval_length_list)
        assert len(noise_index_list) == len(train_return_list)
        assert len(train_return_list) == len(train_length_list)

        self.episodes_so_far += num_episodes
        self.timesteps_so_far += num_timesteps

        # Assemble the results as arrays.
        eval_returns = np.array(eval_return_list)
        eval_lengths = np.array(eval_length_list)
        noise_indices = np.array(noise_index_list)
        noisy_returns = np.array(train_return_list)
        noisy_lengths = np.array(train_length_list)

        # Process the returns; only one mode is implemented.
        if cfg["return_proc_mode"] != "centered_rank":
            raise NotImplementedError(cfg["return_proc_mode"])
        proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)

        # Compute and take a step. Each noise entry is weighted by the
        # difference between column 0 and column 1 of its processed returns.
        return_diffs = proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1]
        noise_vectors = (self.noise.get(index, self.policy.num_params)
                         for index in noise_indices)
        g, count = utils.batched_weighted_sum(
            return_diffs, noise_vectors, batch_size=500)
        g /= noisy_returns.size
        assert g.shape == (self.policy.num_params, )
        assert g.dtype == np.float32
        assert count == len(noise_indices)

        # Compute the new weights theta and store them in the local policy.
        theta, update_ratio = self.optimizer.update(
            -g + cfg["l2_coeff"] * theta)
        self.policy.set_weights(theta)

        iter_finished = time.time()
        iter_elapsed = iter_finished - iter_started
        total_elapsed = iter_finished - self.tstart

        # Values that are both logged and returned are computed once.
        eval_return_mean = eval_returns.mean()
        eval_length_mean = eval_lengths.mean()
        weights_sq_norm = np.square(theta).sum()
        grad_sq_norm = np.square(g).sum()
        episodes_this_iter = noisy_lengths.size
        timesteps_this_iter = noisy_lengths.sum()

        record = tlogger.record_tabular
        record("EvalEpRewMean", eval_return_mean)
        record("EvalEpRewStd", eval_returns.std())
        record("EvalEpLenMean", eval_length_mean)

        record("EpRewMean", noisy_returns.mean())
        record("EpRewStd", noisy_returns.std())
        record("EpLenMean", noisy_lengths.mean())

        # NOTE: "Norm" and "GradNorm" are sums of squares (no square root).
        record("Norm", float(weights_sq_norm))
        record("GradNorm", float(grad_sq_norm))
        record("UpdateRatio", float(update_ratio))

        record("EpisodesThisIter", episodes_this_iter)
        record("EpisodesSoFar", self.episodes_so_far)
        record("TimestepsThisIter", timesteps_this_iter)
        record("TimestepsSoFar", self.timesteps_so_far)

        record("TimeElapsedThisIter", iter_elapsed)
        record("TimeElapsed", total_elapsed)
        tlogger.dump_tabular()

        info = {
            "weights_norm": weights_sq_norm,
            "grad_norm": grad_sq_norm,
            "update_ratio": update_ratio,
            "episodes_this_iter": episodes_this_iter,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": timesteps_this_iter,
            "timesteps_so_far": self.timesteps_so_far,
            "time_elapsed_this_iter": iter_elapsed,
            "time_elapsed": total_elapsed,
        }

        return TrainingResult(
            episode_reward_mean=eval_return_mean,
            episode_len_mean=eval_length_mean,
            timesteps_this_iter=timesteps_this_iter,
            info=info)
示例#3
0
文件: es.py 项目: techscientist/ray-1
    def _train(self):
        """Run one training iteration: collect rollouts, update, and log.

        The flat trainable weights of ``self.policy`` are placed in the Ray
        object store and their ID is passed to ``self._collect_results``.
        Every returned result is validated, counted towards the running
        episode/timestep totals and, when the policy needs ob stats, folded
        into ``self.ob_stat``. The returns are processed with
        ``utils.compute_centered_ranks``, combined with the entries fetched
        from ``self.noise`` into ``g``, and ``-g`` plus an L2 term is handed
        to ``self.optimizer.update``.

        Returns:
            TrainingResult: the mean return, the mean episode length and
            the summed episode lengths of this iteration, plus an ``info``
            dict with further diagnostics.

        Raises:
            NotImplementedError: if ``config["return_proc_mode"]`` is not
                ``"centered_rank"``.
        """
        cfg = self.config

        iter_started = time.time()
        theta = self.policy.get_trainable_flat()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store; the rollout
        # collection is given the ID rather than the weights themselves.
        theta_id = ray.put(theta)
        results = self._collect_results(
            theta_id, cfg["episodes_per_batch"], cfg["timesteps_per_batch"])

        batch_results = []
        ob_count_this_batch = 0
        # Validate and account for every result.
        for task_result in results:
            assert task_result.eval_length is None, (
                "We aren't doing eval rollouts.")
            assert task_result.noise_inds_n.ndim == 1
            # Returns and lengths must each have two columns per noise index.
            expected_shape = (len(task_result.noise_inds_n), 2)
            assert task_result.returns_n2.shape == expected_shape
            assert task_result.lengths_n2.shape == expected_shape
            assert task_result.returns_n2.dtype == np.float32

            episodes_in_result = task_result.lengths_n2.size
            timesteps_in_result = task_result.lengths_n2.sum()
            self.episodes_so_far += episodes_in_result
            self.timesteps_so_far += timesteps_in_result

            batch_results.append(task_result)
            # Update ob stats.
            if self.policy.needs_ob_stat and task_result.ob_count > 0:
                self.ob_stat.increment(
                    task_result.ob_sum, task_result.ob_sumsq,
                    task_result.ob_count)
                ob_count_this_batch += task_result.ob_count

        # Assemble the results.
        noise_inds_n = np.concatenate(
            [res.noise_inds_n for res in batch_results])
        returns_n2 = np.concatenate(
            [res.returns_n2 for res in batch_results])
        lengths_n2 = np.concatenate(
            [res.lengths_n2 for res in batch_results])
        assert noise_inds_n.shape[0] == returns_n2.shape[0]
        assert returns_n2.shape[0] == lengths_n2.shape[0]

        # Process the returns; only one mode is implemented.
        if cfg["return_proc_mode"] != "centered_rank":
            raise NotImplementedError(cfg["return_proc_mode"])
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)

        # Compute and take a step. Each noise entry is weighted by the
        # difference between column 0 and column 1 of its processed returns.
        return_diffs = proc_returns_n2[:, 0] - proc_returns_n2[:, 1]
        noise_vectors = (self.noise.get(idx, self.policy.num_params)
                         for idx in noise_inds_n)
        g, count = utils.batched_weighted_sum(
            return_diffs, noise_vectors, batch_size=500)
        g /= returns_n2.size
        assert g.shape == (self.policy.num_params, )
        assert g.dtype == np.float32
        assert count == len(noise_inds_n)
        update_ratio = self.optimizer.update(-g + cfg["l2coeff"] * theta)

        # Update ob stat (we're never running the policy in the master, but
        # we might be snapshotting the policy).
        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

        iter_finished = time.time()
        iter_elapsed = iter_finished - iter_started
        total_elapsed = iter_finished - self.tstart

        # Values that are both logged and returned are computed once.
        return_mean = returns_n2.mean()
        length_mean = lengths_n2.mean()
        grad_sq_norm = np.square(g).sum()
        episodes_this_iter = lengths_n2.size
        timesteps_this_iter = lengths_n2.sum()

        record = tlogger.record_tabular
        record("EpRewMean", return_mean)
        record("EpRewStd", returns_n2.std())
        record("EpLenMean", length_mean)

        # The weights are re-read from the policy because `theta` still holds
        # the values fetched before the optimizer update (presumably the
        # optimizer changes the policy in place -- confirm).
        # NOTE: "Norm" and "GradNorm" are sums of squares (no square root).
        record(
            "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
        record("GradNorm", float(grad_sq_norm))
        record("UpdateRatio", float(update_ratio))

        record("EpisodesThisIter", episodes_this_iter)
        record("EpisodesSoFar", self.episodes_so_far)
        record("TimestepsThisIter", timesteps_this_iter)
        record("TimestepsSoFar", self.timesteps_so_far)

        record("ObCount", ob_count_this_batch)

        record("TimeElapsedThisIter", iter_elapsed)
        record("TimeElapsed", total_elapsed)
        tlogger.dump_tabular()

        info = {
            "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
            "grad_norm": grad_sq_norm,
            "update_ratio": update_ratio,
            "episodes_this_iter": episodes_this_iter,
            "episodes_so_far": self.episodes_so_far,
            "timesteps_this_iter": timesteps_this_iter,
            "timesteps_so_far": self.timesteps_so_far,
            "ob_count": ob_count_this_batch,
            "time_elapsed_this_iter": iter_elapsed,
            "time_elapsed": total_elapsed,
        }

        return TrainingResult(
            episode_reward_mean=return_mean,
            episode_len_mean=length_mean,
            timesteps_this_iter=timesteps_this_iter,
            info=info)