def do_rollouts(self, params, ob_mean, ob_std, timestep_limit):
        # Set the network weights.
        self.policy.set_trainable_flat(params)

        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(ob_mean, ob_std)

        if self.rs.rand() < self.config.eval_prob:
            print(
                "In this case, the reference implementation uses no noise in "
                "order to evaluate the policy. We're ignoring that.")

        noise_inds, returns, sign_returns, lengths = [], [], [], []
        task_ob_stat = utils.RunningStat(self.env.observation_space.shape,
                                         eps=0)

        # Perform some rollouts with noise until the minimum task runtime has
        # elapsed.
        task_tstart = time.time()
        while (len(noise_inds) == 0
               or time.time() - task_tstart < self.min_task_runtime):
            noise_idx = self.noise.sample_index(self.rs,
                                                self.policy.num_params)
            v = self.config.noise_stdev * self.noise.get(
                noise_idx, self.policy.num_params)

            # These two sampling steps could be done in parallel on different
            # actors, letting us update twice as frequently.
            self.policy.set_trainable_flat(params + v)
            rews_pos, len_pos = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            self.policy.set_trainable_flat(params - v)
            rews_neg, len_neg = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            noise_inds.append(noise_idx)
            returns.append([rews_pos.sum(), rews_neg.sum()])
            sign_returns.append(
                [np.sign(rews_pos).sum(), np.sign(rews_neg).sum()])
            lengths.append([len_pos, len_neg])

        # Package everything up for the driver to aggregate.
        return Result(
            noise_inds_n=np.array(noise_inds),
            returns_n2=np.array(returns, dtype=np.float32),
            sign_returns_n2=np.array(sign_returns, dtype=np.float32),
            lengths_n2=np.array(lengths, dtype=np.int32),
            eval_return=None,
            eval_length=None,
            ob_sum=(None if task_ob_stat.count == 0 else task_ob_stat.sum),
            ob_sumsq=(None if task_ob_stat.count == 0 else task_ob_stat.sumsq),
            ob_count=task_ob_stat.count,
        )
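On the driver side, the Result batches produced by do_rollouts are combined into an update for params. The sketch below shows one common way to do that for antithetic (+v / -v) sampling, using centered ranks of the returns. compute_centered_ranks, estimate_gradient, and the exact weighting are illustrative assumptions rather than this implementation's actual driver code; noise is assumed to expose the same get(index, num_params) interface used above.

import numpy as np

def compute_centered_ranks(x):
    # Map each return to a rank rescaled into [-0.5, 0.5]; this makes the
    # update invariant to the scale of the rewards.
    flat = x.ravel()
    ranks = np.empty(flat.size, dtype=np.float32)
    ranks[flat.argsort()] = np.arange(flat.size, dtype=np.float32)
    return ranks.reshape(x.shape) / (x.size - 1) - 0.5

def estimate_gradient(noise, results, num_params):
    # Combine the paired (+noise, -noise) returns from all workers into a
    # single ES gradient estimate (a sketch, not the library's code).
    noise_inds = np.concatenate([r.noise_inds_n for r in results])
    returns = np.concatenate([r.returns_n2 for r in results])
    ranks = compute_centered_ranks(returns)
    weights = ranks[:, 0] - ranks[:, 1]  # positive minus negative perturbation
    grad = np.zeros(num_params, dtype=np.float32)
    for weight, idx in zip(weights, noise_inds):
        grad += weight * noise.get(idx, num_params)
    return grad / len(weights)

A driver would typically hand a gradient estimate like this to the Adam optimizer created in Example #2 below to move theta toward higher returns.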
Example #2
    # Create the workers.
    print("Creating workers.")
    workers = [
        Worker(config, policy_params, env_name, noise_array)
        for _ in range(num_workers)
    ]

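    # Build the driver's own environment, policy, Adam optimizer, and
    # running observation statistics.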
    env = gym.make(env_name)
    sess = utils.make_session(single_threaded=False)
    policy = policies.MujocoPolicy(env.observation_space, env.action_space,
                                   **policy_params)
    tf_util.initialize()
    optimizer = optimizers.Adam(policy, stepsize)

    ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)

    episodes_so_far = 0
    timesteps_so_far = 0
    tstart = time.time()

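    # Main loop: broadcast the current parameters and collect rollout
    # results from every worker.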
    while True:
        step_tstart = time.time()
        theta = policy.get_trainable_flat()
        assert theta.dtype == np.float32

        # These rollouts could be done in parallel.
        results = [
            worker.do_rollouts(theta,
                               ob_stat.mean if policy.needs_ob_stat else None,
                               ob_stat.std if policy.needs_ob_stat else None,