def do_rollouts(self, params, ob_mean, ob_std, timestep_limit):
    # Set the network weights.
    self.policy.set_trainable_flat(params)

    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(ob_mean, ob_std)

    if self.rs.rand() < self.config.eval_prob:
        print("In this case, the reference implementation uses no noise in "
              "order to evaluate the policy. We're ignoring that.")

    noise_inds, returns, sign_returns, lengths = [], [], [], []
    task_ob_stat = utils.RunningStat(self.env.observation_space.shape, eps=0)

    # Perform some rollouts with noise.
    task_tstart = time.time()
    while (len(noise_inds) == 0 or
           time.time() - task_tstart < self.min_task_runtime):
        noise_idx = self.noise.sample_index(self.rs, self.policy.num_params)
        v = self.config.noise_stdev * self.noise.get(noise_idx,
                                                     self.policy.num_params)

        # These two sampling steps could be done in parallel on different
        # actors, letting us update twice as frequently.
        self.policy.set_trainable_flat(params + v)
        rews_pos, len_pos = self.rollout_and_update_ob_stat(
            timestep_limit, task_ob_stat)

        self.policy.set_trainable_flat(params - v)
        rews_neg, len_neg = self.rollout_and_update_ob_stat(
            timestep_limit, task_ob_stat)

        noise_inds.append(noise_idx)
        returns.append([rews_pos.sum(), rews_neg.sum()])
        sign_returns.append([np.sign(rews_pos).sum(), np.sign(rews_neg).sum()])
        lengths.append([len_pos, len_neg])

    return Result(
        noise_inds_n=np.array(noise_inds),
        returns_n2=np.array(returns, dtype=np.float32),
        sign_returns_n2=np.array(sign_returns, dtype=np.float32),
        lengths_n2=np.array(lengths, dtype=np.int32),
        eval_return=None,
        eval_length=None,
        ob_sum=(None if task_ob_stat.count == 0 else task_ob_stat.sum),
        ob_sumsq=(None if task_ob_stat.count == 0 else task_ob_stat.sumsq),
        ob_count=task_ob_stat.count,
    )
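Note that the worker never ships the perturbation vector itself back to the driver, only noise_idx (plus the paired positive/negative returns for that same noise slice). That works because both sides hold an identical pre-generated block of Gaussian noise, so an index and the parameter count identify the exact slice. Below is a minimal sketch of that shared-noise-table idea, assuming this structure; the class and variable names here are illustrative and not necessarily the exact helpers used by this codebase.

import numpy as np


class SharedNoiseTable:
    def __init__(self, size=1_000_000, seed=123):
        # One large block of Gaussian noise, generated with a fixed seed so
        # the driver and every worker hold identical copies.
        self.noise = np.random.RandomState(seed).randn(size).astype(np.float32)

    def sample_index(self, rs, dim):
        # Pick a start offset that leaves room for a dim-length slice.
        return rs.randint(0, len(self.noise) - dim + 1)

    def get(self, idx, dim):
        return self.noise[idx:idx + dim]


# The driver only receives noise_idx from a worker, yet it can rebuild the
# exact perturbation v that the worker added to and subtracted from params.
noise_table = SharedNoiseTable()
rs = np.random.RandomState(0)
num_params, noise_stdev = 10, 0.02  # assumed values for illustration
noise_idx = noise_table.sample_index(rs, num_params)
v = noise_stdev * noise_table.get(noise_idx, num_params)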
# Create the workers.
print("Creating workers.")
workers = [
    Worker(config, policy_params, env_name, noise_array)
    for _ in range(num_workers)
]

env = gym.make(env_name)
sess = utils.make_session(single_threaded=False)
policy = policies.MujocoPolicy(env.observation_space, env.action_space,
                               **policy_params)
tf_util.initialize()
optimizer = optimizers.Adam(policy, stepsize)
ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)

episodes_so_far = 0
timesteps_so_far = 0
tstart = time.time()

while True:
    step_tstart = time.time()
    theta = policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # These rollouts could be done in parallel.
    results = [
        worker.do_rollouts(
            theta,
            ob_stat.mean if policy.needs_ob_stat else None,
            ob_stat.std if policy.needs_ob_stat else None,