def run_po_rollout_batch(batch_size, rs_seed, noise_std=None):
    """Run one mirror-sampled batch of perturbed-policy rollouts on a worker.

    For each of ``batch_size`` sampled noise indices, rolls out both the
    positively perturbed policy (theta + noise_std * eps) and the negatively
    perturbed one (theta - noise_std * eps).  Column 0 of each result array
    holds the "+" rollouts, column 1 the "-" rollouts.

    Args:
        batch_size: number of noise vectors, i.e. number of +/- rollout pairs.
        rs_seed: seed for the worker-local RandomState used both for sampling
            noise indices and for the rollouts themselves.
        noise_std: perturbation scale.  Required; kept keyword-optional only
            so existing call sites are unchanged.

    Returns:
        POResult with per-pair returns, episode lengths, behavior
        characteristics (swapped to lead with the batch axis), final x
        positions, observation statistics (taken from the "-" pass), and the
        wall-clock time of the whole batch.
    """
    global noise
    t_init = time.time()
    interaction = interaction_shared
    theta = fiber_get_theta()
    obs_mean, obs_std = fiber_get_obs_stats()
    # RandomState(rs_seed) seeds on construction; the original's extra
    # random_state.seed(rs_seed) call just reset it to the same state.
    random_state = np.random.RandomState(rs_seed)
    assert noise_std is not None
    noise_inds = np.asarray(
        [noise.sample_index(random_state, len(theta)) for _ in range(batch_size)],
        dtype='int')

    returns = np.zeros((batch_size, 2))
    final_xpos = np.zeros((batch_size, 2))
    lengths = np.zeros((batch_size, 2), dtype='int')
    bcs = [None] * 2

    # Mirror sampling, positive half: theta + noise_std * eps.
    thetas = (theta + noise_std * noise.get(noise_idx, len(theta))
              for noise_idx in noise_inds)
    returns[:, 0], lengths[:, 0], bcs[0], final_xpos[:, 0], _, _, _ = \
        interaction.rollout_batch(thetas=thetas,
                                  batch_size=batch_size,
                                  random_state=random_state,
                                  obs_mean=obs_mean,
                                  obs_std=obs_std)

    # Negative half: theta - noise_std * eps.  Observation statistics are
    # kept only from this pass (matching the original behavior).
    thetas = (theta - noise_std * noise.get(noise_idx, len(theta))
              for noise_idx in noise_inds)
    returns[:, 1], lengths[:, 1], bcs[1], final_xpos[:, 1], obs_sum, obs_sq, obs_count = \
        interaction.rollout_batch(thetas=thetas,
                                  batch_size=batch_size,
                                  random_state=random_state,
                                  obs_mean=obs_mean,
                                  obs_std=obs_std)

    end = time.time() - t_init
    return POResult(returns=returns,
                    noise_inds=noise_inds,
                    lengths=lengths,
                    bcs=np.swapaxes(np.array(bcs), 0, 1),
                    obs_sum=obs_sum,
                    obs_sq=obs_sq,
                    obs_count=obs_count,
                    time=end,
                    final_xpos=final_xpos)
def run_po_rollout_batch(batch_size, noise_theta, noise_std=None):
    """Run a batch of rollouts of a single perturbed policy on a worker.

    Unlike the mirror-sampling variant, every rollout in this batch uses the
    SAME perturbation: theta + noise_std * eps(noise_theta).

    NOTE(review): this redefines ``run_po_rollout_batch`` and shadows the
    mirror-sampling definition earlier in the file — confirm whether one of
    the two was meant to carry a different name (e.g. an eval variant).

    Args:
        batch_size: number of rollouts of the (single) perturbed policy.
        noise_theta: index into the shared noise table selecting the
            perturbation direction.
        noise_std: perturbation scale.  Required; kept keyword-optional only
            so existing call sites are unchanged.

    Returns:
        POResult carrying the rollout returns, lengths, behavior
        characteristics, final x positions, observation statistics, and the
        wall-clock time of the batch.
    """
    global noise
    t_init = time.time()
    interaction = interaction_shared
    theta = fiber_get_theta()
    obs_mean, obs_std = fiber_get_obs_stats()
    assert noise_std is not None
    # Unseeded RandomState: these rollouts are intentionally not tied to a
    # reproducible seed — TODO confirm this is deliberate.
    random_state = np.random.RandomState()
    # The perturbed policy is identical for every rollout in the batch, so
    # compute it once instead of once per batch element (the original
    # generator recomputed it batch_size times).
    perturbed_theta = theta + noise_std * noise.get(noise_theta, len(theta))
    thetas = (perturbed_theta for _ in range(batch_size))
    returns, lengths, bcs, final_xpos, obs_sum, obs_sq, obs_count = \
        interaction.rollout_batch(thetas=thetas,
                                  batch_size=batch_size,
                                  random_state=random_state,
                                  obs_mean=obs_mean,
                                  obs_std=obs_std)
    end = time.time() - t_init
    return POResult(returns=returns,
                    noise_inds=noise_theta,
                    lengths=lengths,
                    bcs=np.swapaxes(np.array(bcs), 0, 1),
                    obs_sum=obs_sum,
                    obs_sq=obs_sq,
                    obs_count=obs_count,
                    time=end,
                    final_xpos=final_xpos)
def start_step(self, theta):
    """Broadcast the current policy and launch one step's rollout tasks.

    Broadcasts ``theta`` and the current observation statistics to the
    workers, samples the noise indices for this step, and dispatches one
    rollout-batch chunk per sampled index.

    Args:
        theta: current policy parameter vector.

    Returns:
        Tuple ``(thetas, training_task)``: the perturbed parameter vectors
        that were dispatched, and the concatenated list of pending rollout
        tasks returned by ``start_chunk``.
    """
    global noise
    self.broadcast_theta(theta)
    rs_seed = np.random.randint(np.int32(2**31 - 1))
    # RandomState(rs_seed) seeds on construction; the original's extra
    # random_state.seed(rs_seed) call just reset it to the same state.
    random_state = np.random.RandomState(rs_seed)
    # Two rollouts (a mirror pair) per theta over all batches, spread across
    # nb_evals evaluations; the +1 rounds the integer division up.
    n_thetas = self.batch_size * self.batches_per_step * 2 // self.nb_evals + 1
    noise_inds = np.asarray(
        [noise.sample_index(random_state, len(theta)) for _ in range(n_thetas)],
        dtype='int')
    self.broadcast_obs_stats(self.obs_mean, self.obs_std)
    thetas = [theta + self.noise_std * noise.get(noise_id, len(theta))
              for noise_id in noise_inds]
    training_task = []
    # n_thetas == len(noise_inds), so iterate the indices directly instead
    # of indexing by position; start_chunk returns a list of tasks.
    for noise_ind in noise_inds:
        training_task += self.start_chunk(run_po_rollout_batch,
                                          self.batch_size,
                                          noise_ind,
                                          self.noise_std)
    return thetas, training_task
def compute_grads(self, noise_inds, fitness, theta):
    """Estimate the ES gradient from mirror-sampled fitness pairs.

    Weights each noise direction by the difference between its positive and
    negative rollout fitness, sums the weighted directions in batches, and
    normalizes by the number of pairs (and optionally by the noise std).

    Args:
        noise_inds: indices into the shared noise table, one per pair.
        fitness: array whose column 0 holds "+" fitnesses and column 1 "-".
        theta: current parameter vector (only its length is used here).

    Returns:
        The gradient estimate as an array of the same length as ``theta``.
    """
    pair_weights = fitness[:, 0] - fitness[:, 1]
    directions = (noise.get(idx, len(theta)) for idx in noise_inds)
    grads, _count = batched_weighted_sum(pair_weights, directions, batch_size=500)
    grads /= len(fitness)
    # Optionally fold the 1/sigma factor of the ES estimator into the
    # gradient here rather than in the optimizer.
    if self.args['optimizer_args']['divide_gradient_by_noise_std']:
        grads /= self.noise_std
    return grads