def evaluate_policy(eval_env,
                    get_predictions,
                    max_timestep=20000,
                    n_evals=1,
                    len_history_for_policy=32,
                    rng=None):
  """Evaluate the policy."""

  avg_rewards = collections.defaultdict(float)
  avg_rewards_unclipped = collections.defaultdict(float)
  for _ in range(n_evals):
    for policy in [
        env_problem_utils.CATEGORICAL_SAMPLING,
        env_problem_utils.GUMBEL_SAMPLING,
    ]:
      trajs, _, _ = env_problem_utils.play_env_problem_with_policy(
          eval_env,
          get_predictions,
          num_trajectories=eval_env.batch_size,
          max_timestep=max_timestep,
          reset=True,
          policy_sampling=policy,
          rng=rng,
          len_history_for_policy=len_history_for_policy)
      avg_rewards[policy] += float(sum(
          np.sum(traj[2]) for traj in trajs)) / len(trajs)
      avg_rewards_unclipped[policy] += float(
          sum(np.sum(traj[3]) for traj in trajs)) / len(trajs)

  # Now average these out.
  for k in avg_rewards:
    avg_rewards[k] /= n_evals
    avg_rewards_unclipped[k] /= n_evals

  return avg_rewards, avg_rewards_unclipped

def evaluate_policy(eval_env,
                    get_predictions,
                    boundary,
                    max_timestep=20000,
                    n_evals=1,
                    rng=None):
  """Evaluate the policy."""

  avg_rewards = collections.defaultdict(float)
  avg_rewards_unclipped = collections.defaultdict(float)
  for _ in range(n_evals):
    for policy in [
        env_problem_utils.CATEGORICAL_SAMPLING,
        env_problem_utils.GUMBEL_SAMPLING,
        env_problem_utils.EPSILON_GREEDY
    ]:
      trajs, _ = env_problem_utils.play_env_problem_with_policy(
          eval_env,
          get_predictions,
          boundary=boundary,
          max_timestep=max_timestep,
          reset=True,
          policy_sampling=policy,
          rng=rng)
      avg_rewards[policy] += float(sum(
          np.sum(traj[2]) for traj in trajs)) / len(trajs)
      avg_rewards_unclipped[policy] += float(
          sum(np.sum(traj[3]) for traj in trajs)) / len(trajs)

  # Now average these out.
  for k in avg_rewards:
    avg_rewards[k] /= n_evals
    avg_rewards_unclipped[k] /= n_evals

  return avg_rewards, avg_rewards_unclipped

def evaluate_policy(eval_env,
                    get_predictions,
                    temperatures,
                    max_timestep=20000,
                    n_evals=1,
                    len_history_for_policy=32,
                    rng=None):
  """Evaluate the policy."""

  processed_reward_sums = collections.defaultdict(list)
  raw_reward_sums = collections.defaultdict(list)
  for eval_rng in jax_random.split(rng, num=n_evals):
    for temperature in temperatures:
      trajs, _, _ = env_problem_utils.play_env_problem_with_policy(
          eval_env,
          get_predictions,
          num_trajectories=eval_env.batch_size,
          max_timestep=max_timestep,
          reset=True,
          temperature=temperature,
          rng=eval_rng,
          len_history_for_policy=len_history_for_policy)
      processed_reward_sums[temperature].extend(
          sum(traj[2]) for traj in trajs)
      raw_reward_sums[temperature].extend(sum(traj[3]) for traj in trajs)

  # Return the mean and standard deviation for each temperature.
  def compute_stats(reward_dict):
    return {
        temperature: {"mean": onp.mean(rewards), "std": onp.std(rewards)}
        for (temperature, rewards) in reward_dict.items()
    }

  return {
      "processed": compute_stats(processed_reward_sums),
      "raw": compute_stats(raw_reward_sums),
  }

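# Hypothetical usage sketch for the temperature-based evaluate_policy above:
# only the shape of the returned dict (per-temperature {"mean", "std"} under
# "processed" and "raw") is taken from the function itself. `eval_env`,
# `get_predictions`, the temperature list, and the logging destination are
# assumptions supplied by the caller, not part of this code.
def log_eval_stats(eval_env, get_predictions, rng):
  reward_stats = evaluate_policy(
      eval_env, get_predictions, temperatures=[0.5, 1.0], rng=rng)
  for reward_key in ("processed", "raw"):
    for temperature, stats in reward_stats[reward_key].items():
      # One line per (reward kind, temperature) pair.
      logging.info("eval/%s/temperature_%s: mean=%0.2f std=%0.2f", reward_key,
                   temperature, stats["mean"], stats["std"])
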
def test_play_env_problem_with_policy(self):
  env = env_problem.EnvProblem(
      base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

  def policy_fun(observations, rng=None):
    b, t = observations.shape[:2]
    a = env.action_space.n
    p = np.random.uniform(size=(b, t, a))
    p = np.exp(p)
    p = p / np.sum(p, axis=-1, keepdims=True)
    return np.log(p), (), rng

  max_timestep = 15
  num_trajectories = 2
  trajectories, _ = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      boundary=20)

  self.assertEqual(num_trajectories, len(trajectories))

  # Check shapes within trajectories.
  traj = trajectories[0]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

  traj = trajectories[1]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

def collect_trajectories(env,
                         policy_fn,
                         n_trajectories=1,
                         max_timestep=None,
                         reset=True,
                         len_history_for_policy=32,
                         boundary=32,
                         state=None,
                         temperature=1.0,
                         rng=None):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface, for now this is not-batched.
    policy_fn: observations(B, T+1) -> log-probs(B, T+1, A) callable.
    n_trajectories: int, number of trajectories.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when the env
      returns done.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or < 0.
    len_history_for_policy: int or None, the maximum history to keep for
      applying the policy on. If None, use the full history.
    boundary: int, pad the sequences to multiples of this number.
    state: state for `policy_fn`.
    temperature: (float) temperature to sample actions from policy_fn.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectories, number of done trajectories, timing info, state),
    where trajectories is a list of (observation, action, reward) tuples and
    each element `i` is a tuple of numpy arrays with shapes as follows:
      observation[i] = (B, T_i + 1)
      action[i] = (B, T_i)
      reward[i] = (B, T_i)
  """
  assert isinstance(env, env_problem.EnvProblem)
  # This is an env_problem, run its collect function.
  trajs, n_done, timing_info, state = (
      env_problem_utils.play_env_problem_with_policy(
          env,
          policy_fn,
          num_trajectories=n_trajectories,
          max_timestep=max_timestep,
          reset=reset,
          len_history_for_policy=len_history_for_policy,
          boundary=boundary,
          state=state,
          temperature=temperature,
          rng=rng))
  # Skip returning raw_rewards here, since they aren't used.
  # t is the return value of Trajectory.as_numpy, so:
  # (observation, action, processed_reward, raw_reward, infos)
  return trajs, n_done, timing_info, state

def test_play_env_problem_with_policy(self):
  env = gym_env_problem.GymEnvProblem(
      base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

  # Let's make sure that at most 4 observations come to the policy function.
  len_history_for_policy = 4

  def policy_fun(observations, state=None, rng=None):
    b, t = observations.shape[:2]
    # Assert that observations from time-step len_history_for_policy onwards
    # are zeros.
    self.assertTrue(
        np.all(observations[:, len_history_for_policy:, ...] == 0))
    self.assertFalse(
        np.all(observations[:, :len_history_for_policy, ...] == 0))
    a = env.action_space.n
    p = np.random.uniform(size=(b, t, a))
    p = np.exp(p)
    p = p / np.sum(p, axis=-1, keepdims=True)
    return np.log(p), np.mean(p, axis=-1), state, rng

  def action_index_fn(index):
    return index[:, None]

  max_timestep = 15
  num_trajectories = 2
  trajectories, _, _, _ = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      action_index_fn=action_index_fn,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      len_history_for_policy=len_history_for_policy)

  self.assertEqual(num_trajectories, len(trajectories))

  # Check shapes within trajectories.
  traj = trajectories[0]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
  self.assertEqual((T,), traj[2].shape)
  self.assertEqual(T, len(traj[4]["log_prob_actions"]))
  self.assertEqual(T, len(traj[4]["value_predictions"]))
  self.assertLessEqual(T, max_timestep)

  traj = trajectories[1]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)
  self.assertEqual((T,), traj[2].shape)
  self.assertEqual(T, len(traj[4]["log_prob_actions"]))
  self.assertEqual(T, len(traj[4]["value_predictions"]))
  self.assertLessEqual(T, max_timestep)

def collect_trajectories(env,
                         policy_fn,
                         n_trajectories=1,
                         policy=env_problem_utils.GUMBEL_SAMPLING,
                         max_timestep=None,
                         epsilon=0.1,
                         reset=True,
                         len_history_for_policy=32,
                         rng=None):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface, for now this is not-batched.
    policy_fn: observations(B, T+1) -> log-probs(B, T+1, A) callable.
    n_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling",
      i.e. how to use the policy_fn to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when the env
      returns done.
    epsilon: float, the epsilon for the `epsilon-greedy` policy.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or < 0.
    len_history_for_policy: int, the maximum history to keep for applying the
      policy on.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectories, number of done trajectories, timing info), where
    trajectories is a list of (observation, action, reward, infos) tuples and
    each element `i` is a tuple of numpy arrays with shapes as follows:
      observation[i] = (B, T_i + 1)
      action[i] = (B, T_i)
      reward[i] = (B, T_i)
  """
  assert isinstance(env, env_problem.EnvProblem)
  # This is an env_problem, run its collect function.
  trajs, n_done, timing_info = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fn,
      num_trajectories=n_trajectories,
      max_timestep=max_timestep,
      policy_sampling=policy,
      eps=epsilon,
      reset=reset,
      len_history_for_policy=len_history_for_policy,
      rng=rng)
  # Skip returning raw_rewards here, since they aren't used.
  # t is the return value of Trajectory.as_numpy, so:
  # (observation, action, processed_reward, raw_reward, infos)
  return [(t[0], t[1], t[2], t[4]) for t in trajs], n_done, timing_info

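# Hypothetical usage sketch for the collect_trajectories variant directly
# above: per-trajectory tuples are unpacked as documented in its docstring,
# (observation, action, processed_reward, infos). The env, policy function,
# and logging of results are assumptions, not part of the library.
def log_episode_returns(env, policy_fn, rng):
  trajs, n_done, _ = collect_trajectories(
      env, policy_fn, n_trajectories=2, max_timestep=100, rng=rng)
  logging.info("Completed trajectories: %d", n_done)
  for i, (observations, actions, rewards, unused_infos) in enumerate(trajs):
    # Report the total processed reward of each collected trajectory.
    logging.info("traj[%d]: return %0.2f over %d reward entries", i,
                 float(np.sum(rewards)), rewards.size)
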
def collect_trajectories(env,
                         policy_fun,
                         num_trajectories=1,
                         policy=env_problem_utils.CATEGORICAL_SAMPLING,
                         max_timestep=None,
                         boundary=20,
                         epsilon=0.1,
                         reset=True,
                         rng=None):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface, for now this is not-batched.
    policy_fun: observations(B, T+1) -> log-probs(B, T+1, A) callable.
    num_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling",
      i.e. how to use the policy_fun to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when the env
      returns done.
    boundary: int, boundary for padding, used in EnvProblem envs.
    epsilon: float, the epsilon for the `epsilon-greedy` policy.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or < 0.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectories, number of done trajectories), where trajectories is
    a list of (observation, action, reward) tuples and each element `i` is a
    tuple of numpy arrays with shapes as follows:
      observation[i] = (B, T_i + 1)
      action[i] = (B, T_i)
      reward[i] = (B, T_i)
  """
  assert isinstance(env, env_problem.EnvProblem)
  # This is an env_problem, run its collect function.
  return env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      boundary=boundary,
      policy_sampling=policy,
      eps=epsilon,
      reset=reset,
      rng=rng)

def evaluate_policy(eval_env,
                    get_predictions,
                    boundary,
                    max_timestep=20000,
                    rng=None):
  """Evaluate the policy."""

  avg_rewards = {}
  for policy in [
      env_problem_utils.CATEGORICAL_SAMPLING,
      env_problem_utils.GUMBEL_SAMPLING,
      env_problem_utils.EPSILON_GREEDY
  ]:
    trajs, _ = env_problem_utils.play_env_problem_with_policy(
        eval_env,
        get_predictions,
        boundary=boundary,
        max_timestep=max_timestep,
        reset=True,
        policy_sampling=policy,
        rng=rng)
    avg_rewards[policy] = float(sum(
        np.sum(traj[2]) for traj in trajs)) / len(trajs)
  return avg_rewards

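# Hypothetical usage sketch for the evaluate_policy variant directly above:
# it returns a dict keyed by sampling-policy name, so the caller can log each
# entry. `eval_env` and `get_predictions` are assumed to come from the
# surrounding training loop; they are not defined here.
def log_average_rewards(eval_env, get_predictions, rng):
  avg_rewards = evaluate_policy(
      eval_env, get_predictions, boundary=20, rng=rng)
  for policy_name, avg_reward in sorted(avg_rewards.items()):
    logging.info("eval/%s average reward: %0.2f", policy_name, avg_reward)
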
def collect_trajectories(env,
                         policy_fun,
                         num_trajectories=1,
                         policy="greedy",
                         max_timestep=None,
                         boundary=20,
                         epsilon=0.1):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface, for now this is not-batched.
    policy_fun: observations(B, T+1) -> log-probs(B, T+1, A) callable.
    num_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling",
      i.e. how to use the policy_fun to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when the env
      returns done.
    boundary: int, boundary for padding, used in EnvProblem envs.
    epsilon: float, the epsilon for the `epsilon-greedy` policy.

  Returns:
    trajectory: list of (observation, action, reward) tuples, where each
      element `i` is a tuple of numpy arrays with shapes as follows:
      observation[i] = (B, T_i + 1)
      action[i] = (B, T_i)
      reward[i] = (B, T_i)
  """
  if isinstance(env, env_problem.EnvProblem):
    # This is an env_problem, run its collect function.
    return env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fun,
        num_trajectories=num_trajectories,
        max_timestep=max_timestep,
        boundary=boundary)

  trajectories = []

  for t in range(num_trajectories):
    t_start = time.time()
    rewards = []
    actions = []
    done = False

    observation = env.reset()

    # This is currently shaped (1, 1) + OBS, but new observations will keep
    # getting added to it, making it eventually (1, T+1) + OBS.
    observation_history = observation[np.newaxis, np.newaxis, :]

    # Run either till we're done OR, if max_timestep is defined, only till
    # that timestep.
    ts = 0
    while ((not done) and
           (not max_timestep or observation_history.shape[1] < max_timestep)):
      ts_start = time.time()
      # Run the policy to pick an action; the shape is (1, t, A) because
      # observation_history is shaped (1, t) + OBS.
      predictions = policy_fun(observation_history)

      # We need the predictions for the last time-step, so squeeze the batch
      # dimension and take the last time-step.
      predictions = np.squeeze(predictions, axis=0)[-1]

      # Policy can be run in one of the following ways:
      #  - Greedy
      #  - Epsilon-Greedy
      #  - Categorical-Sampling
      action = None
      if policy == "greedy":
        action = np.argmax(predictions)
      elif policy == "epsilon-greedy":
        # A schedule for epsilon is 1/k where k is the episode number sampled.
        if onp.random.random() < epsilon:
          # Choose an action at random.
          action = onp.random.randint(0, high=len(predictions))
        else:
          # Return the best action.
          action = np.argmax(predictions)
      elif policy == "categorical-sampling":
        # NOTE: The predictions aren't probabilities but log-probabilities
        # instead, since they were computed with LogSoftmax.
        # So just np.exp them to make them probabilities.
        predictions = np.exp(predictions)
        action = onp.argwhere(onp.random.multinomial(1, predictions) == 1)
      else:
        raise ValueError("Unknown policy: %s" % policy)

      # NOTE: Assumption, single batch.
      try:
        action = int(action)
      except TypeError as err:
        # Let's dump some information before we die off.
        logging.error("Cannot convert action into an integer: [%s]", err)
        logging.error("action.shape: [%s]", action.shape)
        logging.error("action: [%s]", action)
        logging.error("predictions.shape: [%s]", predictions.shape)
        logging.error("predictions: [%s]", predictions)
        logging.error("observation_history: [%s]", observation_history)
        raise err

      observation, reward, done, _ = env.step(action)

      # observation is of shape OBS, so add extra dims and concatenate on the
      # time dimension.
      observation_history = np.concatenate(
          [observation_history, observation[np.newaxis, np.newaxis, :]],
          axis=1)

      rewards.append(reward)
      actions.append(action)

      ts += 1
      logging.vlog(
          2, " Collected time-step[ %5d] of trajectory[ %5d] in [%0.2f] msec.",
          ts, t, get_time(ts_start))

    logging.vlog(2, " Collected trajectory[ %5d] in [%0.2f] msec.", t,
                 get_time(t_start))

    # This means we are either done or we've been terminated early.
    assert done or (max_timestep and
                    max_timestep >= observation_history.shape[1])

    # observation_history is (1, T+1) + OBS, let's squeeze out the batch dim.
    observation_history = np.squeeze(observation_history, axis=0)

    trajectories.append(
        (observation_history, np.stack(actions), np.stack(rewards)))

  return trajectories

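# Hypothetical usage sketch for the non-EnvProblem path of the function
# above (i.e. a plain gym env): each collected trajectory is an
# (observations, actions, rewards) tuple of numpy arrays, so episode length
# and return can be read off directly. The env and policy function are
# assumed to exist; only the tuple layout is taken from the code above.
def summarize_trajectories(env, policy_fun):
  trajectories = collect_trajectories(
      env, policy_fun, num_trajectories=2, max_timestep=100,
      policy="categorical-sampling")
  for i, (observations, actions, rewards) in enumerate(trajectories):
    # actions is stacked per time-step, so its length is the episode length.
    logging.info("traj[%d]: %d steps, return %0.2f", i, actions.shape[0],
                 float(onp.sum(rewards)))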