def collect_samples(agents,
                    num_timesteps,
                    gamma,
                    lam,
                    horizon,
                    observation_filter=NoFilter(),
                    reward_filter=NoFilter()):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    traj_len_means = []
    while num_timesteps_so_far < num_timesteps:
        # Launch one rollout per agent and block until all of them finish.
        trajectory_batch = ray.get([
            agent.compute_trajectory.remote(gamma, lam, horizon)
            for agent in agents])
        trajectory = concatenate(trajectory_batch)
        trajectory = flatten(trajectory)
        not_done = np.logical_not(trajectory["dones"])
        # Average the total reward and trajectory length over the agents in
        # this batch.
        total_rewards.append(
            trajectory["raw_rewards"][not_done].sum(axis=0).mean() /
            len(agents))
        traj_len_means.append(not_done.sum(axis=0).mean() / len(agents))
        # Drop the steps that were collected after an episode had already
        # terminated.
        trajectory = {key: val[not_done] for key, val in trajectory.items()}
        num_timesteps_so_far += len(trajectory["dones"])
        trajectories.append(trajectory)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(traj_len_means))
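# The filters default to NoFilter, a pass-through that leaves observations
# and rewards unchanged. Below is a minimal sketch of such a filter, assuming
# a simple callable interface; this is an illustrative assumption, not the
# actual class used by the codebase.
class NoFilter(object):
    """Identity filter: returns its input unmodified."""

    def __call__(self, x, update=True):
        # Stateful filters use `update` to decide whether to refresh their
        # running statistics; a no-op filter simply ignores it.
        return x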
def testFlatten(self):
    d = {
        "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
        "a": np.array([[[5], [-5]], [[6], [-6]]])
    }
    flat = flatten(d.copy(), start=0, stop=2)
    assert_allclose(d["s"][0][0][:], flat["s"][0][:])
    assert_allclose(d["s"][0][1][:], flat["s"][1][:])
    assert_allclose(d["s"][1][0][:], flat["s"][2][:])
    assert_allclose(d["s"][1][1][:], flat["s"][3][:])
    assert_allclose(d["a"][0][0], flat["a"][0])
    assert_allclose(d["a"][0][1], flat["a"][1])
    assert_allclose(d["a"][1][0], flat["a"][2])
    assert_allclose(d["a"][1][1], flat["a"][3])
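# A sketch of a flatten helper consistent with the test above: it merges the
# dimensions in [start, stop) of every array in the dictionary, so with
# start=0, stop=2 a (batch, time, ...) array becomes (batch * time, ...).
# This is an illustrative assumption, not necessarily the real helper.
def flatten(dictionary, start=0, stop=2):
    for key, val in dictionary.items():
        new_shape = val.shape[0:start] + (-1,) + val.shape[stop:]
        dictionary[key] = val.reshape(new_shape)
    return dictionary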
def compute_steps(self, gamma, lam, horizon, min_steps_per_task=-1):
    """Compute multiple rollouts and concatenate the results.

    Args:
        gamma: MDP discount factor.
        lam: GAE(lambda) parameter.
        horizon: Number of steps after which a rollout gets cut.
        min_steps_per_task: Lower bound on the number of states to be
            collected.

    Returns:
        trajectory: Dictionary of concatenated trajectory data.
        total_rewards: Total rewards of the trajectories.
        trajectory_lengths: Lengths of the trajectories.
    """
    num_steps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    while True:
        trajectory = self.compute_trajectory(gamma, lam, horizon)
        total_rewards.append(trajectory["raw_rewards"].sum(axis=0).mean())
        trajectory_lengths.append(
            np.logical_not(trajectory["dones"]).sum(axis=0).mean())
        trajectory = flatten(trajectory)
        not_done = np.logical_not(trajectory["dones"])
        # Filter out steps that are already done. Trajectories are batched
        # and only cut once all trajectories in the batch have terminated,
        # so some of the collected states can be discarded here.
        trajectory = {key: val[not_done] for key, val in trajectory.items()}
        num_steps_so_far += trajectory["raw_rewards"].shape[0]
        trajectories.append(trajectory)
        if num_steps_so_far >= min_steps_per_task:
            break
    return concatenate(trajectories), total_rewards, trajectory_lengths
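# Both compute_steps and collect_samples rely on a concatenate helper that
# joins a list of trajectory dictionaries into one. Below is a minimal sketch
# consistent with that usage, assuming all trajectories share the same keys,
# their arrays are joined along the first axis, and numpy is imported as np
# as in the rest of this file (illustrative only, not the real helper):
def concatenate(trajectory_list):
    return {key: np.concatenate([t[key] for t in trajectory_list])
            for key in trajectory_list[0]}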
def collect_samples(agents,
                    num_timesteps,
                    gamma,
                    lam,
                    horizon,
                    observation_filter=NoFilter(),
                    reward_filter=NoFilter()):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    traj_len_means = []
    # This dictionary maps the object IDs of trajectories that are currently
    # being computed to the agents they are computed on; we start some
    # initial tasks here.
    agent_dict = {agent.compute_trajectory.remote(gamma, lam, horizon): agent
                  for agent in agents}
    while num_timesteps_so_far < num_timesteps:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to a list here.
        [next_trajectory], waiting_trajectories = ray.wait(
            list(agent_dict.keys()))
        agent = agent_dict.pop(next_trajectory)
        # Start a new task on the agent that just finished and record it in
        # the dictionary.
        agent_dict[agent.compute_trajectory.remote(gamma, lam, horizon)] = (
            agent)
        trajectory = flatten(ray.get(next_trajectory))
        not_done = np.logical_not(trajectory["dones"])
        total_rewards.append(
            trajectory["raw_rewards"][not_done].sum(axis=0).mean())
        traj_len_means.append(not_done.sum(axis=0).mean())
        # Drop the steps that were collected after an episode had already
        # terminated.
        trajectory = {key: val[not_done] for key, val in trajectory.items()}
        num_timesteps_so_far += len(trajectory["dones"])
        trajectories.append(trajectory)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(traj_len_means))
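# The ray.wait loop above pipelines rollout collection: rather than blocking
# on all agents as in the earlier version of collect_samples, it processes
# whichever rollout finishes first and immediately hands that agent a new
# task. A self-contained sketch of the same pattern with a toy remote
# function (names and timings are illustrative only):
import time

import ray


@ray.remote
def rollout(worker_id):
    # Stand-in for a rollout whose duration varies per worker.
    time.sleep(0.01 * (worker_id + 1))
    return worker_id


if __name__ == "__main__":
    ray.init()
    # Start one task per worker, keyed by the returned object ID.
    pending = {rollout.remote(i): i for i in range(4)}
    results = []
    while len(results) < 8:
        # Block until the first pending task finishes, then immediately
        # resubmit a task for that worker before processing the result.
        [ready], _ = ray.wait(list(pending.keys()))
        worker_id = pending.pop(ready)
        pending[rollout.remote(worker_id)] = worker_id
        results.append(ray.get(ready))
    print(results)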