def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(self.config["sample_batch_size"]):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == self.config["sample_batch_size"]

    # Prioritize on the worker side
    if self.config["worker_side_prioritization"]:
        td_errors = self.ddpg_graph.compute_td_error(
            self.sess, obs, batch["actions"], batch["rewards"], new_obs,
            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

def testConcat(self):
    b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
    b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
    b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})

    b12 = b1.concat(b2)
    self.assertEqual(b12.data["a"].tolist(), [1, 2, 3, 1])
    self.assertEqual(b12.data["b"].tolist(), [4, 5, 6, 4])

    b = SampleBatch.concat_samples([b1, b2, b3])
    self.assertEqual(b.data["a"].tolist(), [1, 2, 3, 1, 1])
    self.assertEqual(b.data["b"].tolist(), [4, 5, 6, 4, 5])

def sample(self, no_replay=False):
    # First seed the replay buffer with a few new samples
    if self.workers:
        weights = ray.put(self.get_weights())
        for w in self.workers:
            w.set_weights.remote(weights)
        samples = ray.get([w.sample.remote() for w in self.workers])
    else:
        samples = [DQNEvaluator.sample(self)]

    for s in samples:
        for row in s.rows():
            self.replay_buffer.add(
                row["obs"], row["actions"], row["rewards"], row["new_obs"],
                row["dones"])

    if no_replay:
        return samples

    # Then return a batch sampled from the buffer
    if self.config["prioritized_replay"]:
        (obses_t, actions, rewards, obses_tp1, dones, weights,
         batch_indexes) = self.replay_buffer.sample(
             self.config["train_batch_size"],
             beta=self.beta_schedule.value(self.global_timestep))
        self._update_priorities_if_needed()
        batch = SampleBatch({
            "obs": obses_t, "actions": actions, "rewards": rewards,
            "new_obs": obses_tp1, "dones": dones, "weights": weights,
            "batch_indexes": batch_indexes})
        self.samples_to_prioritize = batch
    else:
        obses_t, actions, rewards, obses_tp1, dones = \
            self.replay_buffer.sample(self.config["train_batch_size"])
        batch = SampleBatch({
            "obs": obses_t, "actions": actions, "rewards": rewards,
            "new_obs": obses_tp1, "dones": dones,
            "weights": np.ones_like(rewards)})
    return batch

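# A minimal sketch of the priority update assumed to happen in
# _update_priorities_if_needed() above: once TD errors are available for the
# most recently sampled prioritized batch, |TD error| + eps is written back
# into the buffer for the sampled row indexes. The helper name and exact
# wiring below are illustrative, not the actual method.
import numpy as np

def _update_priorities_sketch(replay_buffer, batch, td_errors, eps):
    new_priorities = np.abs(td_errors) + eps
    replay_buffer.update_priorities(batch["batch_indexes"], new_priorities)
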
def sample(self):
    samples_dict = {"observations": [], "rewards": []}
    for _ in range(self._sample_count):
        samples_dict["observations"].append(
            self.obs_filter(np.random.randn()))
        samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
    return SampleBatch(samples_dict)

def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)

def compute_steps(self, config, obs_filter, rew_filter):
    """Compute multiple rollouts and concatenate the results.

    Args:
        config: Configuration parameters.
        obs_filter: Function that is applied to each of the observations.
        rew_filter: Function that is applied to each of the rewards.

    Returns:
        A tuple of (concatenated SampleBatch of trajectories, total rewards
        of the trajectories, lengths of the trajectories, updated observation
        filter, updated reward filter).
    """
    num_steps_so_far = 0
    trajectories = []
    self.update_filters(obs_filter, rew_filter)

    while num_steps_so_far < config["min_steps_per_task"]:
        rollout = self.sampler.get_data()
        trajectory = process_rollout(
            rollout, self.reward_filter, config["gamma"], config["lambda"],
            use_gae=config["use_gae"])
        num_steps_so_far += trajectory["rewards"].shape[0]
        trajectories.append(trajectory)
    metrics = self.sampler.get_metrics()
    total_rewards, trajectory_lengths = zip(
        *[(c.episode_reward, c.episode_length) for c in metrics])
    updated_obs_filter = self.sampler.get_obs_filter(flush=True)
    return (
        SampleBatch.concat_samples(trajectories),
        total_rewards,
        trajectory_lengths,
        updated_obs_filter,
        self.reward_filter)

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(
            self.config["n_step"], self.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == self.config["sample_batch_size"]
    return batch

def collect_samples(agents, config, observation_filter, reward_filter):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {
        agent.compute_steps.remote(config, observation_filter, reward_filter):
            agent
        for agent in agents}

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(next_trajectory)
        # Start task with next trajectory and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(
            config, observation_filter, reward_filter)] = agent

        trajectory, rewards, lengths, obs_f, rew_f = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += sum(lengths)
        trajectories.append(trajectory)
        observation_filter.update(obs_f)
        reward_filter.update(rew_f)
    return (SampleBatch.concat_samples(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(self.config["sample_batch_size"]):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)
    return SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(
            self.config["n_step"], self.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": [pack(np.array(o)) for o in obs],
        "actions": actions,
        "rewards": rewards,
        "new_obs": [pack(np.array(o)) for o in new_obs],
        "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == self.config["sample_batch_size"]

    # Prioritize on the worker side
    if self.config["worker_side_prioritization"]:
        td_errors = self.ddpg_graph.compute_td_error(
            self.sess, obs, batch["actions"], batch["rewards"], new_obs,
            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(
            self.config["n_step"], self.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": [pack(np.array(o)) for o in obs],
        "actions": actions,
        "rewards": rewards,
        "new_obs": [pack(np.array(o)) for o in new_obs],
        "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == self.config["sample_batch_size"]

    # Prioritize on the worker side
    if self.config["worker_side_prioritization"]:
        td_errors = self.dqn_graph.compute_td_error(
            self.sess, obs, batch["actions"], batch["rewards"], new_obs,
            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

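# A minimal sketch of what the adjust_nstep() call above is assumed to do:
# fold the next (n_step - 1) rewards into each transition, advance its end
# state, and drop the trailing transitions that lack enough lookahead, so the
# batch ends up with exactly sample_batch_size rows (as the asserts require).
# The real helper may handle episode boundaries (`dones`) differently.
def adjust_nstep_sketch(n_step, gamma, obs, actions, rewards, new_obs, dones):
    num_valid = len(rewards) - (n_step - 1)
    for i in range(num_valid):
        for j in range(1, n_step):
            if dones[i + j - 1]:
                break  # do not accumulate rewards across an episode boundary
            rewards[i] += gamma ** j * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
    # Truncate in place so len(obs) == sample_batch_size.
    del obs[num_valid:], actions[num_valid:], rewards[num_valid:]
    del new_obs[num_valid:], dones[num_valid:]
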
def testConcat(self):
    b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
    b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
    b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})

    b12 = b1.concat(b2)
    self.assertEqual(b12["a"].tolist(), [1, 2, 3, 1])
    self.assertEqual(b12["b"].tolist(), [4, 5, 6, 4])

    b = SampleBatch.concat_samples([b1, b2, b3])
    self.assertEqual(b["a"].tolist(), [1, 2, 3, 1, 1])
    self.assertEqual(b["b"].tolist(), [4, 5, 6, 4, 5])

def sample(self):
    """Returns experience samples from this Evaluator.

    Observation and reward filters are flushed here.

    Returns:
        SampleBatch: A columnar batch of experiences.
    """
    num_steps_so_far = 0
    all_samples = []

    while num_steps_so_far < self.config["min_steps_per_task"]:
        rollout = self.sampler.get_data()
        samples = process_rollout(
            rollout, self.rew_filter, self.config["gamma"],
            self.config["lambda"], use_gae=self.config["use_gae"])
        num_steps_so_far += samples.count
        all_samples.append(samples)
    return SampleBatch.concat_samples(all_samples)

def sample(self):
    """Returns experience samples from this Evaluator.

    Observation and reward filters are flushed here.

    Returns:
        SampleBatch: A columnar batch of experiences.
    """
    num_steps_so_far = 0
    all_samples = []

    while num_steps_so_far < self.config["min_steps_per_task"]:
        rollout = self.sampler.get_data()
        last_r = 0.0  # note: not needed since we don't truncate rollouts
        samples = compute_advantages(
            rollout, last_r, self.config["gamma"], self.config["lambda"],
            use_gae=self.config["use_gae"])
        num_steps_so_far += samples.count
        all_samples.append(samples)
    return SampleBatch.concat_samples(all_samples)

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(self.config["sample_batch_size"]):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)
    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones})
    assert batch.count == self.config["sample_batch_size"]
    return batch

def sample(self):
    """Evaluate the current policies and return a batch of experiences.

    Returns:
        SampleBatch from evaluating the current policies.
    """
    batches = [self.sampler.get_data()]
    steps_so_far = batches[0].count
    while steps_so_far < self.batch_steps:
        batch = self.sampler.get_data()
        steps_so_far += batch.count
        batches.append(batch)
    batch = SampleBatch.concat_samples(batches)
    if self.compress_observations:
        batch["obs"] = [pack(o) for o in batch["obs"]]
        batch["new_obs"] = [pack(o) for o in batch["new_obs"]]
    return batch

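# The pack() helper used above is assumed to compress individual observations
# so that large batches ship cheaply between processes. Below is a
# hypothetical stand-in pair; the real helper may use a different serializer
# and codec (e.g. LZ4) than the zlib/pickle combination sketched here.
import base64
import pickle
import zlib

def pack_sketch(obj):
    return base64.b64encode(zlib.compress(pickle.dumps(obj)))

def unpack_sketch(data):
    return pickle.loads(zlib.decompress(base64.b64decode(data)))
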
def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantages.

    Args:
        rollout (PartialRollout): Partial rollout object.
        reward_filter (Filter): Filter applied to the computed advantages.
        gamma (float): Discount factor.
        lambda_ (float): Parameter for GAE.
        use_gae (bool): Whether to use Generalized Advantage Estimation.

    Returns:
        SampleBatch: Object with experience from rollout and
            processed rewards.
    """
    traj = {}
    trajsize = len(rollout.data["actions"])
    for key in rollout.data:
        traj[key] = np.stack(rollout.data[key])

    if use_gae:
        assert "vf_preds" in rollout.data, "Values not found!"
        vpred_t = np.stack(
            rollout.data["vf_preds"] + [np.array(rollout.last_r)]).squeeze()
        delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
    else:
        rewards_plus_v = np.stack(
            rollout.data["rewards"] + [np.array(rollout.last_r)]).squeeze()
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]

    for i in range(traj["advantages"].shape[0]):
        traj["advantages"][i] = reward_filter(traj["advantages"][i])
    traj["advantages"] = traj["advantages"].copy()

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)

def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantages.

    Args:
        rollout (PartialRollout): Partial rollout object.
        last_r (float): Value estimate for the last observation.
        gamma (float): Discount factor.
        lambda_ (float): Parameter for GAE.
        use_gae (bool): Whether to use Generalized Advantage Estimation.

    Returns:
        SampleBatch: Object with experience from rollout and
            processed rewards.
    """
    traj = {}
    trajsize = len(rollout["actions"])
    for key in rollout:
        traj[key] = np.stack(rollout[key])

    if use_gae:
        assert "vf_preds" in rollout, "Values not found!"
        vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
        delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = (
            traj["advantages"] + traj["vf_preds"]).copy().astype(np.float32)
    else:
        rewards_plus_v = np.concatenate(
            [rollout["rewards"], np.array([last_r])])
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]

    traj["advantages"] = traj["advantages"].copy().astype(np.float32)

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)

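# A sketch of the discount() helper assumed by process_rollout() and
# compute_advantages() above: a reverse discounted cumulative sum,
# y[t] = x[t] + gamma * y[t + 1], which turns the per-step deltas into GAE
# advantages (or rewards into returns when use_gae is False). The actual
# helper may be vectorized (e.g. via a linear filter) rather than a loop.
import numpy as np

def discount_sketch(x, gamma):
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out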