Example #1
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"]):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)
        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.ddpg_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"], new_obs,
                batch["dones"], batch["weights"])
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities
        return batch
Example #2
    def testConcat(self):
        b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
        b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
        b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})
        b12 = b1.concat(b2)
        self.assertEqual(b12.data["a"].tolist(), [1, 2, 3, 1])
        self.assertEqual(b12.data["b"].tolist(), [4, 5, 6, 4])
        b = SampleBatch.concat_samples([b1, b2, b3])
        self.assertEqual(b.data["a"].tolist(), [1, 2, 3, 1, 1])
        self.assertEqual(b.data["b"].tolist(), [4, 5, 6, 4, 5])
Example #3
    def sample(self, no_replay=False):
        # First seed the replay buffer with a few new samples
        if self.workers:
            weights = ray.put(self.get_weights())
            for w in self.workers:
                w.set_weights.remote(weights)
            samples = ray.get([w.sample.remote() for w in self.workers])
        else:
            samples = [DQNEvaluator.sample(self)]

        for s in samples:
            for row in s.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"])

        if no_replay:
            return samples

        # Then return a batch sampled from the buffer
        if self.config["prioritized_replay"]:
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.config["train_batch_size"],
                 beta=self.beta_schedule.value(self.global_timestep))
            self._update_priorities_if_needed()
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            self.samples_to_prioritize = batch
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": np.ones_like(rewards)
            })
        return batch
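
Two details of the prioritized branch above are worth noting: the "weights" column carries importance-sampling weights derived from the priorities, and beta_schedule anneals the importance-sampling exponent over training. The schedule object itself is not part of these excerpts; below is a minimal linear schedule with the same value(t) interface. Its shape is an assumption for illustration, not the actual schedule class.

class LinearScheduleSketch:
    # Sketch of a beta schedule exposing the value(t) interface used above;
    # an assumption about its shape, not the actual schedule class.
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Linear interpolation from initial_p to final_p, then constant.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)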
Example #4
    def sample(self):
        samples_dict = {"observations": [], "rewards": []}
        for _ in range(self._sample_count):
            samples_dict["observations"].append(
                self.obs_filter(np.random.randn()))
            samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
        return SampleBatch(samples_dict)
Example #5
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.

    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)
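
collect_samples keeps every agent busy: ray.wait blocks until one in-flight sample task finishes, and a replacement task is launched on that agent before the finished result is even fetched. The same pattern in a self-contained toy form (ToyAgent and the batch size of 10 are hypothetical stand-ins):

import ray

@ray.remote
class ToyAgent:
    # Hypothetical actor standing in for a sampling agent.
    def sample(self):
        return 10  # pretend this is a batch of 10 timesteps

ray.init()
agents = [ToyAgent.remote() for _ in range(4)]
pending = {agent.sample.remote(): agent for agent in agents}

timesteps = 0
while timesteps < 100:
    # Block until any one task finishes, then immediately refill that agent
    # with a new task so all agents stay busy.
    [ready], _ = ray.wait(list(pending))
    agent = pending.pop(ready)
    pending[agent.sample.remote()] = agent
    timesteps += ray.get(ready)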
Example #6
    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters
            obs_filter: Function that is applied to each of the
                observations.
            rew_filter: Function that is applied to each of the rewards.

        Returns:
            A tuple of (SampleBatch of the concatenated trajectories,
            total rewards of the trajectories, lengths of the
            trajectories, the updated observation filter, and the
            reward filter).
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(rollout,
                                         self.reward_filter,
                                         config["gamma"],
                                         config["lambda"],
                                         use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[(c.episode_reward,
                                                   c.episode_length)
                                                  for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (SampleBatch.concat_samples(trajectories), total_rewards,
                trajectory_lengths, updated_obs_filter, self.reward_filter)
Example #7
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"] +
                       self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]
        return batch
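
The helper adjust_nstep is not included in these excerpts. Because the loop over-collects n_step - 1 extra transitions and the assertion afterwards requires exactly sample_batch_size rows, the helper must fold future rewards into each step and truncate the tail. Below is a sketch consistent with that contract; it is an illustration, not the library's actual implementation.

def adjust_nstep_sketch(n_step, gamma, obs, actions, rewards, new_obs, dones):
    # Illustrative n-step rewrite consistent with the usage above;
    # a sketch, not the actual adjust_nstep.
    traj_length = len(rewards) - n_step + 1
    for i in range(traj_length):
        for j in range(1, n_step):
            if dones[i + j - 1]:
                break  # stop accumulating across episode boundaries
            rewards[i] += gamma ** j * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
    # Drop the trailing steps so the batch is exactly sample_batch_size.
    del obs[traj_length:], actions[traj_length:], rewards[traj_length:]
    del new_obs[traj_length:], dones[traj_length:]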
Example #8
def collect_samples(agents, config, observation_filter, reward_filter):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {
        agent.compute_steps.remote(config, observation_filter, reward_filter):
        agent
        for agent in agents
    }
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(next_trajectory)
        # Start task with next trajectory and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(config, observation_filter,
                                              reward_filter)] = agent
        trajectory, rewards, lengths, obs_f, rew_f = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += sum(lengths)
        trajectories.append(trajectory)
        observation_filter.update(obs_f)
        reward_filter.update(rew_f)
    return (SampleBatch.concat_samples(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
Example #9
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"]):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)
        return SampleBatch({
            "obs": obs, "actions": actions, "rewards": rewards,
            "new_obs": new_obs, "dones": dones,
            "weights": np.ones_like(rewards)})
Example #10
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs],
            "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs],
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.ddpg_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"], new_obs,
                batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch
Example #11
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert batch.count == self.config["sample_batch_size"]

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch
Example #12
    def testConcat(self):
        b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
        b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
        b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})
        b12 = b1.concat(b2)
        self.assertEqual(b12["a"].tolist(), [1, 2, 3, 1])
        self.assertEqual(b12["b"].tolist(), [4, 5, 6, 4])
        b = SampleBatch.concat_samples([b1, b2, b3])
        self.assertEqual(b["a"].tolist(), [1, 2, 3, 1, 1])
        self.assertEqual(b["b"].tolist(), [4, 5, 6, 4, 5])
Example #13
    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)
Example #14
    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            last_r = 0.0  # note: not needed since we don't truncate rollouts
            samples = compute_advantages(
                rollout, last_r, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)
Example #15
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"]):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones
        })
        assert batch.count == self.config["sample_batch_size"]
        return batch
Example #16
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Returns:
            SampleBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = SampleBatch.concat_samples(batches)

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
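
The pack helper used here (and in the worker-side samplers above) compresses observations before they cross process boundaries; its definition is not part of these excerpts. Below is a minimal stand-in pair with the same one-argument shape, built on zlib and pickle. The real helper may well use a different codec, so treat this purely as an illustration.

import pickle
import zlib

def pack_sketch(obj):
    # Serialize and compress a single observation; illustration only.
    return zlib.compress(pickle.dumps(obj))

def unpack_sketch(blob):
    # Inverse of pack_sketch.
    return pickle.loads(zlib.decompress(blob))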
Example #17
def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (PartialRollout): Partial rollout object
        reward_filter (Filter): Filter for processing advantages
        gamma (float): Parameter for GAE
        lambda_ (float): Parameter for GAE
        use_gae (bool): Whether to use Generalized Advantage Estimation

    Returns:
        SampleBatch (SampleBatch): Object with experience from rollout and
            processed rewards."""

    traj = {}
    trajsize = len(rollout.data["actions"])
    for key in rollout.data:
        traj[key] = np.stack(rollout.data[key])

    if use_gae:
        assert "vf_preds" in rollout.data, "Values not found!"
        vpred_t = np.stack(
            rollout.data["vf_preds"] + [np.array(rollout.last_r)]).squeeze()
        delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
    else:
        rewards_plus_v = np.stack(
            rollout.data["rewards"] + [np.array(rollout.last_r)]).squeeze()
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]

    for i in range(traj["advantages"].shape[0]):
        traj["advantages"][i] = reward_filter(traj["advantages"][i])

    traj["advantages"] = traj["advantages"].copy()

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
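
The discount helper called above is likewise not shown in these excerpts. The conventional implementation in policy-gradient codebases computes a reverse discounted cumulative sum with scipy.signal.lfilter; the sketch below assumes that convention.

import scipy.signal

def discount_sketch(x, gamma):
    # Reverse discounted cumulative sum: y[t] = sum_k gamma**k * x[t + k].
    # Assumed implementation of the `discount` helper used above.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]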
Example #18
def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (PartialRollout): Partial Rollout Object
        last_r (float): Value estimation for last observation
        gamma (float): Parameter for GAE
        lambda_ (float): Parameter for GAE
        use_gae (bool): Whether to use Generalized Advantage Estimation

    Returns:
        SampleBatch (SampleBatch): Object with experience from rollout and
            processed rewards.
    """

    traj = {}
    trajsize = len(rollout["actions"])
    for key in rollout:
        traj[key] = np.stack(rollout[key])

    if use_gae:
        assert "vf_preds" in rollout, "Values not found!"
        vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
        delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = (traj["advantages"] +
                                 traj["vf_preds"]).copy().astype(np.float32)
    else:
        rewards_plus_v = np.concatenate(
            [rollout["rewards"], np.array([last_r])])
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]

    traj["advantages"] = traj["advantages"].copy().astype(np.float32)

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
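
To make the shapes concrete, here is a hypothetical three-step invocation of compute_advantages. The rollout keys match those the function reads; the values are made up for illustration.

import numpy as np

rollout = {
    "obs": np.zeros((3, 4)),          # three observations of dim 4
    "actions": np.array([0, 1, 0]),
    "rewards": np.array([1.0, 1.0, 1.0]),
    "vf_preds": np.array([0.5, 0.5, 0.5]),
}
batch = compute_advantages(rollout, last_r=0.0, gamma=0.99,
                           lambda_=0.95, use_gae=True)
# The returned SampleBatch gains "advantages" and "value_targets" columns,
# each of length 3, alongside the original keys.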