Example #1
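# Postprocess a DQN sample batch: apply the n-step return adjustment and, when
# worker-side prioritization is enabled, use TD-error-based priorities as weights.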
def _postprocess_dqn(policy_graph, sample_batch):
    obs, actions, rewards, new_obs, dones = [
        list(x) for x in sample_batch.columns(
            ["obs", "actions", "rewards", "new_obs", "dones"])]

    # N-step Q adjustments
    if policy_graph.config["n_step"] > 1:
        adjust_nstep(
            policy_graph.config["n_step"], policy_graph.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})

    # Prioritize on the worker side
    if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
        td_errors = policy_graph.compute_td_error(
            batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch
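
A side note on Example #1: the adjust_nstep helper it calls (also used in Examples #6 and #7 below) is not part of this listing. The following is a rough, hypothetical sketch of the kind of in-place rewrite such a helper performs, assuming the standard n-step return and the truncation of the trailing n_step - 1 transitions that the sampling examples below account for; RLlib's actual implementation may differ.

def nstep_rewrite_sketch(n_step, gamma, obs, actions, rewards, new_obs, dones):
    # Hypothetical illustration only (not RLlib's adjust_nstep): fold the next
    # n_step - 1 discounted rewards into rewards[i], advance new_obs/dones so
    # each transition spans up to n_step environment steps, then drop the
    # trailing transitions that lack enough future steps ("truncation").
    T = len(rewards)
    for i in range(T):
        if dones[i]:
            continue  # terminal transition: nothing to fold in
        for j in range(1, n_step):
            if i + j >= T:
                break  # ran off the end of the sampled fragment
            rewards[i] += gamma ** j * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
            if dones[i]:
                break  # stop folding at an episode boundary
    # Keep only the first T - (n_step - 1) transitions, mirroring the
    # "steps lost from truncation" bookkeeping in Examples #6 and #7.
    for lst in (obs, actions, rewards, new_obs, dones):
        del lst[T - (n_step - 1):]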
Example #2
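    # One training step: draw a batch from the (optionally prioritized) replay
    # buffer, apply gradients locally, and feed the new TD errors back as priorities.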
    def _optimize(self):
        with self.replay_timer:
            if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_indexes) = self.replay_buffer.sample(
                     self.train_batch_size, beta=self.prioritized_replay_beta)
            else:
                (obses_t, actions, rewards, obses_tp1,
                 dones) = self.replay_buffer.sample(self.train_batch_size)
                weights = np.ones_like(rewards)
                batch_indexes = -np.ones_like(rewards)

            samples = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })

        with self.grad_timer:
            td_error = self.local_evaluator.compute_apply(samples)
            new_priorities = (np.abs(td_error) + self.prioritized_replay_eps)
            if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
                self.replay_buffer.update_priorities(samples["batch_indexes"],
                                                     new_priorities)
            self.grad_timer.push_units_processed(samples.count)

        self.num_steps_trained += samples.count
Example #3
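    # Multi-GPU SGD step: broadcast weights to the remote evaluators, gather
    # samples, load them onto the devices, and run several epochs of shuffled
    # minibatch SGD.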
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

        with self.load_timer:
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess,
                samples.columns([key for key, _ in self.loss_inputs]))

        with self.grad_timer:
            for i in range(self.config.get("num_sgd_iter", 10)):
                batch_index = 0
                num_batches = (int(tuples_per_device) //
                               int(self.per_device_batch_size))
                permutation = np.random.permutation(num_batches)
                while batch_index < num_batches:
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    self.par_opt.optimize(
                        self.sess,
                        permutation[batch_index] * self.per_device_batch_size)
                    batch_index += 1
Example #4
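    # Replay-based step: broadcast weights, gather samples, normalize them to a
    # MultiAgentBatch, insert each row into the per-policy replay buffers, and
    # optimize once enough steps have been sampled.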
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                batch = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                batch = self.local_evaluator.sample()

            # Handle everything as if multiagent
            if isinstance(batch, SampleBatch):
                batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch},
                                        batch.count)

            for policy_id, s in batch.policy_batches.items():
                for row in s.rows():
                    if "weights" not in row:
                        row["weights"] = np.ones_like(row["rewards"])
                    self.replay_buffers[policy_id].add(
                        pack_if_needed(row["obs"]),
                        row["actions"], row["rewards"],
                        pack_if_needed(row["new_obs"]), row["dones"],
                        row["weights"])

        if self.num_steps_sampled >= self.replay_starts:
            self._optimize()

        self.num_steps_sampled += batch.count
Example #5
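# Keep one in-flight sample task per agent and collect results until the
# configured number of timesteps has been gathered.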
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.

    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)

    return SampleBatch.concat_samples(trajectories)
Example #6
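    # Roll out the agent for one sample batch, apply the n-step adjustment, and
    # attach TD-error-based priorities as the batch weights.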
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"] +
                       self.config["n_step"] - 1):
            action = self.agent.act(self.state)
            next_state, reward, done, _ = self.env.step(action)
            next_state = to_rainbow(next_state)
            obs.append(self.state.data.cpu().numpy())
            actions.append(action)
            rewards.append(reward)
            new_obs.append(next_state.data.cpu().numpy())
            dones.append(1.0 if done else 0.0)
            self.state = next_state
            self.episode_rewards[-1] += reward
            self.episode_lengths[-1] += 1
            if done:
                self.state = to_rainbow(self.env.reset())
                self.agent.reset_noise()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            self.local_timestep += 1

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]

        td_errors = self.agent.compute_td_error(batch)
        batch.data["obs"] = [pack(o) for o in batch["obs"]]
        batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
        new_priorities = (np.abs(td_errors) +
                          self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

        return batch
Example #7
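    # Roll out an epsilon-greedy policy for one sample batch, clip rewards to
    # their sign, apply the n-step adjustment, and pack observations before
    # returning the batch.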
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            update_eps = self.exploration.value(self.local_timestep)
            action = self.act(
                np.array(self.obs)[None], update_eps=update_eps)[0]
            obs_tp1, reward, done, _ = self.env.step(action)
            obs.append(self.obs)
            actions.append(action)
            rewards.append(np.sign(reward))
            new_obs.append(obs_tp1)
            dones.append(1.0 if done else 0.0)
            self.obs = obs_tp1
            self.episode_rewards[-1] += reward
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            self.local_timestep += 1

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs, "actions": actions, "rewards": rewards,
            "new_obs": new_obs, "dones": dones,
            "weights": np.ones_like(rewards)})
        assert batch.count == self.config["sample_batch_size"]

#        td_errors = self.agent.compute_td_error(batch)
        batch.data["obs"] = [pack(o) for o in batch["obs"]]
        batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
#        new_priorities = (
#            np.abs(td_errors) + self.config["prioritized_replay_eps"])
#        batch.data["weights"] = new_priorities

        return batch
Example #8
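    # Draw a prioritized training batch from the replay buffer once the warm-up
    # threshold has been reached.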
    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1,
                dones, weights, batch_indexes) = self.replay_buffer.sample(
                    self.train_batch_size,
                    beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t, "actions": actions, "rewards": rewards,
                "new_obs": obses_tp1, "dones": dones, "weights": weights,
                "batch_indexes": batch_indexes})
            return batch
Example #9
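    # Simple synchronous step: broadcast weights, gather samples, then compute
    # and apply gradients on the local evaluator.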
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()

        with self.grad_timer:
            grad = self.local_evaluator.compute_gradients(samples)
            self.local_evaluator.apply_gradients(grad)
Example #10
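    # Same synchronous step, additionally timing the gradient computation and
    # updating the sampled/trained step counters.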
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()

        with self.grad_timer:
            grad, _ = self.local_evaluator.compute_gradients(samples)
            self.local_evaluator.apply_gradients(grad)
            self.grad_timer.push_units_processed(samples.count)

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
Example #11
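    # Variant of the synchronous step above in which compute_gradients returns
    # only the gradients.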
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()

        with self.grad_timer:
            grad = self.local_evaluator.compute_gradients(samples)
            self.local_evaluator.apply_gradients(grad)
            self.grad_timer.push_units_processed(samples.count)

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
Example #12
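    # Replay-based step with a single shared buffer: broadcast weights, gather
    # samples, add each row to the replay buffer, and optimize once the buffer
    # holds enough samples.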
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                batch = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                batch = self.local_evaluator.sample()
            for row in batch.rows():
                self.replay_buffer.add(
                    row["obs"], row["actions"], row["rewards"], row["new_obs"],
                    row["dones"], row["weights"])

        if len(self.replay_buffer) >= self.replay_starts:
            self._optimize()

        self.num_steps_sampled += batch.count
Example #13
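    # Same replay-based step as above, differing only in call formatting.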
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                batch = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                batch = self.local_evaluator.sample()
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

        if len(self.replay_buffer) >= self.replay_starts:
            self._optimize()

        self.num_steps_sampled += batch.count
Example #14
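    # Multi-GPU SGD step as in Example #3, additionally tracking the
    # sampled/trained step counters.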
    def step(self):
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

        with self.load_timer:
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess,
                samples.columns([key for key, _ in self.loss_inputs]))

        with self.grad_timer:
            for i in range(self.num_sgd_iter):
                batch_index = 0
                num_batches = (
                    int(tuples_per_device) // int(self.per_device_batch_size))
                permutation = np.random.permutation(num_batches)
                while batch_index < num_batches:
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    self.par_opt.optimize(
                        self.sess,
                        permutation[batch_index] * self.per_device_batch_size)
                    batch_index += 1

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
Example #15
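    # Sample a training batch from each per-policy replay buffer and combine
    # them into a MultiAgentBatch.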
    def _replay(self):
        samples = {}
        with self.replay_timer:
            for policy_id, replay_buffer in self.replay_buffers.items():
                if isinstance(replay_buffer, PrioritizedReplayBuffer):
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_indexes) = replay_buffer.sample(
                         self.train_batch_size,
                         beta=self.prioritized_replay_beta)
                else:
                    (obses_t, actions, rewards, obses_tp1,
                     dones) = replay_buffer.sample(self.train_batch_size)
                    weights = np.ones_like(rewards)
                    batch_indexes = -np.ones_like(rewards)
                samples[policy_id] = SampleBatch({
                    "obs": obses_t,
                    "actions": actions,
                    "rewards": rewards,
                    "new_obs": obses_tp1,
                    "dones": dones,
                    "weights": weights,
                    "batch_indexes": batch_indexes
                })
        return MultiAgentBatch(samples, self.train_batch_size)
Example #16
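    # Roll out the policy network for a fixed number of steps, bootstrap the
    # value of the final state if the episode is still running, and compute
    # discounted returns for the batch.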
    def sample(self):
        """sample rollouts from the environment, being called in step in PolicyOptimizer"""
        observations, rewards, actions, logprobs, dones, values = [], [], [], [], [], []
        done = False
        for step in range(self.config['steps_per_rollout']):
            value, action, logprob, mean = self.net.forward(
                to_variable(self.obs[np.newaxis], self.config['cuda']))
            action = (action.cpu().data.numpy()[0]
                      if self.config['cuda'] else action.data.numpy()[0])
            next_obs, reward, done, _ = self.env.step(action)

            if self.config['cuda']:
                # torch has an additional dimension for batch size, so we need to select that batch
                value, logprob, mean = value.data.cpu().numpy()[0], logprob.data.cpu().numpy()[0], \
                                       mean.data.cpu().numpy()[0]
            else:
                value, logprob, mean = value.data.numpy()[0], logprob.data.numpy()[0], \
                                       mean.data.numpy()[0]

            observations.append(self.obs)
            actions.append(action)
            rewards.append(reward)
            logprobs.append(logprob)
            values.append(value)
            dones.append(done)

            self.obs = next_obs

            if done:
                # reset the environment
                self.obs = self.env.reset()

        if done:
            last_value = 0.0
        else:
            # bootstrap, we only need the last value to do this
            value, action, logprob, mean = self.net.forward(
                to_variable(self.obs[np.newaxis], self.config['cuda']))

            if self.config['cuda']:
                # torch has an additional dimension for batch size, so we need to select that batch
                value, = value.data.cpu().numpy()[0]
            else:
                value, = value.data.numpy()[0]
            last_value = value

        # same as ppo_single/model/ppo.py
        observations = np.asarray(observations)
        rewards = np.asarray(rewards)
        logprobs = np.asarray(logprobs)
        dones = np.asarray(dones)
        values = np.asarray(values)
        actions = np.asarray(actions)
        returns = calculate_returns(rewards, dones, last_value,
                                    self.config['gamma'])
        return SampleBatch({
            'observations': observations,
            'rewards': rewards,
            'logprobs': logprobs,
            'dones': dones,
            'values': values,
            'actions': actions,
            'returns': returns[:-1]
        })
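
The calculate_returns helper comes from the surrounding project and is not shown in this listing. Below is a minimal sketch, consistent with how it is called above (bootstrapping from last_value, discounting by gamma, cutting the return at episode boundaries, and producing T + 1 values so that returns[:-1] lines up with the T collected steps); the project's real implementation may differ.

import numpy as np

def calculate_returns(rewards, dones, last_value, gamma):
    # Hypothetical sketch matching the call site in Example #16, not the
    # project's actual helper.
    T = len(rewards)
    returns = np.zeros(T + 1, dtype=np.float32)
    returns[-1] = last_value  # bootstrap value of the state after the rollout
    for t in reversed(range(T)):
        # Propagate the discounted return backwards, zeroing it across
        # episode boundaries.
        nonterminal = 1.0 - float(dones[t])
        returns[t] = rewards[t] + gamma * returns[t + 1] * nonterminal
    return returns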