Example #1
    def from_batch(self,
                   train_batch: SampleBatch,
                   is_training: bool = True) -> (TensorType, List[TensorType]):
        """Convenience function that calls this model with a tensor batch.

        All this does is unpack the tensor batch to call this model with the
        right input dict, state, and seq len arguments.
        """

        input_dict = {
            "obs": train_batch[SampleBatch.CUR_OBS],
            "is_training": is_training,
        }
        if SampleBatch.PREV_ACTIONS in train_batch:
            input_dict["prev_actions"] = train_batch[SampleBatch.PREV_ACTIONS]
        if SampleBatch.PREV_REWARDS in train_batch:
            input_dict["prev_rewards"] = train_batch[SampleBatch.PREV_REWARDS]
        states = []
        i = 0
        while "state_in_{}".format(i) in train_batch:
            states.append(train_batch["state_in_{}".format(i)])
            i += 1
        return self.__call__(input_dict, states, train_batch.get("seq_lens"))
Example #2
File: lambdas.py  Project: parasj/ray
def flatten_data(data: AgentConnectorsOutput):
    assert isinstance(
        data, AgentConnectorsOutput
    ), "Single agent data must be of type AgentConnectorsOutput"

    for_training = data.for_training
    for_action = data.for_action

    flattened = {}
    for k, v in for_action.items():
        if k in [SampleBatch.INFOS, SampleBatch.ACTIONS
                 ] or k.startswith("state_out_"):
            # Do not flatten infos, actions, and state_out_ columns.
            flattened[k] = v
            continue
        if v is None:
            # Keep the same column shape.
            flattened[k] = None
            continue
        flattened[k] = np.array(tree.flatten(v))
    flattened = SampleBatch(flattened, is_training=False)

    return AgentConnectorsOutput(for_training, flattened)
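Here `tree` refers to the dm-tree package. As a rough illustration of the flattening this connector relies on (the observation dict below is made up):

import numpy as np
import tree  # dm-tree

# dm-tree flattens a nested structure into its leaves (dicts in sorted key
# order); the connector then wraps the leaves into a single np.array column.
obs = {"pos": np.array([0.1, 0.2]), "vel": np.array([0.3, 0.4])}
leaves = tree.flatten(obs)
print(leaves)                  # [array([0.1, 0.2]), array([0.3, 0.4])]
print(np.array(leaves).shape)  # (2, 2)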
Example #3
    def add_postprocessed_batch_for_training(
            self, batch: SampleBatch,
            view_requirements: Dict[str, ViewRequirement]) -> None:
        """Adds a postprocessed SampleBatch (single agent) to our buffers.

        Args:
            batch (SampleBatch): A single agent (one trajectory) SampleBatch
                to be added to the Policy's buffers.
            view_requirements (Dict[str, ViewRequirement]): The view
                requirements for the policy. This is so we know whether a
                view-column needs to be copied at all (not needed for
                training).
        """
        for view_col, data in batch.items():
            # TODO(ekl) how do we handle this for policies that don't extend
            # Torch / TF Policy template (no inference of view reqs)?
            # Skip columns that are not used for training.
            # if view_col not in view_requirements or \
            #         not view_requirements[view_col].used_for_training:
            #     continue
            self.buffers[view_col].extend(data)
        # Add the agent's trajectory length to our count.
        self.count += batch.count
Example #4
    def test_n_step_from_same_obs_source_array(self):
        """Tests, whether n-step also works on a shared obs/new-obs array."""
        gamma = 0.99
        # The underlying observation data. Both obs and next_obs will
        # be references into that same np.array.
        underlying_obs = np.arange(0, 8)
        obs = underlying_obs[:7]
        next_obs = underlying_obs[1:]

        actions = np.random.randint(-1, 3, size=(7, ))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]

        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        })
        adjust_nstep(4, gamma, batch)

        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES],
              [False, False, False, True, True, True, True])
        check(batch[SampleBatch.REWARDS], [
            discount_cumsum(np.array(rewards[0:4]), gamma)[0],
            discount_cumsum(np.array(rewards[1:5]), gamma)[0],
            discount_cumsum(np.array(rewards[2:6]), gamma)[0],
            discount_cumsum(np.array(rewards[3:7]), gamma)[0],
            discount_cumsum(np.array(rewards[4:]), gamma)[0],
            discount_cumsum(np.array(rewards[5:]), gamma)[0],
            discount_cumsum(np.array(rewards[6:]), gamma)[0],
        ])
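The `discount_cumsum(...)[0]` values being checked are just discounted sums of the remaining rewards; a small plain-numpy sketch of that quantity (the helper name `discounted_head` is made up):

import numpy as np

def discounted_head(rews, gamma):
    # Equivalent to discount_cumsum(rews, gamma)[0]: sum_k gamma**k * rews[k].
    rews = np.asarray(rews, dtype=np.float64)
    return float(np.sum(gamma ** np.arange(len(rews)) * rews))

print(discounted_head([10.0, 0.0, 100.0, 50.0], gamma=0.99))
# 10 + 0.99*0 + 0.99**2*100 + 0.99**3*50 = 156.52...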
Example #5
File: appo_tf_policy.py  Project: Yard1/ray
def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Returns a dict with important loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    values_batched = _make_time_major(
        policy,
        train_batch.get("seq_lens"),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"])

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy._mean_policy_loss,
        "entropy": policy._mean_entropy,
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy._value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }

    if policy.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(policy._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if policy.config["use_kl_loss"]:
        stats_dict["kl"] = policy._mean_kl
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
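The `vf_explained_var` entry uses RLlib's `explained_variance` helper; a rough numpy analogue, assuming the usual 1 - Var[y - y_pred] / Var[y] definition:

import numpy as np

def explained_variance_np(y, y_pred):
    # 1.0: the value function explains the targets perfectly;
    # 0.0: it does no better than predicting the mean of y.
    var_y = np.var(y)
    return float("nan") if var_y == 0 else 1.0 - np.var(y - y_pred) / var_y

y = np.array([1.0, 2.0, 3.0])
print(explained_variance_np(y, y + 0.1))  # ~1.0 (constant offset, zero residual variance)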
Example #6
    def compute_actions(
        self,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
        info_batch=None,
        episodes=None,
        explore=None,
        timestep=None,
        **kwargs,
    ):
        # Create input dict to simply pass the entire call to
        # self.compute_actions_from_input_dict().
        input_dict = SampleBatch(
            {
                SampleBatch.CUR_OBS: obs_batch,
            },
            _is_training=tf.constant(False),
        )
        if state_batches is not None:
            for i, s in enumerate(state_batches):
                input_dict[f"state_in_{i}"] = s
        if prev_action_batch is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch
        if prev_reward_batch is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch
        if info_batch is not None:
            input_dict[SampleBatch.INFOS] = info_batch

        return self.compute_actions_from_input_dict(
            input_dict=input_dict,
            explore=explore,
            timestep=timestep,
            episodes=episodes,
            **kwargs,
        )
Example #7
    def test_pad_batch_dynamic_max(self):
        """Test pad_batch_to_sequences_of_same_size when dynamic_max = True"""
        view_requirements = {
            "state_in_0":
            ViewRequirement(
                "state_out_0",
                shift=[-1],
                used_for_training=False,
                used_for_compute_actions=True,
                batch_repeat_value=1,
            )
        }
        max_seq_len = 20
        num_seqs = np.random.randint(1, 20)
        seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs))
        max_len = np.max(seq_lens)
        sum_seq_lens = np.sum(seq_lens)

        s1 = SampleBatch(
            {
                "a": np.arange(sum_seq_lens),
                "b": np.arange(sum_seq_lens),
                "seq_lens": seq_lens,
                "state_in_0": [[0]] * num_seqs,
            },
            _max_seq_len=max_seq_len,
        )

        pad_batch_to_sequences_of_same_size(
            s1,
            max_seq_len=max_seq_len,
            feature_keys=["a", "b"],
            view_requirements=view_requirements,
        )
        check(s1.max_seq_len, max_len)
        check(s1["a"].shape[0], max_len * num_seqs)
        check(s1["b"].shape[0], max_len * num_seqs)
Example #8
    def step(self):
        with self.update_weights_timer:
            if self.workers.remote_workers():
                weights = ray.put(self.workers.local_worker().get_weights())
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.workers.remote_workers():
                batch = SampleBatch.concat_samples(
                    ray_get_and_free([
                        e.sample.remote()
                        for e in self.workers.remote_workers()
                    ]))
            else:
                batch = self.workers.local_worker().sample()

            # Handle everything as if multiagent
            if isinstance(batch, SampleBatch):
                batch = MultiAgentBatch({
                    DEFAULT_POLICY_ID: batch
                }, batch.count)

            for policy_id, s in batch.policy_batches.items():
                for row in s.rows():
                    self.replay_buffers[policy_id].add(
                        pack_if_needed(row["obs"]),
                        row["actions"],
                        row["rewards"],
                        pack_if_needed(row["new_obs"]),
                        row["dones"],
                        weight=None)

        if self.num_steps_sampled >= self.replay_starts:
            self._optimize()

        self.num_steps_sampled += batch.count
Example #9
    def call(self, input_dict: SampleBatch) -> \
            (TensorType, List[TensorType], Dict[str, TensorType]):
        assert input_dict.get(SampleBatch.SEQ_LENS) is not None
        # Push obs through underlying (wrapped) model first.
        wrapped_out, _, _ = self.wrapped_keras_model(input_dict)

        # Concat. prev-action/reward if required.
        prev_a_r = []
        if self.lstm_use_prev_action:
            prev_a = input_dict[SampleBatch.PREV_ACTIONS]
            if isinstance(self.action_space, (Discrete, MultiDiscrete)):
                prev_a = one_hot(prev_a, self.action_space)
            prev_a_r.append(
                tf.reshape(tf.cast(prev_a, tf.float32), [-1, self.action_dim]))
        if self.lstm_use_prev_reward:
            prev_a_r.append(
                tf.reshape(
                    tf.cast(input_dict[SampleBatch.PREV_REWARDS], tf.float32),
                    [-1, 1]))

        if prev_a_r:
            wrapped_out = tf.concat([wrapped_out] + prev_a_r, axis=1)

        max_seq_len = tf.shape(wrapped_out)[0] // tf.shape(
            input_dict[SampleBatch.SEQ_LENS])[0]
        wrapped_out_plus_time_dim = add_time_dimension(wrapped_out,
                                                       max_seq_len=max_seq_len,
                                                       framework="tf")
        model_out, value_out, h, c = self._rnn_model([
            wrapped_out_plus_time_dim, input_dict[SampleBatch.SEQ_LENS],
            input_dict["state_in_0"], input_dict["state_in_1"]
        ])
        model_out_no_time_dim = tf.reshape(
            model_out, tf.concat([[-1], tf.shape(model_out)[2:]], axis=0))
        return model_out_no_time_dim, [h, c], {
            SampleBatch.VF_PREDS: tf.reshape(value_out, [-1])
        }
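A numpy analogue of the `add_time_dimension` reshape used above, assuming batch-major layout (names are illustrative):

import numpy as np

def add_time_dimension_np(flat, max_seq_len):
    # Fold flat [B*T, F] inputs back into [B, T, F] so the LSTM can process
    # each padded sequence along its own time axis.
    return flat.reshape((-1, max_seq_len) + flat.shape[1:])

x = np.arange(12, dtype=np.float32).reshape(6, 2)  # B*T=6, F=2
print(add_time_dimension_np(x, max_seq_len=3).shape)  # (2, 3, 2)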
Example #10
    def _build_actor_network(
        self,
        obs,
        obs_space,
        act_space,
        use_state_preprocessor,
        hiddens,
        activation=None,
        scope=None,
    ):
        with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model = ModelCatalog.get_model(
                    SampleBatch(
                        obs=obs,
                        _is_training=self._get_is_training_placeholder()),
                    obs_space,
                    act_space,
                    1,
                    self.config["model"],
                )
                out = model.last_layer
            else:
                model = None
                out = obs

            for hidden in hiddens:
                out = tf1.layers.dense(out,
                                       units=hidden,
                                       activation=activation)
            feature = tf1.layers.dense(out,
                                       units=act_space.shape[0],
                                       activation=None)
            sampler = tfp.distributions.RelaxedOneHotCategorical(
                temperature=1.0, logits=feature).sample()

        return sampler, feature, model, tf1.global_variables(scope.name)
Example #11
    def gen_replay(timeout):
        while True:
            samples = {}
            idxes = None

            for policy_id, reservoir_buffer in reservoir_buffers.buffers.items(
            ):
                if len(reservoir_buffer) >= min_size_to_learn and \
                        reservoir_buffers.steps[policy_id] >= learn_every:

                    idxes = reservoir_buffer.sample_idxes(train_batch_size)
                    (obses_t,
                     actions) = reservoir_buffer.sample_with_idxes(idxes)
                    samples[policy_id] = SampleBatch({
                        "obs": obses_t,
                        "actions": actions,
                    })

                    reservoir_buffers.steps[policy_id] = 0

            if samples == {}:
                yield _NextValueNotReady()
            else:
                yield MultiAgentBatch(samples, train_batch_size)
Example #12
    def test_n_step_3(self):
        """Tests, whether n-step adjustments of trajectories work."""
        # n-step = 3
        gamma = 0.9
        obs = [1, 2, 3, 4, 5, 6, 7]
        actions = ["ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"]
        rewards = [10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0]
        dones = [0, 0, 0, 0, 0, 0, 1]
        next_obs = [2, 3, 4, 5, 6, 7, 8]
        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        })
        adjust_nstep(3, gamma, batch)
        check(batch[SampleBatch.OBS], [1, 2, 3, 4, 5, 6, 7])
        check(batch[SampleBatch.ACTIONS],
              ["ac1", "ac2", "ac1", "ac1", "ac1", "ac2", "ac1"])
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 8, 8, 8])
        check(batch[SampleBatch.DONES], [0, 0, 0, 0, 1, 1, 1])
        check(batch[SampleBatch.REWARDS],
              [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
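A plain-Python sketch of the n-step adjustment being checked here; this is not RLlib's `adjust_nstep`, just the arithmetic behind the expected reward values:

import numpy as np

def nstep_rewards(rewards, dones, n, gamma):
    # Sum up to n discounted future rewards, stopping early at episode end.
    out = []
    for t in range(len(rewards)):
        ret = 0.0
        for k in range(min(n, len(rewards) - t)):
            ret += (gamma ** k) * rewards[t + k]
            if dones[t + k]:
                break
        out.append(ret)
    return np.array(out)

print(nstep_rewards([10.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0],
                    [0, 0, 0, 0, 0, 0, 1], n=3, gamma=0.9))
# [ 91. 171. 271. 271. 271. 190. 100.]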
Example #13
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
        episode: Optional["Episode"] = None,
    ):
        sample_batch = super().postprocess_trajectory(
            sample_batch, other_agent_batches, episode
        )

        # Trajectory is actually complete -> last r=0.0.
        if sample_batch[SampleBatch.DONES][-1]:
            last_r = 0.0
        # Trajectory has been truncated -> last r=VF estimate of last obs.
        else:
            # Input dict is provided to us automatically via the Model's
            # requirements. It's a single-timestep (last one in trajectory)
            # input_dict.
            # Create an input dict according to the Model's requirements.
            index = "last" if SampleBatch.NEXT_OBS in sample_batch else -1
            input_dict = sample_batch.get_single_step_input_dict(
                self.model.view_requirements, index=index
            )
            last_r = self._value(**input_dict)

        # Adds the "advantages" (which in the case of MARWIL are simply the
        # discounted cumulative rewards) to the SampleBatch.
        return compute_advantages(
            sample_batch,
            last_r,
            self.config["gamma"],
            # We just want the discounted cumulative rewards, so we won't need
            # GAE nor critic (use_critic=True: Subtract vf-estimates from returns).
            use_gae=False,
            use_critic=False,
        )
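With `use_gae=False` and `use_critic=False`, the "advantages" above reduce to discounted cumulative rewards bootstrapped with `last_r`; a minimal sketch of that reduction (the helper name is made up):

import numpy as np

def discounted_returns(rewards, last_r, gamma):
    # Backward pass: R_t = r_t + gamma * R_{t+1}, seeded with last_r.
    out = np.zeros(len(rewards), dtype=np.float32)
    running = last_r
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

print(discounted_returns([1.0, 1.0, 1.0], last_r=0.0, gamma=0.9))
# 1 + 0.9*(1 + 0.9*1) = 2.71, then 1.9, then 1.0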
Example #14
    def __call__(self, samples: List[SampleBatchType]):
        samples, split_lst = self.post_process_samples(samples)
        self.buffer.extend(samples)
        self.split.append(split_lst)
        self.post_process_metrics()
        if len(self.split) > self.n:
            out = SampleBatch.concat_samples(self.buffer)
            out["split"] = np.array(self.split)
            self.buffer = []
            self.split = []

            # Metrics Reporting
            metrics = _get_shared_metrics()
            metrics.counters[STEPS_SAMPLED_COUNTER] += out.count

            # Reporting Adaptation Rew Diff
            ep_rew_pre = self.metrics["episode_reward_mean"]
            ep_rew_post = self.metrics["episode_reward_mean_adapt_" +
                                       str(self.n)]
            self.metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre
            return [(out, self.metrics)]
        else:
            self.inner_adaptation_step(samples)
            return []
Example #15
    def test_n_step_4(self):
        """Tests, whether n-step adjustments of trajectories work."""
        # n-step = 4
        gamma = 0.99
        obs = np.arange(0, 7)
        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]
        next_obs = np.arange(1, 8)
        batch = SampleBatch(
            {
                SampleBatch.OBS: obs,
                SampleBatch.ACTIONS: actions,
                SampleBatch.REWARDS: rewards,
                SampleBatch.DONES: dones,
                SampleBatch.NEXT_OBS: next_obs,
            }
        )
        adjust_nstep(4, gamma, batch)
        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES], [False, False, False, True, True, True, True])
        check(
            batch[SampleBatch.REWARDS],
            [
                discount_cumsum(np.array(rewards[0:4]), gamma)[0],
                discount_cumsum(np.array(rewards[1:5]), gamma)[0],
                discount_cumsum(np.array(rewards[2:6]), gamma)[0],
                discount_cumsum(np.array(rewards[3:7]), gamma)[0],
                discount_cumsum(np.array(rewards[4:]), gamma)[0],
                discount_cumsum(np.array(rewards[5:]), gamma)[0],
                discount_cumsum(np.array(rewards[6:]), gamma)[0],
            ],
        )
Example #16
    def step(self):
        with self.update_weights_timer:
            if self.workers.remote_workers():
                weights = ray.put(self.workers.local_worker().get_weights())
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)

        with self.sample_timer:
            samples = []
            while sum(s.count for s in samples) < self.train_batch_size:
                if self.workers.remote_workers():
                    samples.extend(
                        ray.get([
                            e.sample.remote()
                            for e in self.workers.remote_workers()
                        ]))
                else:
                    samples.append(self.workers.local_worker().sample())
            samples = SampleBatch.concat_samples(samples)
            self.sample_timer.push_units_processed(samples.count)

        with self.grad_timer:
            fetches = do_minibatch_sgd(samples, self.policies,
                                       self.workers.local_worker(),
                                       self.num_sgd_iter,
                                       self.sgd_minibatch_size,
                                       self.standardize_fields)
        self.grad_timer.push_units_processed(samples.count)

        if len(fetches) == 1 and DEFAULT_POLICY_ID in fetches:
            self.learner_stats = fetches[DEFAULT_POLICY_ID]
        else:
            self.learner_stats = fetches
        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
        return self.learner_stats
Example #17
    def _get_dummy_batch_from_view_requirements(
            self, batch_size: int = 1) -> SampleBatch:
        """Creates a numpy dummy batch based on the Policy's view requirements.

        Args:
            batch_size (int): The size of the batch to create.

        Returns:
            SampleBatch: The dummy batch containing all zero values.
        """
        ret = {}
        for view_col, view_req in self.view_requirements.items():
            if isinstance(view_req.space, (gym.spaces.Dict, gym.spaces.Tuple)):
                _, shape = ModelCatalog.get_action_shape(view_req.space)
                ret[view_col] = \
                    np.zeros((batch_size, ) + shape[1:], np.float32)
            else:
                if isinstance(view_req.space, gym.spaces.Space):
                    ret[view_col] = np.zeros_like(
                        [view_req.space.sample() for _ in range(batch_size)])
                else:
                    ret[view_col] = [view_req.space for _ in range(batch_size)]

        return SampleBatch(ret)
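A small sketch of the zero-filled column created by the simple-space branch above, using a made-up gym Box space:

import numpy as np
import gym

space = gym.spaces.Box(-1.0, 1.0, shape=(4,), dtype=np.float32)
batch_size = 2
# np.zeros_like over batch_size stacked samples yields a (batch_size, 4) zero block.
col = np.zeros_like([space.sample() for _ in range(batch_size)])
print(col.shape, col.dtype)  # (2, 4) float32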
Example #18
    def _add_sample_batch_to_buffer(self, buffer, batch_size, num_batches=5, **kwargs):
        self.eps_id = 0

        def _generate_data():
            self.eps_id += 1
            return SampleBatch(
                {
                    SampleBatch.T: [0, 1],
                    SampleBatch.ACTIONS: 2 * [np.random.choice([0, 1])],
                    SampleBatch.REWARDS: 2 * [np.random.rand()],
                    SampleBatch.OBS: 2 * [np.random.random((4,))],
                    SampleBatch.NEXT_OBS: 2 * [np.random.random((4,))],
                    SampleBatch.DONES: 2 * [np.random.choice([False, True])],
                    SampleBatch.EPS_ID: 2 * [self.eps_id],
                    SampleBatch.AGENT_INDEX: 2 * [0],
                    "batch_id": 2 * [self.batch_id],
                }
            )

        for i in range(num_batches):
            data = [_generate_data() for _ in range(batch_size)]
            self.batch_id += 1
            batch = SampleBatch.concat_samples(data)
            buffer.add(batch, **kwargs)
Example #19
File: rollout_ops.py  Project: zachkeer/ray
def ParallelRollouts(workers: WorkerSet,
                     *,
                     mode="bulk_sync",
                     num_async=1) -> LocalIterator[SampleBatch]:
    """Operator to collect experiences in parallel from rollout workers.

    If there are no remote workers, experiences will be collected serially from
    the local worker instance instead.

    Arguments:
        workers (WorkerSet): set of rollout workers to use.
        mode (str): One of {'async', 'bulk_sync', 'raw'}.
            - In 'async' mode, batches are returned as soon as they are
              computed by rollout workers with no order guarantees.
            - In 'bulk_sync' mode, we collect one batch from each worker
              and concatenate them together into a large batch to return.
            - In 'raw' mode, the ParallelIterator object is returned directly
              and the caller is responsible for implementing gather and
              updating the timesteps counter.
        num_async (int): In async mode, the max number of async
            requests in flight per actor.

    Returns:
        A local iterator over experiences collected in parallel.

    Examples:
        >>> rollouts = ParallelRollouts(workers, mode="async")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        50  # config.rollout_fragment_length

        >>> rollouts = ParallelRollouts(workers, mode="bulk_sync")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        200  # config.rollout_fragment_length * config.num_workers

    Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context.
    """

    # Ensure workers are initially in sync.
    workers.sync_weights()

    def report_timesteps(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    if not workers.remote_workers():
        # Handle the serial sampling case.
        def sampler(_):
            while True:
                yield workers.local_worker().sample()

        return (LocalIterator(sampler,
                              SharedMetrics()).for_each(report_timesteps))

    # Create a parallel iterator over generated experiences.
    rollouts = from_actors(workers.remote_workers())

    if mode == "bulk_sync":
        return rollouts \
            .batch_across_shards() \
            .for_each(lambda batches: SampleBatch.concat_samples(batches)) \
            .for_each(report_timesteps)
    elif mode == "async":
        return rollouts.gather_async(
            num_async=num_async).for_each(report_timesteps)
    elif mode == "raw":
        return rollouts
    else:
        raise ValueError("mode must be one of 'bulk_sync', 'async', 'raw', "
                         "got '{}'".format(mode))
Example #20
    def build(self, view_requirements: ViewRequirementsDict) -> SampleBatch:
        """Builds a SampleBatch from the thus-far collected agent data.

        If the episode/trajectory has no DONE=True at the end, this will copy
        the necessary n timesteps from the end of the trajectory back to the
        beginning of the buffers and wait for new samples to come in.
        SampleBatches created by this method will be ready for postprocessing
        by a Policy.

        Args:
            view_requirements (ViewRequirementsDict): The view
                requirements dict needed to build the SampleBatch from the raw
                buffers (which may have data shifts as well as mappings from
                view-col to data-col in them).

        Returns:
            SampleBatch: The built SampleBatch for this agent, ready to go into
                postprocessing.
        """

        batch_data = {}
        np_data = {}
        for view_col, view_req in view_requirements.items():
            # Create the batch of data from the different buffers.
            data_col = view_req.data_col or view_col

            # Some columns don't exist yet (get created during postprocessing).
            # -> skip.
            if data_col not in self.buffers:
                continue

            # OBS are already shifted by -1 (the initial obs starts one ts
            # before all other data columns).
            obs_shift = -1 if data_col == SampleBatch.OBS else 0

            # Keep an np-array cache so we don't have to regenerate the
            # np-array for different view_cols that map to the same data_col.
            if data_col not in np_data:
                np_data[data_col] = to_float_np_array(self.buffers[data_col])

            # Range of indices on time-axis, e.g. "-50:-1". Together with
            # the `batch_repeat_value`, this determines the data produced.
            # Example:
            #  batch_repeat_value=10, shift_from=-3, shift_to=-1
            #  buffer=[-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
            #  resulting data=[[-3, -2, -1], [7, 8, 9]]
            #  Range of 3 consecutive items repeats every 10 timesteps.
            if view_req.shift_from is not None:
                # Batch repeat value > 1: Only repeat the shift_from/to range
                # every n timesteps.
                if view_req.batch_repeat_value > 1:
                    count = int(
                        math.ceil((len(np_data[data_col]) - self.shift_before)
                                  / view_req.batch_repeat_value))
                    data = np.asarray([
                        np_data[data_col][self.shift_before +
                                          (i * view_req.batch_repeat_value) +
                                          view_req.shift_from +
                                          obs_shift:self.shift_before +
                                          (i * view_req.batch_repeat_value) +
                                          view_req.shift_to + 1 + obs_shift]
                        for i in range(count)
                    ])
                # Batch repeat value = 1: Repeat the shift_from/to range at
                # each timestep.
                else:
                    d = np_data[data_col]
                    shift_win = view_req.shift_to - view_req.shift_from + 1
                    data_size = d.itemsize * int(np.product(d.shape[1:]))
                    strides = [
                        d.itemsize * int(np.product(d.shape[i + 1:]))
                        for i in range(1, len(d.shape))
                    ]
                    data = np.lib.stride_tricks.as_strided(
                        d[self.shift_before - shift_win:],
                        [self.agent_steps, shift_win
                         ] + [d.shape[i] for i in range(1, len(d.shape))],
                        [data_size, data_size] + strides)
            # Set of (probably non-consecutive) indices.
            # Example:
            #  shift=[-3, 0]
            #  buffer=[-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
            #  resulting data=[[-3, 0], [-2, 1], [-1, 2], [0, 3], [1, 4], ...]
            elif isinstance(view_req.shift, np.ndarray):
                data = np_data[data_col][self.shift_before + obs_shift +
                                         view_req.shift]
            # Single shift int value. Use the trajectory as-is, and if
            # `shift` != 0: shifted by that value.
            else:
                shift = view_req.shift + obs_shift

                # Batch repeat (only provide a value every n timesteps).
                if view_req.batch_repeat_value > 1:
                    count = int(
                        math.ceil((len(np_data[data_col]) - self.shift_before)
                                  / view_req.batch_repeat_value))
                    data = np.asarray([
                        np_data[data_col][self.shift_before + (
                            i * view_req.batch_repeat_value) + shift]
                        for i in range(count)
                    ])
                # Shift is exactly 0: Use trajectory as is.
                elif shift == 0:
                    data = np_data[data_col][self.shift_before:]
                # Shift is positive: We still need to 0-pad at the end.
                elif shift > 0:
                    data = to_float_np_array(
                        self.buffers[data_col][self.shift_before + shift:] + [
                            np.zeros(
                                shape=view_req.space.shape,
                                dtype=view_req.space.dtype)
                            for _ in range(shift)
                        ])
                # Shift is negative: Shift into the already existing and
                # 0-padded "before" area of our buffers.
                else:
                    data = np_data[data_col][self.shift_before + shift:shift]

            if len(data) > 0:
                batch_data[view_col] = data

        # Due to possible batch-repeats > 1, columns in the resulting batch
        # may not all have the same batch size.
        batch = SampleBatch(batch_data, _dont_check_lens=True)

        # Add EPS_ID and UNROLL_ID to batch.
        batch.data[SampleBatch.EPS_ID] = np.repeat(self.episode_id,
                                                   batch.count)
        if SampleBatch.UNROLL_ID not in batch.data:
            # TODO: (sven) Once we have the additional
            #  model.preprocess_train_batch in place (attention net PR), we
            #  should not even need UNROLL_ID anymore:
            #  Add "if SampleBatch.UNROLL_ID in view_requirements:" here.
            batch.data[SampleBatch.UNROLL_ID] = np.repeat(
                _AgentCollector._next_unroll_id, batch.count)
            _AgentCollector._next_unroll_id += 1

        # This trajectory is continuing -> Copy data at the end (in the size of
        # self.shift_before) to the beginning of buffers and erase everything
        # else.
        if not self.buffers[SampleBatch.DONES][-1]:
            # Copy data to beginning of buffer and cut lists.
            if self.shift_before > 0:
                for k, data in self.buffers.items():
                    self.buffers[k] = data[-self.shift_before:]
            self.agent_steps = 0

        return batch
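The `batch_repeat_value > 1` indexing can be checked in isolation; a standalone numpy sketch reproducing the example from the comment (shift_from=-3, shift_to=-1, batch_repeat_value=10), assuming shift_before is 3 for this buffer:

import math
import numpy as np

buf = np.arange(-3, 13)  # [-3, -2, ..., 12]
shift_before, shift_from, shift_to, repeat = 3, -3, -1, 10
count = math.ceil((len(buf) - shift_before) / repeat)
# Take the shift_from..shift_to window once every `repeat` timesteps.
data = np.asarray([
    buf[shift_before + i * repeat + shift_from:
        shift_before + i * repeat + shift_to + 1]
    for i in range(count)
])
print(data)  # [[-3 -2 -1]
             #  [ 7  8  9]]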
Example #21
    check_compute_single_action

# Fake CartPole episode of n time steps.
FAKE_BATCH = SampleBatch({
    SampleBatch.OBS:
    np.array(
        [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]],
        dtype=np.float32),
    SampleBatch.ACTIONS:
    np.array([0, 1, 1]),
    SampleBatch.PREV_ACTIONS:
    np.array([0, 1, 1]),
    SampleBatch.REWARDS:
    np.array([1.0, -1.0, .5], dtype=np.float32),
    SampleBatch.PREV_REWARDS:
    np.array([1.0, -1.0, .5], dtype=np.float32),
    SampleBatch.DONES:
    np.array([False, False, True]),
    SampleBatch.VF_PREDS:
    np.array([0.5, 0.6, 0.7], dtype=np.float32),
    SampleBatch.ACTION_DIST_INPUTS:
    np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
    SampleBatch.ACTION_LOGP:
    np.array([-0.5, -0.1, -0.2], dtype=np.float32),
    SampleBatch.EPS_ID:
    np.array([0, 0, 0]),
    SampleBatch.AGENT_INDEX:
    np.array([0, 0, 0]),
})


class MyCallbacks(DefaultCallbacks):
Example #22
def ddpg_actor_critic_loss(policy: Policy, model: ModelV2, _,
                           train_batch: SampleBatch) -> TensorType:
    twin_q = policy.config["twin_q"]
    gamma = policy.config["gamma"]
    n_step = policy.config["n_step"]
    use_huber = policy.config["use_huber"]
    huber_threshold = policy.config["huber_threshold"]
    l2_reg = policy.config["l2_reg"]

    input_dict = SampleBatch(obs=train_batch[SampleBatch.CUR_OBS],
                             _is_training=True)
    input_dict_next = SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS],
                                  _is_training=True)

    model_out_t, _ = model(input_dict, [], None)
    model_out_tp1, _ = model(input_dict_next, [], None)
    target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

    policy.target_q_func_vars = policy.target_model.variables()

    # Policy network evaluation.
    policy_t = model.get_policy_output(model_out_t)
    policy_tp1 = policy.target_model.get_policy_output(target_model_out_tp1)

    # Action outputs.
    if policy.config["smooth_target_policy"]:
        target_noise_clip = policy.config["target_noise_clip"]
        clipped_normal_sample = tf.clip_by_value(
            tf.random.normal(tf.shape(policy_tp1),
                             stddev=policy.config["target_noise"]),
            -target_noise_clip,
            target_noise_clip,
        )
        policy_tp1_smoothed = tf.clip_by_value(
            policy_tp1 + clipped_normal_sample,
            policy.action_space.low * tf.ones_like(policy_tp1),
            policy.action_space.high * tf.ones_like(policy_tp1),
        )
    else:
        # No smoothing, just use deterministic actions.
        policy_tp1_smoothed = policy_tp1

    # Q-net(s) evaluation.
    # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    # Q-values for given actions & observations in given current state.
    q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

    # Q-values for current policy (no noise) in given current state
    q_t_det_policy = model.get_q_values(model_out_t, policy_t)

    if twin_q:
        twin_q_t = model.get_twin_q_values(model_out_t,
                                           train_batch[SampleBatch.ACTIONS])

    # Target q-net(s) evaluation.
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                             policy_tp1_smoothed)

    if twin_q:
        twin_q_tp1 = policy.target_model.get_twin_q_values(
            target_model_out_tp1, policy_tp1_smoothed)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (
        1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32)) * q_tp1_best

    # Compute RHS of bellman equation.
    q_t_selected_target = tf.stop_gradient(
        tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) +
        gamma**n_step * q_tp1_best_masked)

    # Compute the error (potentially clipped).
    if twin_q:
        td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold) + huber_loss(
                twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error) + 0.5 * tf.math.square(
                twin_td_error)
    else:
        td_error = q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error)

    critic_loss = tf.reduce_mean(
        tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)

    # Add l2-regularization if required.
    if l2_reg is not None:
        for var in policy.model.policy_variables():
            if "bias" not in var.name:
                actor_loss += l2_reg * tf.nn.l2_loss(var)
        for var in policy.model.q_variables():
            if "bias" not in var.name:
                critic_loss += l2_reg * tf.nn.l2_loss(var)

    # Model self-supervised losses.
    if policy.config["use_state_preprocessor"]:
        # Expand input_dict in case custom_loss' need them.
        input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS]
        input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS]
        input_dict[SampleBatch.DONES] = train_batch[SampleBatch.DONES]
        input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS]
        if log_once("ddpg_custom_loss"):
            logger.warning(
                "You are using a state-preprocessor with DDPG and "
                "therefore, `custom_loss` will be called on your Model! "
                "Please be aware that DDPG now uses the ModelV2 API, which "
                "merges all previously separate sub-models (policy_model, "
                "q_model, and twin_q_model) into one ModelV2, on which "
                "`custom_loss` is called, passing it "
                "[actor_loss, critic_loss] as 1st argument. "
                "You may have to change your custom loss function to handle "
                "this.")
        [actor_loss,
         critic_loss] = model.custom_loss([actor_loss, critic_loss],
                                          input_dict)

    # Store values for stats function.
    policy.actor_loss = actor_loss
    policy.critic_loss = critic_loss
    policy.td_error = td_error
    policy.q_t = q_t

    # Return one loss value (even though we treat them separately in our
    # 2 optimizers: actor and critic).
    return policy.critic_loss + policy.actor_loss
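When `use_huber=True`, the error term follows the standard Huber form; a numpy sketch of that form (assumed to match RLlib's `huber_loss` helper):

import numpy as np

def huber_loss_np(x, delta=1.0):
    # 0.5 * x^2 for |x| <= delta, delta * (|x| - 0.5 * delta) otherwise.
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

print(huber_loss_np(np.array([0.5, 2.0]), delta=1.0))
# 0.5*0.5**2 = 0.125; 1.0*(2.0 - 0.5) = 1.5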
Example #23
    def _initialize_loss_from_dummy_batch(
            self,
            auto_remove_unneeded_view_reqs: bool = True,
            stats_fn=None) -> None:

        # Create the optimizer/exploration optimizer here. Some initialization
        # steps (e.g. exploration postprocessing) may need this.
        self._optimizer = self.optimizer()

        # Test calls depend on variable init, so initialize model first.
        self._sess.run(tf1.global_variables_initializer())

        if self.config["_use_trajectory_view_api"]:
            logger.info("Testing `compute_actions` w/ dummy batch.")
            actions, state_outs, extra_fetches = \
                self.compute_actions_from_input_dict(
                    self._dummy_batch, explore=False, timestep=0)
            for key, value in extra_fetches.items():
                self._dummy_batch[key] = np.zeros_like(value)
                self._input_dict[key] = get_placeholder(value=value, name=key)
                if key not in self.view_requirements:
                    logger.info("Adding extra-action-fetch `{}` to "
                                "view-reqs.".format(key))
                    self.view_requirements[key] = \
                        ViewRequirement(space=gym.spaces.Box(
                            -1.0, 1.0, shape=value.shape[1:],
                            dtype=value.dtype))
            dummy_batch = self._dummy_batch
        else:

            def fake_array(tensor):
                shape = tensor.shape.as_list()
                shape = [s if s is not None else 1 for s in shape]
                return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

            dummy_batch = {
                SampleBatch.CUR_OBS:
                fake_array(self._obs_input),
                SampleBatch.NEXT_OBS:
                fake_array(self._obs_input),
                SampleBatch.DONES:
                np.array([False], dtype=np.bool),
                SampleBatch.ACTIONS:
                fake_array(
                    ModelCatalog.get_action_placeholder(self.action_space)),
                SampleBatch.REWARDS:
                np.array([0], dtype=np.float32),
            }
            if self._obs_include_prev_action_reward:
                dummy_batch.update({
                    SampleBatch.PREV_ACTIONS:
                    fake_array(self._prev_action_input),
                    SampleBatch.PREV_REWARDS:
                    fake_array(self._prev_reward_input),
                })
            state_init = self.get_initial_state()
            state_batches = []
            for i, h in enumerate(state_init):
                dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
                dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
                state_batches.append(np.expand_dims(h, 0))
            if state_init:
                dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
            for k, v in self.extra_compute_action_fetches().items():
                dummy_batch[k] = fake_array(v)
            dummy_batch = SampleBatch(dummy_batch)

        batch_for_postproc = UsageTrackingDict(dummy_batch)
        batch_for_postproc.count = dummy_batch.count
        logger.info("Testing `postprocess_trajectory` w/ dummy batch.")
        self.exploration.postprocess_trajectory(self, batch_for_postproc,
                                                self._sess)
        postprocessed_batch = self.postprocess_trajectory(batch_for_postproc)
        # Add new columns automatically to (loss) input_dict.
        if self.config["_use_trajectory_view_api"]:
            for key in batch_for_postproc.added_keys:
                if key not in self._input_dict:
                    self._input_dict[key] = get_placeholder(
                        value=batch_for_postproc[key], name=key)
                if key not in self.view_requirements:
                    self.view_requirements[key] = \
                        ViewRequirement(space=gym.spaces.Box(
                            -1.0, 1.0, shape=batch_for_postproc[key].shape[1:],
                            dtype=batch_for_postproc[key].dtype))

        if not self.config["_use_trajectory_view_api"]:
            train_batch = UsageTrackingDict(
                dict({
                    SampleBatch.CUR_OBS: self._obs_input,
                }, **self._loss_input_dict))
            if self._obs_include_prev_action_reward:
                train_batch.update({
                    SampleBatch.PREV_ACTIONS: self._prev_action_input,
                    SampleBatch.PREV_REWARDS: self._prev_reward_input,
                    SampleBatch.CUR_OBS: self._obs_input,
                })

            for k, v in postprocessed_batch.items():
                if k in train_batch:
                    continue
                elif v.dtype == np.object:
                    continue  # can't handle arbitrary objects in TF
                elif k == "seq_lens" or k.startswith("state_in_"):
                    continue
                shape = (None, ) + v.shape[1:]
                dtype = np.float32 if v.dtype == np.float64 else v.dtype
                placeholder = tf1.placeholder(dtype, shape=shape, name=k)
                train_batch[k] = placeholder

            for i, si in enumerate(self._state_inputs):
                train_batch["state_in_{}".format(i)] = si
        else:
            train_batch = UsageTrackingDict(
                dict(self._input_dict, **self._loss_input_dict))

        if self._state_inputs:
            train_batch["seq_lens"] = self._seq_lens

        if log_once("loss_init"):
            logger.debug(
                "Initializing loss function with dummy input:\n\n{}\n".format(
                    summarize(train_batch)))

        self._loss_input_dict.update({k: v for k, v in train_batch.items()})
        loss = self._do_loss_init(train_batch)

        all_accessed_keys = \
            train_batch.accessed_keys | batch_for_postproc.accessed_keys | \
            batch_for_postproc.added_keys | set(
                self.model.view_requirements.keys())

        TFPolicy._initialize_loss(
            self, loss,
            [(k, v) for k, v in train_batch.items() if k in all_accessed_keys])

        if "is_training" in self._loss_input_dict:
            del self._loss_input_dict["is_training"]

        # Call the grads stats fn.
        # TODO: (sven) rename to simply stats_fn to match eager and torch.
        if self._grad_stats_fn:
            self._stats_fetches.update(
                self._grad_stats_fn(self, train_batch, self._grads))

        # Add new columns automatically to view-reqs.
        if self.config["_use_trajectory_view_api"] and \
                auto_remove_unneeded_view_reqs:
            # Add those needed for postprocessing and training.
            all_accessed_keys = train_batch.accessed_keys | \
                                batch_for_postproc.accessed_keys
            # Tag those only needed for post-processing (with some exceptions).
            for key in batch_for_postproc.accessed_keys:
                if key not in train_batch.accessed_keys and \
                        key not in self.model.view_requirements and \
                        key not in [
                            SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                            SampleBatch.UNROLL_ID, SampleBatch.DONES,
                            SampleBatch.REWARDS, SampleBatch.INFOS]:
                    if key in self.view_requirements:
                        self.view_requirements[key].used_for_training = False
                    if key in self._loss_input_dict:
                        del self._loss_input_dict[key]
            # Remove those not needed at all (leave those that are needed
            # by Sampler to properly execute sample collection).
            # Also always leave DONES, REWARDS, and INFOS, no matter what.
            for key in list(self.view_requirements.keys()):
                if key not in all_accessed_keys and key not in [
                    SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                    SampleBatch.UNROLL_ID, SampleBatch.DONES,
                    SampleBatch.REWARDS, SampleBatch.INFOS] and \
                        key not in self.model.view_requirements:
                    # If user deleted this key manually in postprocessing
                    # fn, warn about it and do not remove from
                    # view-requirements.
                    if key in batch_for_postproc.deleted_keys:
                        logger.warning(
                            "SampleBatch key '{}' was deleted manually in "
                            "postprocessing function! RLlib will "
                            "automatically remove non-used items from the "
                            "data stream. Remove the `del` from your "
                            "postprocessing function.".format(key))
                    else:
                        del self.view_requirements[key]
                    if key in self._loss_input_dict:
                        del self._loss_input_dict[key]
            # Add those data_cols (again) that are missing and have
            # dependencies by view_cols.
            for key in list(self.view_requirements.keys()):
                vr = self.view_requirements[key]
                if (vr.data_col is not None
                        and vr.data_col not in self.view_requirements):
                    used_for_training = \
                        vr.data_col in train_batch.accessed_keys
                    self.view_requirements[vr.data_col] = ViewRequirement(
                        space=vr.space, used_for_training=used_for_training)

        self._loss_input_dict_no_rnn = {
            k: v
            for k, v in self._loss_input_dict.items()
            if (v not in self._state_inputs and v != self._seq_lens)
        }

        # Initialize again after loss init.
        self._sess.run(tf1.global_variables_initializer())
Example #24
File: test_io.py  Project: novahe/ray
import tempfile
import unittest

import numpy as np
import ray
from ray.tune.registry import register_env, register_input, \
    registry_get_input, registry_contains_input
from ray.rllib.agents.pg import PGTrainer
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.offline import IOContext, JsonWriter, JsonReader, InputReader, \
    ShuffledInput
from ray.rllib.offline.json_writer import _to_json
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.test_utils import framework_iterator

SAMPLES = SampleBatch({
    "actions": np.array([1, 2, 3, 4]),
    "obs": np.array([4, 5, 6, 7]),
    "eps_id": [1, 1, 2, 3],
})


def make_sample_batch(i):
    return SampleBatch({
        "actions": np.array([i, i, i]),
        "obs": np.array([i, i, i])
    })


class AgentIOTest(unittest.TestCase):
    def setUp(self):
        ray.init(num_cpus=1, ignore_reinit_error=True)
        self.test_dir = tempfile.mkdtemp()
Example #25
File: test_io.py  Project: novahe/ray
def make_sample_batch(i):
    return SampleBatch({
        "actions": np.array([i, i, i]),
        "obs": np.array([i, i, i])
    })
Example #26
    def test_episodes_unit(self):
        """Tests adding, sampling, and eviction of episodes."""
        buffer = ReplayBuffer(capacity=18, storage_unit="episodes")

        batches = [
            SampleBatch({
                SampleBatch.T: [0, 1, 2, 3],
                SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS: 4 * [np.random.rand()],
                SampleBatch.DONES: [False, False, False, True],
                SampleBatch.SEQ_LENS: [4],
                SampleBatch.EPS_ID: 4 * [i],
            }) for i in range(3)
        ]

        batches.append(
            SampleBatch({
                SampleBatch.T: [0, 1, 0, 1],
                SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS: 4 * [np.random.rand()],
                SampleBatch.DONES: [False, True, False, True],
                SampleBatch.SEQ_LENS: [2, 2],
                SampleBatch.EPS_ID: [3, 3, 4, 4],
            }))

        for batch in batches:
            buffer.add(batch)

        num_sampled_dict = {_id: 0 for _id in range(5)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            _id = sample[SampleBatch.EPS_ID][0]
            assert len(sample[SampleBatch.SEQ_LENS]) == 1
            num_sampled_dict[_id] += 1

        # All episodes, even though in different batches should be sampled
        # equally often
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            [1 / 5, 1 / 5, 1 / 5, 1 / 5, 1 / 5],
            atol=0.1,
        )

        # Episode 6 is not entirely inside this batch, so it should not be
        # added to the buffer.
        buffer.add(
            SampleBatch({
                SampleBatch.T: [0, 1, 0, 1],
                SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS: 4 * [np.random.rand()],
                SampleBatch.DONES: [False, True, False, False],
                SampleBatch.SEQ_LENS: [2, 2],
                SampleBatch.EPS_ID: [5, 5, 6, 6],
            }))

        num_sampled_dict = {_id: 0 for _id in range(7)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            _id = sample[SampleBatch.EPS_ID][0]
            assert len(sample[SampleBatch.SEQ_LENS]) == 1
            num_sampled_dict[_id] += 1

        # Episode 7 should be dropped for not ending inside the batch
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            [1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 0],
            atol=0.1,
        )

        # Add another batch to evict the first batch
        buffer.add(
            SampleBatch({
                SampleBatch.T: [0, 1, 2, 3],
                SampleBatch.ACTIONS: 4 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS: 4 * [np.random.rand()],
                SampleBatch.DONES: [False, False, False, True],
                SampleBatch.SEQ_LENS: [4],
                SampleBatch.EPS_ID: 4 * [7],
            }))

        # After adding 1 more batch, eviction has started with 24
        # timesteps added in total, 2 of which were discarded
        assert len(buffer) == 6
        assert buffer._num_timesteps_added == 4 * 6 - 2
        assert buffer._num_timesteps_added_wrap == 4
        assert buffer._next_idx == 1
        assert buffer._eviction_started is True

        num_sampled_dict = {_id: 0 for _id in range(8)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            _id = sample[SampleBatch.EPS_ID][0]
            assert len(sample[SampleBatch.SEQ_LENS]) == 1
            num_sampled_dict[_id] += 1

        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            [0, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 0, 1 / 6],
            atol=0.1,
        )
Example #27
    def test_sequences_unit(self):
        """Tests adding, sampling and eviction of sequences."""
        buffer = ReplayBuffer(capacity=10, storage_unit="sequences")

        batches = [
            SampleBatch({
                SampleBatch.T:
                i * [np.random.random((4, ))],
                SampleBatch.ACTIONS:
                i * [np.random.choice([0, 1])],
                SampleBatch.REWARDS:
                i * [np.random.rand()],
                SampleBatch.DONES:
                i * [np.random.choice([False, True])],
                SampleBatch.SEQ_LENS: [i],
                "batch_id":
                i * [i],
            }) for i in range(1, 4)
        ]

        batches.append(
            SampleBatch({
                SampleBatch.T:
                4 * [np.random.random((4, ))],
                SampleBatch.ACTIONS:
                4 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS:
                4 * [np.random.rand()],
                SampleBatch.DONES:
                4 * [np.random.choice([False, True])],
                SampleBatch.SEQ_LENS: [2, 2],
                "batch_id":
                4 * [4],
            }))

        for batch in batches:
            buffer.add(batch)

        num_sampled_dict = {_id: 0 for _id in range(1, 5)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            _id = sample["batch_id"][0]
            assert len(sample[SampleBatch.SEQ_LENS]) == 1
            num_sampled_dict[_id] += 1

        # Out of five sequences, we expect the last batch to show up twice as
        # often, because it contributed two sequences that are stored
        # separately.
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            [1 / 5, 1 / 5, 1 / 5, 2 / 5],
            atol=0.1,
        )

        # Add another batch to evict
        buffer.add(
            SampleBatch({
                SampleBatch.T: 5 * [np.random.random((4,))],
                SampleBatch.ACTIONS: 5 * [np.random.choice([0, 1])],
                SampleBatch.REWARDS: 5 * [np.random.rand()],
                SampleBatch.DONES: 5 * [np.random.choice([False, True])],
                SampleBatch.SEQ_LENS: [5],
                "batch_id": 5 * [5],
            }))

        # After adding 1 more batch, eviction has started; 15 timesteps were
        # added in total.
        assert len(buffer) == 5
        assert buffer._num_timesteps_added == sum(range(1, 6))
        assert buffer._num_timesteps_added_wrap == 5
        assert buffer._next_idx == 1
        assert buffer._eviction_started is True

        # The first batch should no longer be sampled; the other batches
        # should be sampled as before.
        num_sampled_dict = {_id: 0 for _id in range(2, 6)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            _id = sample["batch_id"][0]
            assert len(sample[SampleBatch.SEQ_LENS]) == 1
            num_sampled_dict[_id] += 1

        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            [1 / 5, 1 / 5, 2 / 5, 1 / 5],
            atol=0.1,
        )
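
The 2/5 expectation for the last batch follows directly from uniform sampling over stored sequences: it contributes two of the five sequences in the buffer. A minimal sketch of that arithmetic (plain Python, not an RLlib API call):

seq_counts = {1: 1, 2: 1, 3: 1, 4: 2}  # batch_id -> number of stored sequences
total = sum(seq_counts.values())       # 5 sequences in the buffer
expected = {i: n / total for i, n in seq_counts.items()}
# -> {1: 0.2, 2: 0.2, 3: 0.2, 4: 0.4}, i.e. [1/5, 1/5, 1/5, 2/5] as above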
Example #28
0
File: policy.py Project: alipay/ray
    def compute_single_action(
        self,
        obs: Optional[TensorStructType] = None,
        state: Optional[List[TensorType]] = None,
        *,
        prev_action: Optional[TensorStructType] = None,
        prev_reward: Optional[TensorStructType] = None,
        info: dict = None,
        input_dict: Optional[SampleBatch] = None,
        episode: Optional["Episode"] = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        # Kwargs placeholder for future compatibility.
        **kwargs,
    ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
        """Computes and returns a single (B=1) action value.

        Takes an input dict (usually a SampleBatch) as its main data input.
        This allows for using this method in case a more complex input pattern
        (view requirements) is needed, for example when the Model requires the
        last n observations, the last m actions/rewards, or a combination
        of any of these.
        Alternatively, in case no complex inputs are required, takes a single
        `obs` value (and possibly single state values, prev-action/reward
        values, etc.).

        Args:
            obs: Single observation.
            state: List of RNN state inputs, if any.
            prev_action: Previous action value, if any.
            prev_reward: Previous reward, if any.
            info: Info object, if any.
            input_dict: A SampleBatch or input dict containing the
                single (unbatched) Tensors to compute actions. If given, it'll
                be used instead of `obs`, `state`, `prev_action|reward`, and
                `info`.
            episode: This provides access to all of the internal episode state,
                which may be useful for model-based or multi-agent algorithms.
            explore: Whether to pick an exploitation or
                exploration action
                (default: None -> use self.config["explore"]).
            timestep: The current (sampling) time step.

        Keyword Args:
            kwargs: Forward compatibility placeholder.

        Returns:
            Tuple consisting of the action, the list of RNN state outputs (if
            any), and a dictionary of extra features (if any).
        """
        # Build the input-dict used for the call to
        # `self.compute_actions_from_input_dict()`.
        if input_dict is None:
            input_dict = {SampleBatch.OBS: obs}
            if state is not None:
                for i, s in enumerate(state):
                    input_dict[f"state_in_{i}"] = s
            if prev_action is not None:
                input_dict[SampleBatch.PREV_ACTIONS] = prev_action
            if prev_reward is not None:
                input_dict[SampleBatch.PREV_REWARDS] = prev_reward
            if info is not None:
                input_dict[SampleBatch.INFOS] = info

        # Batch all data in input dict.
        input_dict = tree.map_structure_with_path(
            lambda p, s: (
                s if p == "seq_lens"
                else s.unsqueeze(0) if torch and isinstance(s, torch.Tensor)
                else np.expand_dims(s, 0)
            ),
            input_dict,
        )

        episodes = None
        if episode is not None:
            episodes = [episode]

        out = self.compute_actions_from_input_dict(
            input_dict=SampleBatch(input_dict),
            episodes=episodes,
            explore=explore,
            timestep=timestep,
        )

        # Some policies don't return a tuple, but always just a single action.
        # E.g. ES and ARS.
        if not isinstance(out, tuple):
            single_action = out
            state_out = []
            info = {}
        # Normal case: Policy should return (action, state, info) tuple.
        else:
            batched_action, state_out, info = out
            single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        # Return action, internal state(s), infos.
        return (
            single_action,
            [s[0] for s in state_out],
            {k: v[0]
             for k, v in info.items()},
        )
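
For context, a hedged usage sketch of the signature above; `my_policy`, `obs`, and `prev_a` are hypothetical placeholders. The method batches the inputs, delegates to `compute_actions_from_input_dict()`, and unbatches the result:

# Simple case: a single observation, exploitation action.
action, state_outs, extra_fetches = my_policy.compute_single_action(
    obs=obs,
    explore=False,
)

# Alternative: pass a ready-made input dict (e.g. when view requirements such
# as previous actions are needed); this overrides `obs`, `state`, etc.
action, state_outs, extra_fetches = my_policy.compute_single_action(
    input_dict=SampleBatch({
        SampleBatch.OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_a,
    }),
)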
Example #29
0
def actor_critic_loss(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TorchDistributionWrapper],
    train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """Constructs the loss for the Soft Actor Critic.

    Args:
        policy: The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[TorchDistributionWrapper]): The action distr. class.
        train_batch: The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Look up the target model (tower) corresponding to this model tower.
    target_model = policy.target_models[model]

    # Should be True only for debugging purposes (e.g. test cases)!
    deterministic = policy.config["_deterministic_loss"]

    model_out_t, _ = model(
        SampleBatch(obs=train_batch[SampleBatch.CUR_OBS], _is_training=True),
        [], None)

    model_out_tp1, _ = model(
        SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True),
        [], None)

    target_model_out_tp1, _ = target_model(
        SampleBatch(obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True),
        [], None)

    alpha = torch.exp(model.log_alpha)

    # Discrete case.
    if model.discrete:
        # Get all action probs directly from pi and form their logp.
        action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
        log_pis_t = F.log_softmax(action_dist_inputs_t, dim=-1)
        policy_t = torch.exp(log_pis_t)
        action_dist_inputs_tp1, _ = model.get_action_model_outputs(
            model_out_tp1)
        log_pis_tp1 = F.log_softmax(action_dist_inputs_tp1, -1)
        policy_tp1 = torch.exp(log_pis_tp1)
        # Q-values.
        q_t, _ = model.get_q_values(model_out_t)
        # Target Q-values.
        q_tp1, _ = target_model.get_q_values(target_model_out_tp1)
        if policy.config["twin_q"]:
            twin_q_t, _ = model.get_twin_q_values(model_out_t)
            twin_q_tp1, _ = target_model.get_twin_q_values(
                target_model_out_tp1)
            q_tp1 = torch.min(q_tp1, twin_q_tp1)
        q_tp1 -= alpha * log_pis_tp1

        # Actually selected Q-values (from the actions batch).
        one_hot = F.one_hot(train_batch[SampleBatch.ACTIONS].long(),
                            num_classes=q_t.size()[-1])
        q_t_selected = torch.sum(q_t * one_hot, dim=-1)
        if policy.config["twin_q"]:
            twin_q_t_selected = torch.sum(twin_q_t * one_hot, dim=-1)
        # Discrete case: "Best" means weighted by the policy (prob) outputs.
        q_tp1_best = torch.sum(torch.mul(policy_tp1, q_tp1), dim=-1)
        q_tp1_best_masked = (
            1.0 - train_batch[SampleBatch.DONES].float()) * q_tp1_best
    # Continuous actions case.
    else:
        # Sample single actions from distribution.
        action_dist_class = _get_dist_class(policy, policy.config,
                                            policy.action_space)
        action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
        action_dist_t = action_dist_class(action_dist_inputs_t, model)
        policy_t = (action_dist_t.sample() if not deterministic else
                    action_dist_t.deterministic_sample())
        log_pis_t = torch.unsqueeze(action_dist_t.logp(policy_t), -1)
        action_dist_inputs_tp1, _ = model.get_action_model_outputs(
            model_out_tp1)
        action_dist_tp1 = action_dist_class(action_dist_inputs_tp1, model)
        policy_tp1 = (action_dist_tp1.sample() if not deterministic else
                      action_dist_tp1.deterministic_sample())
        log_pis_tp1 = torch.unsqueeze(action_dist_tp1.logp(policy_tp1), -1)

        # Q-values for the actually selected actions.
        q_t, _ = model.get_q_values(model_out_t,
                                    train_batch[SampleBatch.ACTIONS])
        if policy.config["twin_q"]:
            twin_q_t, _ = model.get_twin_q_values(
                model_out_t, train_batch[SampleBatch.ACTIONS])

        # Q-values for current policy in given current state.
        q_t_det_policy, _ = model.get_q_values(model_out_t, policy_t)
        if policy.config["twin_q"]:
            twin_q_t_det_policy, _ = model.get_twin_q_values(
                model_out_t, policy_t)
            q_t_det_policy = torch.min(q_t_det_policy, twin_q_t_det_policy)

        # Target q network evaluation.
        q_tp1, _ = target_model.get_q_values(target_model_out_tp1, policy_tp1)
        if policy.config["twin_q"]:
            twin_q_tp1, _ = target_model.get_twin_q_values(
                target_model_out_tp1, policy_tp1)
            # Take min over both twin-NNs.
            q_tp1 = torch.min(q_tp1, twin_q_tp1)

        q_t_selected = torch.squeeze(q_t, dim=-1)
        if policy.config["twin_q"]:
            twin_q_t_selected = torch.squeeze(twin_q_t, dim=-1)
        q_tp1 -= alpha * log_pis_tp1

        q_tp1_best = torch.squeeze(input=q_tp1, dim=-1)
        q_tp1_best_masked = (
            1.0 - train_batch[SampleBatch.DONES].float()) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           (policy.config["gamma"]**policy.config["n_step"]) *
                           q_tp1_best_masked).detach()

    # Compute the TD-error (potentially clipped).
    base_td_error = torch.abs(q_t_selected - q_t_selected_target)
    if policy.config["twin_q"]:
        twin_td_error = torch.abs(twin_q_t_selected - q_t_selected_target)
        td_error = 0.5 * (base_td_error + twin_td_error)
    else:
        td_error = base_td_error

    critic_loss = [
        torch.mean(train_batch[PRIO_WEIGHTS] * huber_loss(base_td_error))
    ]
    if policy.config["twin_q"]:
        critic_loss.append(
            torch.mean(train_batch[PRIO_WEIGHTS] * huber_loss(twin_td_error)))

    # Alpha- and actor losses.
    # Note: In the papers, alpha is used directly; here we take the log.
    # Discrete case: Multiply the action probs as weights with the original
    # loss terms (no expectations needed).
    if model.discrete:
        weighted_log_alpha_loss = policy_t.detach() * (
            -model.log_alpha * (log_pis_t + model.target_entropy).detach())
        # Sum up weighted terms and mean over all batch items.
        alpha_loss = torch.mean(torch.sum(weighted_log_alpha_loss, dim=-1))
        # Actor loss.
        actor_loss = torch.mean(
            torch.sum(
                torch.mul(
                    # NOTE: No stop_grad around policy output here
                    # (compare with q_t_det_policy for continuous case).
                    policy_t,
                    alpha.detach() * log_pis_t - q_t.detach(),
                ),
                dim=-1,
            ))
    else:
        alpha_loss = -torch.mean(model.log_alpha *
                                 (log_pis_t + model.target_entropy).detach())
        # Note: Do not detach q_t_det_policy here b/c it depends partly
        # on the policy vars (policy sample pushed through Q-net).
        # However, we must make sure `actor_loss` is not used to update
        # the Q-net(s)' variables.
        actor_loss = torch.mean(alpha.detach() * log_pis_t - q_t_det_policy)

    # Store values for stats function in model (tower), such that for
    # multi-GPU, we do not override them during the parallel loss phase.
    model.tower_stats["q_t"] = q_t
    model.tower_stats["policy_t"] = policy_t
    model.tower_stats["log_pis_t"] = log_pis_t
    model.tower_stats["actor_loss"] = actor_loss
    model.tower_stats["critic_loss"] = critic_loss
    model.tower_stats["alpha_loss"] = alpha_loss

    # TD-error tensor in final stats
    # will be concatenated and retrieved for each individual batch item.
    model.tower_stats["td_error"] = td_error

    # Return all loss terms corresponding to our optimizers.
    return tuple([actor_loss] + critic_loss + [alpha_loss])
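
The Bellman target above reads r_t + gamma^n_step * (1 - done) * (min_i Q'_i(s_{t+1}, a') - alpha * log pi(a'|s_{t+1})). A minimal numpy sketch of that arithmetic with toy (hypothetical) values, not RLlib API:

import numpy as np

gamma, n_step, alpha = 0.99, 1, 0.2
rewards = np.array([1.0, 0.5])
dones = np.array([0.0, 1.0])
q_tp1 = np.array([10.0, 8.0])         # min over the (twin) target Q heads
log_pis_tp1 = np.array([-1.2, -0.7])  # log pi(a'|s') of freshly sampled actions
q_t_selected = np.array([9.0, 1.0])   # Q(s, a) for the actions in the batch

q_tp1_best_masked = (1.0 - dones) * (q_tp1 - alpha * log_pis_tp1)
q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked
td_error = np.abs(q_t_selected - q_t_selected_target)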
Example #30
0
    def postprocess_trajectory(self,
                               policy: "Policy",
                               sample_batch: SampleBatch,
                               tf_sess: Optional["tf.Session"] = None):
        noisy_action_dist = noise_free_action_dist = None
        # Adjust the stddev depending on the action (pi)-distance.
        # Also see [1] for details.
        # TODO(sven): Find out whether this can be scrapped by simply using
        #  the `sample_batch` to get the noisy/noise-free action dist.
        _, _, fetches = policy.compute_actions(
            obs_batch=sample_batch[SampleBatch.CUR_OBS],
            # TODO(sven): What about state-ins and seq-lens?
            prev_action_batch=sample_batch.get(SampleBatch.PREV_ACTIONS),
            prev_reward_batch=sample_batch.get(SampleBatch.PREV_REWARDS),
            explore=self.weights_are_currently_noisy)

        # Categorical case (e.g. DQN).
        if policy.dist_class in (Categorical, TorchCategorical):
            action_dist = softmax(fetches[SampleBatch.ACTION_DIST_INPUTS])
        # Deterministic (Gaussian actions, e.g. DDPG).
        elif policy.dist_class in [Deterministic, TorchDeterministic]:
            action_dist = fetches[SampleBatch.ACTION_DIST_INPUTS]
        else:
            raise NotImplementedError  # TODO(sven): Other action-dist cases.

        if self.weights_are_currently_noisy:
            noisy_action_dist = action_dist
        else:
            noise_free_action_dist = action_dist

        _, _, fetches = policy.compute_actions(
            obs_batch=sample_batch[SampleBatch.CUR_OBS],
            prev_action_batch=sample_batch.get(SampleBatch.PREV_ACTIONS),
            prev_reward_batch=sample_batch.get(SampleBatch.PREV_REWARDS),
            explore=not self.weights_are_currently_noisy)

        # Categorical case (e.g. DQN).
        if policy.dist_class in (Categorical, TorchCategorical):
            action_dist = softmax(fetches[SampleBatch.ACTION_DIST_INPUTS])
        # Deterministic (Gaussian actions, e.g. DDPG).
        elif policy.dist_class in [Deterministic, TorchDeterministic]:
            action_dist = fetches[SampleBatch.ACTION_DIST_INPUTS]

        if noisy_action_dist is None:
            noisy_action_dist = action_dist
        else:
            noise_free_action_dist = action_dist

        delta = distance = None
        # Categorical case (e.g. DQN).
        if policy.dist_class in (Categorical, TorchCategorical):
            # Calculate KL-divergence (DKL(clean||noisy)) according to [2].
            # TODO(sven): Allow KL-divergence to be calculated by our
            #  Distribution classes (don't support off-graph/numpy yet).
            distance = np.nanmean(
                np.sum(
                    noise_free_action_dist *
                    np.log(noise_free_action_dist /
                           (noisy_action_dist + SMALL_NUMBER)), 1))
            current_epsilon = self.sub_exploration.get_info(
                sess=tf_sess)["cur_epsilon"]
            delta = -np.log(1 - current_epsilon +
                            current_epsilon / self.action_space.n)
        elif policy.dist_class in [Deterministic, TorchDeterministic]:
            # Calculate MSE between noisy and non-noisy output (see [2]).
            distance = np.sqrt(
                np.mean(np.square(noise_free_action_dist - noisy_action_dist)))
            current_scale = self.sub_exploration.get_info(
                sess=tf_sess)["cur_scale"]
            delta = getattr(self.sub_exploration, "ou_sigma", 0.2) * \
                current_scale

        # Adjust stddev according to the calculated action-distance.
        if distance <= delta:
            self.stddev_val *= 1.01
        else:
            self.stddev_val /= 1.01

        # Set self.stddev to calculated value.
        if self.framework == "tf":
            self.stddev.load(self.stddev_val, session=tf_sess)
        else:
            self.stddev = self.stddev_val

        return sample_batch
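
A minimal numpy sketch of the categorical branch above (toy distributions; `SMALL_NUMBER` and the epsilon value are assumptions): the KL divergence between the noise-free and noisy action distributions is compared against delta = -log(1 - eps + eps / n), and the noise stddev is nudged by 1%:

import numpy as np

SMALL_NUMBER = 1e-8  # assumed small constant guarding the division
noise_free = np.array([[0.7, 0.2, 0.1]])  # clean action distribution
noisy = np.array([[0.5, 0.3, 0.2]])       # distribution under noisy weights

distance = np.nanmean(
    np.sum(noise_free * np.log(noise_free / (noisy + SMALL_NUMBER)), 1))
eps, num_actions = 0.05, 3                # hypothetical epsilon and |A|
delta = -np.log(1 - eps + eps / num_actions)

stddev_val = 1.0
stddev_val = stddev_val * 1.01 if distance <= delta else stddev_val / 1.01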