Example No. 1
def _generate_time_step(batched,
                        observation,
                        step_type,
                        discount,
                        prev_action=None,
                        action_spec=None,
                        reward=None,
                        reward_spec=ts.TensorSpec(()),
                        env_id=None,
                        env_info={}):

    # Build a TimeStep from the given fields, using numpy ops when the
    # observation contains numpy arrays and torch ops when it contains tensors.
    flat_observation = nest.flatten(observation)

    if all(map(_is_numpy_array, flat_observation)):
        md = np
        if reward is not None:
            reward = np.float32(reward)
        discount = np.float32(discount)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")
        md = torch
        if reward is not None:
            reward = to_tensor(reward, dtype=torch.float32)
        discount = to_tensor(discount, dtype=torch.float32)

    if batched:
        batch_size = flat_observation[0].shape[0]
        outer_dims = (batch_size, )
        if env_id is None:
            env_id = md.arange(batch_size, dtype=md.int32)
        if reward is not None:
            assert reward.shape[:1] == outer_dims
        if prev_action is not None:
            flat_action = nest.flatten(prev_action)
            assert flat_action[0].shape[:1] == outer_dims
    else:
        outer_dims = ()
        if env_id is None:
            env_id = md.zeros((), dtype=md.int32)

    step_type = md.full(outer_dims, step_type, dtype=md.int32)
    if reward is None:
        reward = md.zeros(outer_dims + reward_spec.shape, dtype=md.float32)
    discount = md.ones(outer_dims, dtype=md.float32) * discount
    if prev_action is None:
        prev_action = nest.map_structure(
            lambda spec: md.zeros(outer_dims + spec.shape,
                                  dtype=getattr(
                                      md, ts.torch_dtype_to_str(spec.dtype))),
            action_spec)

    return TimeStep(step_type,
                    reward,
                    discount,
                    observation,
                    prev_action,
                    env_id,
                    env_info=env_info)
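A minimal usage sketch (not part of the library) showing how ``_generate_time_step`` could be called with batched torch tensors; the shapes and values below are hypothetical:

import torch

# Hypothetical batch of 4 observations and previous actions.
obs = torch.zeros(4, 3)
prev_action = torch.zeros(4, 2)
time_step = _generate_time_step(
    batched=True,
    observation=obs,
    step_type=StepType.MID,
    discount=1.0,
    prev_action=prev_action,
    reward=torch.zeros(4))
# time_step.env_id == torch.arange(4) and time_step.discount has shape (4,).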
Example No. 2
    def _step(self, time_step: TimeStep, state, calc_rewards=True):
        """
        Args:
            time_step (TimeStep): input time step data, where the
                observation is a skill-augmented observation. The skill should be
                a one-hot vector.
            state (Tensor): state for DIAYN (previous skill) which should be
                a one-hot vector.
            calc_rewards (bool): if False, only return the losses.

        Returns:
            AlgStep:
                output: empty tuple ()
                state: skill
                info (DIAYNInfo):
        """
        observations_aug = time_step.observation
        step_type = time_step.step_type
        observation, skill = observations_aug
        prev_skill = state.detach()

        # normalize observation for easier prediction
        if self._observation_normalizer is not None:
            observation = self._observation_normalizer.normalize(observation)

        if self._encoding_net is not None:
            feature, _ = self._encoding_net(observation)
        else:
            # Without an encoding net, use the (normalized) observation as the
            # feature directly; otherwise ``feature`` would be undefined below.
            feature = observation

        skill_pred, _ = self._discriminator_net(feature)

        if self._skill_spec.is_discrete:
            loss = torch.nn.CrossEntropyLoss(reduction='none')(
                input=skill_pred, target=torch.argmax(prev_skill, dim=-1))
        else:
            # nn.MSELoss doesn't support reducing along a dim
            loss = torch.sum(math_ops.square(skill_pred - prev_skill), dim=-1)

        valid_masks = (step_type != to_tensor(StepType.FIRST)).to(
            torch.float32)
        loss *= valid_masks

        intrinsic_reward = ()
        if calc_rewards:
            intrinsic_reward = -loss.detach()
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgStep(
            output=(),
            state=skill,
            info=DIAYNInfo(reward=intrinsic_reward, loss=loss))
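To illustrate just the discrete-skill branch of the loss above, here is a standalone sketch with hypothetical shapes (batch of 4, 8 skills); the real call feeds discriminator-network logits instead of random ones:

import torch

skill_pred = torch.randn(4, 8)                 # stand-in for the discriminator logits
prev_skill = torch.nn.functional.one_hot(
    torch.tensor([2, 5, 0, 7]), num_classes=8).to(torch.float32)
loss = torch.nn.CrossEntropyLoss(reduction='none')(
    input=skill_pred, target=torch.argmax(prev_skill, dim=-1))
intrinsic_reward = -loss.detach()              # larger when the skill is easier to predict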
Example No. 3
    def __init__(self, env, reward_weights=None):
        """
        Args:
            env (AlfEnvironment): An AlfEnvironment instance to be wrapped.
            reward_weights (list[float] | tuple[float]): a list/tuple of weights
                for the rewards; if None, then the first dimension will be 1 and
                the other dimensions will be 0s.
        """
        super(ScalarRewardWrapper, self).__init__(env)
        reward_spec = env.reward_spec()
        assert reward_spec.ndim == 1, (
            "This wrapper only supports vector rewards! Reward tensor rank: %d"
            % reward_spec.ndim)

        rewards_n = reward_spec.shape[0]
        if reward_weights is None:
            reward_weights = [1.] + [0.] * (rewards_n - 1)
        assert (isinstance(reward_weights, (list, tuple))
                and len(reward_weights) == rewards_n)
        self._np_reward_weights = np.array(reward_weights)
        self._tensor_reward_weights = to_tensor(reward_weights)
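Presumably the wrapper later applies these weights to the vector reward via a dot product in its step/reset methods; a standalone sketch of that combination with hypothetical numbers:

import numpy as np

reward_weights = [1., 0.5, 0.]                  # hypothetical weights
vector_reward = np.array([2.0, -1.0, 3.0])      # hypothetical 3-dimensional reward
scalar_reward = np.dot(np.array(reward_weights), vector_reward)  # -> 1.5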
Example No. 4
def termination(observation, prev_action, reward, env_id=None, env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set to ``StepType.LAST``.

    Called by ``env.step()`` when the episode is 'Done'. ``discount`` should not
    be passed in; it will be set to 0.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the environment
            ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's statically known rank
            is not 0 or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.LAST
        discount = np.float32(0.0)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")

        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.LAST, dtype=torch.int32)
        discount = torch.full(shape, 0.0, dtype=torch.float32)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
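A minimal usage sketch with unbatched NumPy inputs (hypothetical values):

import numpy as np

obs = np.zeros(3, dtype=np.float32)
prev_action = np.zeros(2, dtype=np.float32)
final_step = termination(obs, prev_action, reward=1.0)
# final_step.step_type == StepType.LAST and final_step.discount == 0.0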
Example No. 5
def transition(observation,
               prev_action,
               reward,
               discount=1.0,
               env_id=None,
               env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.MID``.

    Called by ``env.step()`` when the episode is not 'Done'.

    The batch size is inferred from the shape of ``reward``.

    If ``discount`` is a scalar, and ``observation`` contains tensors,
    then ``discount`` will be broadcasted to match ``reward.shape``.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        discount (float): (optional) A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the environment
            ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's rank
            is not 0 or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.MID
        discount = np.float32(discount)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")

        # TODO: if reward has rank 2 and static batch sizes are available
        # for both flat_observation and reward, check that these match.
        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.MID, dtype=torch.int32)
        discount = to_tensor(discount, dtype=torch.float32)

        if discount.dim() == 0:
            discount = torch.full(shape, discount, dtype=torch.float32)
        else:
            assert reward.shape == discount.shape
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
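A minimal batched usage sketch with torch tensors (hypothetical shapes), showing the discount broadcast described in the docstring:

import torch

obs = torch.zeros(2, 3)
prev_action = torch.zeros(2, 4)
mid_step = transition(
    obs, prev_action, reward=torch.tensor([0.5, 1.0]), discount=0.99)
# mid_step.discount is broadcast to shape (2,) and mid_step.env_id == torch.arange(2)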
Example No. 6
    def _create_trajectories(self):
        # Order of args for timestep_* methods:
        # reward, env_id, env_info
        ts0 = timestep_first([0, 0], [1, 2],
                             dict(x=to_tensor([1, 0]), y=to_tensor([1, 1])))
        ts1 = timestep_mid([1, 2], [1, 2],
                           dict(x=to_tensor([1, 2]), y=to_tensor([0, 3])))
        ts2 = timestep_last([3, 4], [1, 2],
                            dict(x=to_tensor([-1, -2]), y=to_tensor([1, -1])))
        ts3 = timestep_first([0, 0], [1, 2],
                             dict(x=to_tensor([1, 1]), y=to_tensor([1, 1])))
        ts4 = timestep_mid([5, 6], [1, 2],
                           dict(x=to_tensor([2, -2]), y=to_tensor([-1, -6])))
        ts5 = timestep_last([7, 8], [1, 2],
                            dict(x=to_tensor([10, 10]), y=to_tensor([5, 5])))

        return [ts0, ts1, ts2, ts3, ts4, ts5]
Example No. 7
def _create_timestep(reward, env_id, step_type, env_info):
    return TimeStep(step_type=to_tensor(step_type),
                    reward=to_tensor(reward),
                    env_info=env_info,
                    env_id=to_tensor(env_id))
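The ``timestep_first/mid/last`` helpers used in Example No. 6 presumably wrap this factory with a fixed step type; a sketch of that assumption (not the library's actual definitions):

def timestep_mid(reward, env_id, env_info):
    # Hypothetical wrapper: marks every entry in the batch as a MID step.
    return _create_timestep(reward, env_id, [StepType.MID] * len(reward), env_info)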