Example #1
def test_concat(data_format):
    # Create some rollouts with random rewards
    ros = [
        StepSequence(rewards=np.random.randn(5),
                     observations=np.random.randn(6),
                     actions=np.random.randn(5),
                     policy_infos={'mean': np.random.randn(5)},
                     hidden=(np.random.randn(5), np.random.randn(5)),
                     data_format=data_format),
        StepSequence(rewards=np.random.randn(5),
                     observations=np.random.randn(6),
                     actions=np.random.randn(5),
                     policy_infos={'mean': np.random.randn(5)},
                     hidden=(np.random.randn(5), np.random.randn(5)),
                     data_format=data_format)
    ]

    # Perform concatenation
    cat = StepSequence.concat(ros)

    assert cat.continuous
    assert cat.rollout_count == 2

    # Check steps
    for step_ro, step_cat in zip(itertools.chain.from_iterable(ros), cat):
        assert step_ro.reward == step_cat.reward
        assert step_ro.observation == step_cat.observation
        assert step_ro.done == step_cat.done
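
For context, a minimal standalone sketch of the concatenation API exercised above. The import path for `StepSequence` (`pyrado.sampling.step_sequence`) is assumed here, and the required-fields override mirrors the other tests in this listing:

import numpy as np

from pyrado.sampling.step_sequence import StepSequence  # import path assumed

# Mirror the tests in this listing: no additional required fields for this sketch
StepSequence.required_fields = {}

# Two rollouts of length 5 (one more observation than rewards/actions)
ro_a = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6),
                    actions=np.random.randn(5), data_format='numpy')
ro_b = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6),
                    actions=np.random.randn(5), data_format='numpy')

# Concatenate; the result is one continuous sequence that remembers the rollout boundaries
cat = StepSequence.concat([ro_a, ro_b])
assert cat.continuous
assert cat.rollout_count == 2
assert cat.get_rollout(1)[0].reward == ro_b[0].reward  # the second part starts with ro_b's first step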
Example #2
def test_replay_memory(capacity):
    rm = ReplayMemory(capacity)

    # Create fake rollouts (of length 5)
    ro1 = StepSequence(rewards=rewards,
                       observations=observations,
                       actions=actions,
                       hidden=hidden)
    ro2 = StepSequence(rewards=rewards,
                       observations=observations,
                       actions=actions,
                       hidden=hidden)
    # Concatenate them for testing only
    ros = StepSequence.concat(
        [ro1, ro2],
        truncate_last=True)  # same truncate_last behavior as push function

    # Check the lengths
    rm.push(ro1)
    assert len(rm) == len(ro1) or len(rm) == capacity
    rm.push(ro2)
    assert len(rm) == len(ro1) + len(ro2) or len(rm) == capacity

    # Check the elements
    shift = len(ros) - capacity
    if shift < len(ro1):
        assert all(rm.memory.observations[0] == ros.observations[shift])
    assert all(rm.memory.observations[-1] ==
               ro2.observations[-2])  # -2 since one was truncated
Example #3
def test_split_multi(data_format):
    # Don't require additional fields for this test
    StepSequence.required_fields = {}

    ro = StepSequence(rewards=np.arange(20),
                      rollout_bounds=[0, 4, 11, 17, 20],
                      data_format=data_format)

    # There should be four parts
    assert ro.rollout_count == 4
    # Of these sizes
    assert list(ro.rollout_lengths) == [4, 7, 6, 3]

    # Test selecting one
    s1 = ro.get_rollout(1)
    assert s1.rollout_count == 1
    assert s1[0].reward == ro[4].reward

    # Test selecting a slice
    s2 = ro.get_rollout(slice(1, -1))
    assert s2.rollout_count == 2
    assert s2[0].reward == ro[4].reward
    assert s2[7].reward == ro[11].reward

    # Test selecting by list
    s2 = ro.get_rollout([1, 3])
    assert s2.rollout_count == 2
    assert s2[0].reward == ro[4].reward
    assert s2[7].reward == ro[17].reward
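
The asserted rollout lengths are simply the consecutive differences of the `rollout_bounds` passed to the constructor; a quick NumPy check of that arithmetic:

import numpy as np

# rollout_bounds from the test above: parts are [0, 4), [4, 11), [11, 17), [17, 20)
bounds = np.array([0, 4, 11, 17, 20])
print(np.diff(bounds))  # [4 7 6 3], matching ro.rollout_lengths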
Example #4
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec,
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env,
                                               policy,
                                               num_workers=1,
                                               min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env,
                                            policy,
                                            num_workers=1,
                                            min_steps=1000)

    losses = []
    for i in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #5
    def loss_fcn(self, rollout_real: StepSequence,
                 rollout_sim: StepSequence) -> float:
        """
        Compute the discrepancy between two time sequences of observations using the given metric.
        Be sure to align and truncate the rollouts beforehand.

        :param rollout_real: (concatenated) real-world rollout containing the observations
        :param rollout_sim: (concatenated) simulated rollout containing the observations
        :return: discrepancy cost summed over the observation dimensions
        """
        if len(rollout_real) != len(rollout_sim):
            raise pyrado.ShapeErr(given=rollout_real,
                                  expected_match=rollout_sim)

        # Extract the observations
        real_obs = rollout_real.get_data_values("observations",
                                                truncate_last=True)
        sim_obs = rollout_sim.get_data_values("observations",
                                              truncate_last=True)

        # Filter the observations
        real_obs = gaussian_filter1d(real_obs, self.std_obs_filt, axis=0)
        sim_obs = gaussian_filter1d(sim_obs, self.std_obs_filt, axis=0)

        # Normalize the signals
        real_obs_norm = self.obs_normalizer.project_to(real_obs)
        sim_obs_norm = self.obs_normalizer.project_to(sim_obs)

        # Compute loss based on the error
        loss_per_obs_dim = self.metric(real_obs_norm - sim_obs_norm)
        assert len(loss_per_obs_dim) == real_obs.shape[1]
        assert all(loss_per_obs_dim >= 0)
        return sum(loss_per_obs_dim)
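
A self-contained sketch of the same discrepancy computation on plain NumPy arrays; a sum of squared errors over time stands in for `self.metric`, and the normalization step is skipped since `self.obs_normalizer` is configured elsewhere:

import numpy as np
from scipy.ndimage import gaussian_filter1d

rng = np.random.default_rng(0)
real_obs = rng.normal(size=(100, 3))  # 100 time steps, 3 observation dimensions
sim_obs = rng.normal(size=(100, 3))

# Smooth each observation dimension over time (axis 0), as in loss_fcn above
std_obs_filt = 5
real_obs = gaussian_filter1d(real_obs, std_obs_filt, axis=0)
sim_obs = gaussian_filter1d(sim_obs, std_obs_filt, axis=0)

# Example metric: sum of squared errors over time, one value per observation dimension
loss_per_obs_dim = np.sum((real_obs - sim_obs) ** 2, axis=0)
assert loss_per_obs_dim.shape == (3,) and np.all(loss_per_obs_dim >= 0)
loss = np.sum(loss_per_obs_dim)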
Example #6
    def train(self, reference_trajectory: StepSequence,
              randomized_trajectory: StepSequence,
              num_epoch: int) -> to.Tensor:
        """
        Train the discriminator to distinguish transitions of the reference rollout from the randomized one.

        :param reference_trajectory: (concatenated) rollout from the reference environment
        :param randomized_trajectory: (concatenated) rollout from the randomized environment
        :param num_epoch: number of training epochs
        :return: discriminator loss of the final epoch
        """

        reference_batch = reference_trajectory.split_shuffled_batches(
            self.batch_size)
        random_batch = randomized_trajectory.split_shuffled_batches(
            self.batch_size)

        for _ in tqdm(range(num_epoch), 'Discriminator Epoch', num_epoch):
            try:
                reference_batch_now = convert_step_sequence(
                    next(reference_batch))
                random_batch_now = convert_step_sequence(next(random_batch))
            except StopIteration:
                break
            if (reference_batch_now.shape[0] < self.batch_size - 1
                    or random_batch_now.shape[0] < self.batch_size - 1):
                break
            random_results = self.discriminator(random_batch_now)
            reference_results = self.discriminator(reference_batch_now)
            self.optimizer.zero_grad()
            loss = self.loss_fcn(
                random_results, to.ones(self.batch_size - 1)) + self.loss_fcn(
                    reference_results, to.zeros(self.batch_size - 1))
            loss.backward()
            self.optimizer.step()

            # Logging
            if self.logger is not None:
                self.logger.add_value('discriminator_loss', loss)
        return loss
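
A toy, standalone version of the loss construction in the loop above, assuming `self.loss_fcn` is a binary cross-entropy criterion and `self.discriminator` maps one transition to one probability (both are set up elsewhere in the class, so the concrete modules and the input dimension below are placeholders):

import torch as to

batch_size = 8
discriminator = to.nn.Sequential(to.nn.Linear(6, 16), to.nn.Tanh(), to.nn.Linear(16, 1), to.nn.Sigmoid())
loss_fcn = to.nn.BCELoss()

random_batch_now = to.randn(batch_size - 1, 6)     # transitions from the randomized env
reference_batch_now = to.randn(batch_size - 1, 6)  # transitions from the reference env

random_results = discriminator(random_batch_now).squeeze(1)
reference_results = discriminator(reference_batch_now).squeeze(1)

# Label randomized transitions with 1 and reference transitions with 0, as in train() above
loss = loss_fcn(random_results, to.ones(batch_size - 1)) + loss_fcn(reference_results, to.zeros(batch_size - 1))
loss.backward()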
Example #7
    def push(self, ros: Union[list, StepSequence], truncate_last: bool = True):
        """
        Save a sequence of steps, dropping the oldest steps if the capacity is exceeded.

        :param ros: list of rollouts or one concatenated rollout
        :param truncate_last: remove the last step from each rollout, forwarded to `StepSequence.concat`
        """
        if isinstance(ros, list):
            # Concatenate given rollouts if necessary
            ros = StepSequence.concat(ros)
        elif isinstance(ros, StepSequence):
            pass
        else:
            raise pyrado.TypeErr(given=ros, expected_type=[list, StepSequence])

        # Add new steps
        if self.isempty:
            self._memory = deepcopy(ros)  # on the very first call
        else:
            self._memory = StepSequence.concat([self._memory, ros], truncate_last=truncate_last)

        num_surplus = self._memory.length - self.capacity
        if num_surplus > 0:
            # Drop surplus of old steps
            self._memory = self._memory[num_surplus:]
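
The capacity bookkeeping at the end of `push()` with concrete numbers, using a plain list as a stand-in for the step memory:

capacity = 8
memory = list(range(10))  # pretend 10 steps are stored after concatenation

num_surplus = len(memory) - capacity
if num_surplus > 0:
    # Drop the oldest steps, keeping the most recent `capacity` ones
    memory = memory[num_surplus:]

assert len(memory) == capacity and memory[0] == 2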
Example #8
def test_action_statistics(env: SimEnv, policy: Policy):
    sigma = 1.0  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    ro_policy = rollout(env,
                        policy,
                        eval=True,
                        max_steps=1000,
                        stop_on_done=False,
                        seed=0)
    ro_policy.torch(to.get_default_dtype())

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.0] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states,
    )
    ro_expl.torch()

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions,
                         scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs,
                               gt_logprobs,
                               rtol=1e-4,
                               atol=1e-5)
    to.testing.assert_allclose(actstats.entropy,
                               gt_entropy,
                               rtol=1e-4,
                               atol=1e-5)
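
The ground-truth values above come directly from `torch.distributions.Normal`; a tiny standalone check of the quantities being compared:

import math

import torch as to
from torch.distributions import Normal

sigma = 1.0
mean = to.zeros(3)
act = to.tensor([0.5, -0.2, 1.0])

dist = Normal(loc=mean, scale=sigma)
log_probs = dist.log_prob(act)  # element-wise Gaussian log-density of the explored actions
entropy = dist.entropy()        # 0.5 * log(2 * pi * e * sigma^2) per action dimension

to.testing.assert_allclose(entropy, to.full((3,), 0.5 * math.log(2 * math.pi * math.e * sigma ** 2)))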
Example #9
def test_add_data(data_format):
    ro = StepSequence(rewards=rewards,
                      observations=observations,
                      actions=actions,
                      policy_infos=policy_infos,
                      hidden=hidden,
                      data_format=data_format)
    # Add a data field
    ro.add_data('return', discounted_value(ro, 0.9))
    assert hasattr(ro, 'return')

    # Query new data field from steps
    assert abs(ro[2]['return'] - -86.675) < 0.01
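
For reference, the quantity `discounted_value` is expected to produce is the discounted return per step; a plain-NumPy illustration (not pyrado's implementation):

import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0, -2.0, 3.0])

# Discounted return at every step t: sum_k gamma^k * r_{t+k}
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running

print(returns)  # approximately [1.567, 0.63, 0.7, 3.0]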
Example #10
def convert_step_sequence(traj: StepSequence):
    """
    Converts a StepSequence to a Tensor which can be fed through a Network

    :param traj: A step sequence containing a trajectory
    :return: A Tensor containing the trajectory
    """
    assert isinstance(traj, StepSequence)
    traj.torch()
    state = traj.get_data_values('observations')[:-1].double()
    next_state = traj.get_data_values('observations')[1::].double()
    action = traj.get_data_values('actions').narrow(
        0, 0, next_state.shape[0]).double()
    traj = to.cat((state, next_state, action), 1).cpu().double()
    return traj
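
The returned tensor stacks current observations, next observations, and actions column-wise; a plain-PyTorch sketch of the shape arithmetic, without pyrado:

import torch as to

T, obs_dim, act_dim = 10, 4, 2
observations = to.randn(T + 1, obs_dim)  # rollouts store one more observation than actions
actions = to.randn(T, act_dim)

state = observations[:-1]                           # [T, obs_dim]
next_state = observations[1:]                       # [T, obs_dim]
action = actions.narrow(0, 0, next_state.shape[0])  # [T, act_dim]

out = to.cat((state, next_state, action), 1)
assert out.shape == (T, 2 * obs_dim + act_dim)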
Example #11
    def evaluate(self,
                 rollout: StepSequence,
                 hidden_states_name: str = 'hidden_states') -> to.Tensor:
        """
        Re-evaluate the given rollout and return a derivable action tensor.
        This method makes sure that the gradient is propagated through the hidden state.

        :param rollout: complete rollout
        :param hidden_states_name: name of hidden states rollout entry, used for recurrent networks.
                                   Change this string for value functions.
        :return: actions with gradient data
        """
        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hs = ro[0][hidden_states_name]
            else:
                # Let the network pick the default hidden state
                hs = None

            # Run each step separately
            for step in ro:
                act, hs = self(step.observation, hs)
                act_list.append(act)

        return to.stack(act_list)
Example #12
def test_create_rew_only():
    # Don't require additional fields for this test
    StepSequence.required_fields = {}

    ro = StepSequence(rewards=rewards, data_format='numpy')
    assert len(ro) == 5
    assert (ro.rewards == np.array(rewards)).all()
Example #13
def test_create(mock_data, data_format, tensor_type):
    rewards, states, observations, actions, hidden, policy_infos = mock_data

    # With actions, observations and dicts
    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        policy_infos=policy_infos,
        hidden=hidden,
        data_format=data_format,
    )
    assert len(ro) == 5

    assert isinstance(ro.rewards, tensor_type)
    assert isinstance(ro.observations, tensor_type)
    assert isinstance(ro.actions, tensor_type)
    assert isinstance(ro.policy_infos["mean"], tensor_type)
    assert isinstance(ro.policy_infos["std"], tensor_type)
    assert isinstance(ro.hidden[0], tensor_type)

    # Done should always be a ndarray
    assert isinstance(ro.done, np.ndarray)
    assert not ro.done[:-1].any()
    assert ro.done[-1]
Example #14
def test_step_iter(mock_data, data_format: str):
    rewards, states, observations, actions, hidden, policy_infos = mock_data

    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        policy_infos=policy_infos,
        hidden=hidden,
        data_format=data_format,
    )

    assert len(ro) == 5

    for i, step in enumerate(ro):
        assert step.reward == rewards[i]
        # Check current and next
        assert (step.observation == to_format(observations[i],
                                              data_format)).all()
        assert (step.next_observation == to_format(observations[i + 1],
                                                   data_format)).all()
        # Check dict sub element
        assert (step.policy_info.mean == to_format(policy_infos[i]["mean"],
                                                   data_format)).all()
        assert (step.hidden[0] == to_format(hidden[i][0], data_format)).all()
Example #15
    def evaluate(self,
                 rollout: StepSequence,
                 hidden_states_name: str = 'hidden_states') -> to.Tensor:
        if not rollout.data_format == 'torch':
            raise pyrado.TypeErr(
                msg='The rollout data passed to evaluate() must be of type torch.Tensor!')
        if not rollout.continuous:
            raise pyrado.ValueErr(
                msg='The rollout data passed to evaluate() must come from a continuous rollout!')

        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        self.eval()

        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = ro[0][hidden_states_name]
            else:
                # Let the network pick the default hidden state
                hidden = None

            # Run steps consecutively reusing the hidden state
            for step in ro:
                act, hidden = self(step.observation, hidden)
                act_list.append(act)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        self.train()

        return to.stack(act_list)
Example #16
    def update(self, *args: Any, **kwargs: Any):
        """Update the policy's (and value functions') parameters based on the collected rollout data."""
        obss = []
        losses = []
        for t in range(self.num_teachers):
            concat_ros = StepSequence.concat(kwargs["rollouts"][t])
            concat_ros.torch(data_type=to.get_default_dtype())
            obss.append(concat_ros.get_data_values("observations")[: self.min_steps])

        # Train student
        for epoch in range(self.num_epochs):
            self.optimizer.zero_grad()

            loss = 0
            for t_idx, teacher in enumerate(self.teacher_policies):
                s_dist = self.expl_strat.action_dist_at(self.policy(obss[t_idx]))
                s_act = s_dist.sample()
                t_dist = self.teacher_expl_strats[t_idx].action_dist_at(teacher(obss[t_idx]))

                l = self.teacher_weights[t_idx] * self.criterion(t_dist.log_prob(s_act), s_dist.log_prob(s_act))
                loss += l
                losses.append([t_idx, l.item()])
            print(f"Epoch {epoch} Loss: {loss.item()}")
            loss.backward()
            self.optimizer.step()
Example #17
    def evaluate(self,
                 rollout: StepSequence,
                 hidden_states_name: str = 'hidden_states') -> to.Tensor:
        assert rollout.continuous
        assert rollout.data_format == 'torch'

        # The passed sample collection might contain multiple rollouts.
        # Note:
        # While we *could* try to convert this to a PackedSequence, allowing us to only call the network once, that
        # would require a lot of reshaping on the result. So let's not. If performance becomes an issue, revisit here.
        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                init_hs = self._unpack_hidden(ro[0][hidden_states_name])
            else:
                # Let the network pick the default hidden state
                init_hs = None

            # Reshape observations to match torch's rnn sequence protocol
            obs = ro.get_data_values('observations',
                                     True).unsqueeze(1).to(self.device)

            # Run them through the network
            output, _ = self.rnn_layers(obs, init_hs)

            # And through the output layer
            act = self.output_layer(output.squeeze(1))
            if self._output_nonlin is not None:
                act = self._output_nonlin(act)

            # Collect the actions
            act_list.append(act)

        return to.cat(act_list)
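
The reshaping convention used above (sequence length first, a batch dimension of one in the middle) in isolation, with a plain `torch.nn.GRU` standing in for `self.rnn_layers` and a linear layer for `self.output_layer`:

import torch as to

T, obs_dim, hidden_dim, act_dim = 20, 4, 8, 2
rnn_layers = to.nn.GRU(input_size=obs_dim, hidden_size=hidden_dim)
output_layer = to.nn.Linear(hidden_dim, act_dim)

obs = to.randn(T, obs_dim).unsqueeze(1)  # [T, 1, obs_dim]: torch's (seq_len, batch, input) layout
output, _ = rnn_layers(obs, None)        # None picks the default (zero) initial hidden state
act = output_layer(output.squeeze(1))    # back to [T, act_dim]
assert act.shape == (T, act_dim)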
Example #18
def _ps_run_one_reset_kwargs_segment(
    G,
    domain_param: dict,
    init_state: np.ndarray,
    len_segment: int,
    stop_on_done: bool,
    use_rec: bool,
    idx_r: int,
    cnt_step: int,
    eval: bool,
):
    """
    Sample one segment of a rollout with a given initial state (which originates from a target domain setup) and
    domain parameters, passed as a tuple for simplicity at the other end.
    """
    if not isinstance(domain_param, dict):
        raise pyrado.TypeErr(given=domain_param, expected_type=dict)
    if not isinstance(init_state, np.ndarray):
        raise pyrado.TypeErr(given=init_state, expected_type=np.ndarray)
    if not isinstance(len_segment, int):
        raise pyrado.TypeErr(given=len_segment, expected_type=int)

    # Set the init space of the simulation environment such that we can later set to arbitrary states that could have
    # occurred during the rollout. This is necessary since we are running the evaluation in segments.
    G.env.init_space = InfBoxSpace(shape=G.env.init_space.shape)

    if use_rec:
        # The policy reset of PlaybackPolicy is disabled, so set the playback indices here manually
        assert G.policy.no_reset
        G.policy.curr_rec = idx_r
        G.policy.curr_step = cnt_step

    ro = rollout(
        G.env,
        G.policy,
        eval=eval,
        reset_kwargs=dict(init_state=init_state, domain_param=domain_param),
        max_steps=len_segment,
        stop_on_done=stop_on_done,
    )

    # Pad if necessary
    StepSequence.pad(ro, len_segment)

    return ro
Example #19
def test_add_data(mock_data, data_format: str):
    rewards, states, observations, actions, hidden, policy_infos = mock_data

    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        policy_infos=policy_infos,
        hidden=hidden,
        data_format=data_format,
    )
    # Add a data field
    ro.add_data("return", discounted_value(ro, 0.9))
    assert hasattr(ro, "return")

    # Query new data field from steps
    assert abs(ro[2]["return"] - -86.675) < 0.01
Example #20
def test_additional_required(mock_data):
    # Require the states as additional field for this test
    StepSequence.required_fields = {"states"}

    rewards, states, observations, actions, hidden, policy_infos = mock_data

    with pytest.raises(Exception) as err:
        # This should fail since the required states field is missing
        _ = StepSequence(rewards=rewards,
                         observations=observations,
                         actions=actions)
    # Check the captured exception outside the with-block, otherwise the assert would never run
    assert err.value is not None

    ro = StepSequence(rewards=rewards,
                      observations=observations,
                      actions=actions,
                      states=states)
    assert len(ro) == 5
    assert (ro.rewards == np.array(rewards)).all()
Example #21
def test_namedtuple(data_format):
    hid_nt = [DummyNT(*it) for it in hidden]

    ro = StepSequence(rewards=rewards, hidden=hid_nt, data_format=data_format)

    assert isinstance(ro.hidden, DummyNT)

    for i, step in enumerate(ro):
        assert isinstance(step.hidden, DummyNT)
        assert (step.hidden.part1 == to_format(hid_nt[i].part1,
                                               data_format)).all()
Example #22
def test_basic_policy_evaluate_packed_padded_sequences(
        env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation against old implementation
    def old_evaluate(rollout: StepSequence,
                     hidden_states_name: str = "hidden_states") -> to.Tensor:
        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        policy.eval()

        # The passed sample collection might contain multiple rollouts.
        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = policy._unpack_hidden(ro[0][hidden_states_name])
            else:
                # Let the network pick the default hidden state
                hidden = None

            # Reshape observations to match PyTorch's RNN sequence protocol
            obs = ro.get_data_values("observations", True).unsqueeze(1)
            obs = obs.to(device=policy.device, dtype=to.get_default_dtype())

            # Pass the input through hidden RNN layers
            out, _ = policy.rnn_layers(obs, hidden)

            # And through the output layer
            act = policy.output_layer(out.squeeze(1))
            if policy.output_nonlin is not None:
                act = policy.output_nonlin(act)

            # Collect the actions
            act_list.append(act)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        policy.train()

        return to.cat(act_list)

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    act_old = old_evaluate(cat)
    act_new = policy.evaluate(cat)

    to.testing.assert_allclose(act_old, act_new)
Example #23
def test_convert(mock_data, other_format, tensor_type):
    rewards, states, observations, actions, hidden, policy_infos = mock_data

    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        policy_infos=policy_infos,
        hidden=hidden,
        data_format=other_format,
    )
    # convert
    if other_format == "numpy":
        ro.torch()
    elif other_format == "torch":
        ro.numpy()
    # Verify
    assert isinstance(ro.rewards, tensor_type)
    assert isinstance(ro.observations, tensor_type)
    assert isinstance(ro.actions, tensor_type)
    assert isinstance(ro.policy_infos["mean"], tensor_type)
    assert isinstance(ro.policy_infos["std"], tensor_type)
    assert isinstance(ro.hidden[0], tensor_type)

    # Done should always be a ndarray
    assert isinstance(ro.done, np.ndarray)
Example #24
def test_process(mock_data, data_format: str):
    rewards, states, observations, actions, hidden, policy_infos = mock_data

    # Create the rollout
    ro = StepSequence(rewards=rewards,
                      observations=observations,
                      states=states,
                      actions=actions,
                      hidden=hidden)

    if data_format == "numpy":
        # Create the filter (arbitrary values)
        b, a = signal.butter(N=5, Wn=10, fs=100)

        # Filter the signals, but not the time
        ro_proc = StepSequence.process_data(ro,
                                            signal.filtfilt,
                                            fcn_arg_name="x",
                                            exclude_fields=["time"],
                                            b=b,
                                            a=a,
                                            padlen=2,
                                            axis=0)

    else:
        # Transform to PyTorch data and define a simple function
        ro.torch()
        ro_proc = StepSequence.process_data(ro,
                                            lambda x: x * 2,
                                            fcn_arg_name="x",
                                            include_fields=["time"],
                                            fcn_arg_types=to.Tensor)

    assert isinstance(ro_proc, StepSequence)
    assert ro_proc.length == ro.length
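
The Butterworth filtering from the NumPy branch shown on a bare array, using the same `signal.butter`/`signal.filtfilt` calls with the same arbitrary filter values:

import numpy as np
from scipy import signal

# 5th-order low-pass Butterworth filter with a 10 Hz cutoff at 100 Hz sampling rate
b, a = signal.butter(N=5, Wn=10, fs=100)

x = np.random.randn(50, 3)  # 50 time steps, 3 signal dimensions
x_filt = signal.filtfilt(b, a, x, padlen=2, axis=0)
assert x_filt.shape == x.shape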
Example #25
def preprocess_rollout(rollout: StepSequence) -> StepSequence:
    """
    Extracts observations and actions from a `StepSequence` and packs them into a PyTorch tensor which can be fed
    through a network.

    :param rollout: a `StepSequence` instance containing a trajectory
    :return: a PyTorch tensor containing the trajectory
    """
    if not isinstance(rollout, StepSequence):
        raise pyrado.TypeErr(given=rollout, expected_type=StepSequence)

    # Convert data type
    rollout.torch(to.get_default_dtype())

    # Extract the data
    state = rollout.get_data_values("observations")[:-1]
    next_state = rollout.get_data_values("observations")[1::]
    action = rollout.get_data_values("actions").narrow(0, 0,
                                                       next_state.shape[0])

    rollout = to.cat((state, next_state, action), 1)
    return rollout
Example #26
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #27
    def __call__(self, dp_values: to.Tensor = None) -> Tuple[to.Tensor, StepSequence]:
        """
        Run one rollout in the target domain, and compute the features of the data used for sbi.

        :param dp_values: ignored, just here for the interface compatibility
        :return: features computed from the time series data, and the complete rollout
        """
        ro_real = None
        run_interactive_loop = True
        while run_interactive_loop:
            # Don't set the domain params here since they are set by the DomainRandWrapperBuffer to mimic the randomness
            ro_real = rollout(self._env, self._policy, eval=True, stop_on_done=self.stop_on_done)
            if not isinstance(self._env, RealEnv):
                run_interactive_loop = False
            else:
                # Ask whether the current rollout should be discarded and redone
                run_interactive_loop = input("Continue with the next rollout y / n? ").lower() == "n"

        # Pad if necessary
        StepSequence.pad(ro_real, self._env.max_steps)

        # Pre-processing
        ro_real.torch()
        self._set_action_field([ro_real])

        # Assemble the data
        data_real = to.cat([ro_real.states[:-1, :], ro_real.get_data_values(self._action_field)], dim=1)
        if self._embedding.requires_target_domain_data:
            data_real = to.cat([data_real, data_real], dim=1)

        # Compute the features
        data_real = data_real.unsqueeze(0)  # only one target domain rollout
        data_real = self._embedding(Embedding.pack(data_real))  # shape [1, dim_feat]

        # Check shape (here no batching and always one rollout)
        if data_real.shape[0] != 1 or data_real.ndim != 2:
            raise pyrado.ShapeErr(given=data_real, expected_match=(1, -1))

        return data_real, ro_real
Example #28
    def update(self, rollouts: Sequence[StepSequence]):
        r"""
        Train the particles $mu$.

        :param rollouts: rewards collected from the rollout
        """
        policy_grads = []
        parameters = []

        for i in range(self.num_particles):
            # Get the rollouts associated to the i-th particle
            concat_ros = StepSequence.concat(rollouts[i])
            concat_ros.torch()

            act_stats = compute_action_statistics(concat_ros,
                                                  self.expl_strats[i])
            act_stats_fixed = compute_action_statistics(
                concat_ros, self.fixed_expl_strats[i])

            klds = to.distributions.kl_divergence(act_stats.act_distr,
                                                  act_stats_fixed.act_distr)
            entropy = act_stats.act_distr.entropy()
            log_prob = act_stats.log_probs

            concat_ros.rewards = concat_ros.rewards - (
                0.1 * klds.mean(1)).view(-1) - 0.1 * entropy.mean(1).view(-1)

            # Update the advantage estimator's parameters and return advantage estimates
            adv = self.particles[i].critic.update(rollouts[i],
                                                  use_empirical_returns=True)

            # Estimate policy gradients
            self.optimizers[i].zero_grad()
            policy_grad = -to.mean(log_prob * adv.detach())
            policy_grad.backward()  # step comes later than usual

            # Collect flattened parameter and gradient vectors
            policy_grads.append(self.expl_strats[i].param_grad)
            parameters.append(self.expl_strats[i].param_values)

        parameters = to.stack(parameters)
        policy_grads = to.stack(policy_grads)
        Kxx, dx_Kxx = self.kernel(parameters)
        grad_theta = (to.mm(Kxx, policy_grads / self.temperature) +
                      dx_Kxx) / self.num_particles

        for i in range(self.num_particles):
            self.expl_strats[i].param_grad = grad_theta[i]
            self.optimizers[i].step()
        self.updatecount += 1
Example #29
def test_slice(sls, data_format):
    ro = StepSequence(rewards=rewards,
                      observations=observations,
                      actions=actions,
                      policy_infos=policy_infos,
                      hidden=hidden,
                      data_format=data_format)

    # Slice rollout
    sliced = ro[sls]
    # Slice reward list for verification
    sliced_rew = rewards[sls]

    for i, step in enumerate(sliced):
        assert step.reward == sliced_rew[i]
Example #30
    def evaluate(self,
                 rollout: StepSequence,
                 hidden_states_name: str = 'hidden_states') -> to.Tensor:
        """
        Re-evaluate the given rollout and return a derivable action tensor.
        The default implementation simply calls `forward()`.

        :param rollout: recorded, complete rollout
        :param hidden_states_name: name of hidden states rollout entry, used for recurrent networks.
                                   Defaults to 'hidden_states'. Change for value functions.
        :return: actions with gradient data
        """
        self.eval()
        return self(rollout.get_data_values(
            'observations', truncate_last=True))  # all observations at once