Example 1
def create_ik_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKActivationSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='product',
        positionTasks=True,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
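A minimal sketch of how this setup helper might be invoked from a script; the physics engine name, time step, horizon, and disturbance force below are illustrative assumptions, not values taken from the example:

if __name__ == "__main__":
    # Hypothetical arguments; pick the engine and horizon available in your installation
    ro = create_ik_activation_setup(dt=0.01,
                                    max_steps=1000,
                                    max_dist_force=None,
                                    physics_engine="Bullet")
    print(f"Return: {ro.undiscounted_return()}")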
Example 2
def joint_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.1,
            0.1,  # same as init config
            0.1 + 45. / 180. * math.pi * math.sin(2. * math.pi * 0.2 * t)
        ]  # oscillation in last link

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example 3
def create_manual_activation_setup(dt, max_steps, max_dist_force,
                                   physics_engine):
    # Set up environment
    env = Planar3LinkTASim(physicsEngine=physics_engine,
                           dt=dt,
                           max_steps=max_steps,
                           max_dist_force=max_dist_force,
                           positionTasks=True,
                           observeTaskSpaceDiscrepancy=True)
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        pot = np.fromstring(input("Enter potentials for next step: "),
                            dtype=np.double,
                            count=3,
                            sep=' ')
        return 1 / (1 + np.exp(-pot))

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
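The manual-activation policy above reads three potentials from stdin and squashes them with a logistic function, so arbitrary real-valued inputs are mapped to activations in the open interval (0, 1). A small self-contained sketch of that mapping with purely illustrative values:

import numpy as np

pot = np.array([-2.0, 0.0, 2.0])  # raw potentials as they could be typed in
act = 1 / (1 + np.exp(-pot))      # logistic squashing into (0, 1)
print(act)                        # approx. [0.119, 0.5, 0.881]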
Example 4
def create_joint_control_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod="sum",
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            10 / 180 * math.pi,
            10 / 180 * math.pi,  # same as init config
            10 / 180 * math.pi +
            45.0 / 180.0 * math.pi * math.sin(2.0 * math.pi * 0.2 * t),
        ]  # oscillation in last link

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example 5
    def _compute_candidate(self, nc: int):
        """
        Train and save one candidate solution to a pt-file

        :param nc: number of domains used for training the candidate solution
        """
        # Do a warm start if desired
        self._subrtn_cand.init_modules(
            self.warmstart_cand,
            prefix=f"iter_{self._curr_iter - 1}",
            suffix="cand",
            policy_param_init=self.cand_policy_param_init,
            valuefcn_param_init=self.cand_critic_param_init,
        )

        # Sample sets of physics params xi_{1}, ..., xi_{nc}
        self.env_dr.fill_buffer(nc)
        env_params_cand = self.env_dr.randomizer.get_params()
        joblib.dump(
            env_params_cand,
            osp.join(self.save_dir,
                     f"iter_{self._curr_iter}_env_params_cand.pkl"))
        print("Randomized parameters of for the candidate solution:")
        print_domain_params(env_params_cand)

        # Reset the subroutine algorithm which includes resetting the exploration
        self._cnt_samples += self._subrtn_cand.sample_count
        self._subrtn_cand.reset()
        print("Reset candidate exploration noise.")

        pol_param_before = self._subrtn_cand.policy.param_values.clone()
        if isinstance(self._subrtn_cand, ActorCritic):
            # Set dropout and batch normalization layers to training mode
            self._subrtn_cand.critic.vfcn.train()
            critic_param_before = self._subrtn_cand.critic.vfcn.param_values.clone(
            )

        # Solve the (approx) stochastic program SP_nc for the sampled physics parameter sets
        print_cbt(f"\nIteration {self._curr_iter} | Candidate solution\n",
                  "c",
                  bright=True)
        self._subrtn_cand.train(snapshot_mode="best",
                                meta_info=dict(
                                    prefix=f"iter_{self._curr_iter}",
                                    suffix="cand"))

        if (self._subrtn_cand.policy.param_values == pol_param_before).all():
            warn(
                "The candidate's policy parameters did not change during training!",
                UserWarning)
        if isinstance(self._subrtn_cand, ActorCritic):
            if (self._subrtn_cand.critic.vfcn.param_values ==
                    critic_param_before).all():
                warn(
                    "The candidate's critic parameters did not change during training!",
                    UserWarning)

        print_cbt("Learned an approx solution for SP_nc.\n", "y")
Example 6
def create_setup(physics_engine, dt, max_steps, max_dist_force):
    # Set up environment
    env = BallOnPlate5DSim(physicsEngine=physics_engine,
                           dt=dt,
                           max_steps=max_steps,
                           max_dist_force=max_dist_force)
    env = ActNormWrapper(env)
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.0,  # x_ddot_plate
            0.5 * math.sin(2. * math.pi * 5 * t),  # y_ddot_plate
            5. * math.cos(2. * math.pi / 5. * t),  # z_ddot_plate
            0.0,  # alpha_ddot_plate
            0.0,  # beta_ddot_plate
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    return env, policy
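Unlike the other setup helpers, this one returns the environment and policy instead of rolling out directly. A sketch of how the returned pair might be simulated, reusing the rollout and RenderMode imports shown in the Quanser Qube script further below; the argument values are assumptions:

from pyrado.sampling.rollout import rollout
from pyrado.utils.data_types import RenderMode

env, policy = create_setup(physics_engine="Bullet",
                           dt=0.01,
                           max_steps=800,
                           max_dist_force=None)
ro = rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
print(f"Return: {ro.undiscounted_return()}")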
Example 7
def ik_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 1.1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example 8
    def __init__(self):
        ShowBase.__init__(self)
        self.done = False
        self.state = None
        self.param = None

        self.ro = rollout(
            env,
            policy,
            render_mode=RenderMode(text=args.verbose,
                                   video=args.animation),
            eval=True,
            max_steps=max_steps,
            stop_on_done=not args.relentless,
            reset_kwargs=dict(domain_param=self.param,
                              init_state=self.state),
        )
        print_domain_params(env.domain_param)
        print_cbt(f"Return: {self.ro.undiscounted_return()}",
                  "g",
                  bright=True)
        self.done, self.state, self.param = after_rollout_query(
            env, policy, self.ro)

        self.bob = BallOnBeamSim(2)
        self.pos, self.r_ball, self.a, self.l_beam, self.d_beam = self.bob._init_anim()

        self.ball = self.loader.loadModel("my_models/ball")
        self.ball.reparentTo(self.render)
        self.ball.setPos(self.pos)
        self.box = self.loader.loadModel("my_models/box")
        self.box.reparentTo(self.render)
        self.box.setPos(0, 0, 0)
        self.box.setScale(self.l_beam, self.d_beam, 2 * self.d_beam)
        self.camera.setPos(0, -10, 0)
Example 9
def sim_policy_fixed_env(env: SimEnv, policy: Policy,
                         domain_param: [dict, list]):
    """
    Simulate (with animation) a rollout in an environment with fixed domain parameters.

    :param env: environment stack as it was used during training
    :param policy: policy to simulate
    :param domain_param: domain parameter set or a list of sets that specify the environment
    """
    # Remove wrappers that make the rollouts stochastic
    env = remove_env(env, GaussianObsNoiseWrapper)
    env = remove_env(env, DomainRandWrapperBuffer)
    env = remove_env(env, DomainRandWrapperLive)

    # Initialize
    done, state, i = False, None, 0
    if isinstance(domain_param, dict):
        param = domain_param
    elif isinstance(domain_param, list):
        param = domain_param[i]
    else:
        raise pyrado.TypeErr(given=domain_param, expected_type=[dict, list])

    while not done:
        ro = rollout(env,
                     policy,
                     reset_kwargs=dict(domain_param=param, init_state=state),
                     render_mode=RenderMode(video=True),
                     eval=True)
        print_domain_params(env.domain_param)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, _ = after_rollout_query(env, policy, ro)

        if isinstance(domain_param, list):
            # Iterate over the list of domain parameter sets
            i = (i + 1) % len(domain_param)
            param = domain_param[i]
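A sketch of how sim_policy_fixed_env could be driven with a list of fixed domain parameter sets; env and policy are assumed to come from a loaded experiment, and the parameter name ball_mass is hypothetical and depends on the environment:

fixed_params = [
    dict(ball_mass=0.3),  # hypothetical parameter name, purely illustrative
    dict(ball_mass=0.6),
]
sim_policy_fixed_env(env, policy, domain_param=fixed_params)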
Example 10
        cand = to.load(osp.join(ex_dir, found_cands[i])).numpy()
        ax.scatter(np.arange(cand.size), cand, label=r'$\phi_{' + str(i) + '}$', c=f'C{i%10}', s=16)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_ylabel('parameter value')
    ax.set_xlabel('parameter index')
    plt.legend()
    plt.show()

    # Simulate
    for i in range(len(found_policies)):
        # Load current
        policy = to.load(osp.join(ex_dir, found_policies[i]))
        cand = to.load(osp.join(ex_dir, found_cands[i]))

        # Set the domain randomizer given the hyper-parameters
        if isinstance(env_sim, MetaDomainRandWrapper):
            env_sim.adapt_randomizer(cand)
            print_cbt(f'Set the domain randomizer to\n{env_sim.randomizer}', 'c')
        else:
            raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)

        done, state, param = False, None, None
        while not done:
            print_cbt(f'Simulating {found_policies[i]} with associated domain parameter distribution.', 'g')
            ro = rollout(env_sim, policy, render_mode=RenderMode(video=True), eval=True,
                         reset_kwargs=dict(domain_param=param, init_state=state))  # calls env.reset()
            print_domain_params(env_sim.domain_param)
            print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
            done, state, param = after_rollout_query(env_sim, policy, ro)
    pyrado.close_vpython()
Example 11
"""
Test predefined energy-based controller to make the Quanser Qube swing up.
"""
import torch as to

from pyrado.environments.pysim.quanser_qube import QQubeSim
from pyrado.domain_randomization.utils import print_domain_params
from pyrado.policies.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import rollout, after_rollout_query
from pyrado.utils.data_types import RenderMode
from pyrado.utils.input_output import print_cbt


if __name__ == '__main__':
    # Set up environment
    env = QQubeSim(dt=1/500., max_steps=4000)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    done, param, state = False, None, None
    while not done:
        ro = rollout(env, policy, render_mode=RenderMode(text=False, video=True), eval=True,
                     reset_kwargs=dict(domain_param=param, init_state=state))
        print_domain_params(env.domain_param)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, param = after_rollout_query(env, policy, ro)
Example 12
    def _compute_references(self, nr: int, nG: int):
        """
        Train and save nG reference solutions to pt-files

        :param nr: number of domains used for training the reference solutions
        :param nG: number of reference solutions
        """
        # Loop to compute a distribution of optimality gaps via nG samples
        for k in range(nG):
            print_cbt(
                f'Iteration {self._curr_iter} | Reference solution {k + 1} of {nG}\n',
                'c',
                bright=True)
            if not self.warmstart_refs:
                # Create a new reference policy by re-initializing its parameters
                self._subrtn_cand.policy.init_param()

                # Create a new value function by re-initializing its parameters
                if isinstance(self._subrtn_refs, ActorCritic):
                    self._subrtn_refs.critic.value_fcn.init_param()

                print_cbt('Created a new reference solution.\n', 'y')

            else:
                # Continue from the candidate's policy of the current iteration
                self._subrtn_refs.policy.load_state_dict(
                    to.load(
                        osp.join(self._save_dir,
                                 f'iter_{self._curr_iter}_policy_cand.pt')).
                    state_dict())
                if not (self._subrtn_refs.policy.param_values
                        == self._subrtn_cand.policy.param_values).all():
                    warn(
                        "The reference policy's parameters are not equal to the candidate's after loading them!"
                        "This can be explained by snapshot_mode='best'",
                        UserWarning)

                # Continue from the candidate's value function of the current iteration
                if isinstance(self._subrtn_cand, ActorCritic) and isinstance(
                        self._subrtn_refs, ActorCritic):
                    self._subrtn_refs.critic.value_fcn.load_state_dict(
                        to.load(
                            osp.join(
                                self._save_dir,
                                f'iter_{self._curr_iter}_valuefcn_cand.pt')).
                        state_dict())

                print_cbt(
                    'Initialized the reference solution with the previously trained candidate solution.\n',
                    'y')

            # Sample new sets of physics params xi_{k,1}, ..., xi_{k,nr}
            self._env_dr.fill_buffer(nr)
            env_params_ref = self._env_dr.randomizer.get_params()
            joblib.dump(
                env_params_ref,
                osp.join(self._save_dir,
                         f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
            print(
                'Randomized parameters for the current reference solution:')
            print_domain_params(env_params_ref)

            # Reset the subroutine algorithm which includes resetting the exploration
            self._subrtn_refs.reset()
            print_cbt('Reset reference exploration noise.', 'y')

            if isinstance(self._subrtn_refs, ActorCritic):
                # Set dropout and batch normalization layers to training mode
                self._subrtn_refs.critic.value_fcn.train()
                critic_param_before = self._subrtn_refs.critic.value_fcn.param_values.clone(
                )

            # Solve the (approx) stochastic program SP_n for the sampled physics parameter sets
            pol_param_before = self._subrtn_refs.policy.param_values.clone()
            self._subrtn_refs.train(snapshot_mode='best',
                                    meta_info=dict(
                                        prefix=f'iter_{self._curr_iter}',
                                        suffix=f'ref_{k}'))

            if (self._subrtn_refs.policy.param_values == pol_param_before
                ).all():
                warn(
                    "The reference's policy parameters did not change during training!",
                    UserWarning)
            if isinstance(self._subrtn_refs, ActorCritic):
                if (self._subrtn_refs.critic.value_fcn.param_values ==
                        critic_param_before).all():
                    warn(
                        "The reference's critic parameters did not change during training!",
                        UserWarning)

            print_cbt('Learned an approx solution for SP_n.\n', 'y')
Example 13
    def _compute_candidate(self, nc: int):
        """
        Train and save one candidate solution to a pt-file

        :param nc: number of domains used for training the candidate solution
        """
        if self._curr_iter == 0 or not self.warmstart_cand:
            # Create a new candidate policy by re-initializing its parameters
            self._subrtn_cand.policy.init_param(self.cand_policy_param_init)

            # Create a new value function by re-initializing its parameters
            if isinstance(self._subrtn_cand, ActorCritic):
                self._subrtn_cand.critic.value_fcn.init_param(
                    self.cand_critic_param_init)

            print_cbt('Created a new candidate solution.\n', 'y')

        elif self._curr_iter > 0 and self.warmstart_cand:
            # Continue from the candidate's policy of the previous iteration
            self._subrtn_cand.policy.load_state_dict(
                to.load(
                    osp.join(self._save_dir,
                             f'iter_{self._curr_iter - 1}_policy_cand.pt')).
                state_dict())

            # Continue from the candidate's value function of the previous iteration
            if isinstance(self._subrtn_cand, ActorCritic):
                self._subrtn_cand.critic.value_fcn.load_state_dict(
                    to.load(
                        osp.join(
                            self._save_dir,
                            f'iter_{self._curr_iter - 1}_valuefcn_cand.pt')).
                    state_dict())

            print_cbt(
                'Initialized the candidate solution with the previously trained candidate.\n',
                'y')

        else:
            raise pyrado.ValueErr(
                msg=
                'Faulty joint configuration of curr_iter and warmstart_cand!')

        # Sample sets of physics params xi_{1}, ..., xi_{nc}
        self._env_dr.fill_buffer(nc)
        env_params_cand = self._env_dr.randomizer.get_params()
        joblib.dump(
            env_params_cand,
            osp.join(self._save_dir,
                     f'iter_{self._curr_iter}_env_params_cand.pkl'))
        print('Randomized parameters for the candidate solution:')
        print_domain_params(env_params_cand)

        # Reset the subroutine algorithm which includes resetting the exploration
        self._subrtn_cand.reset()
        print('Reset candidate exploration noise.')

        if isinstance(self._subrtn_cand, ActorCritic):
            # Set dropout and batch normalization layers to training mode
            self._subrtn_cand.critic.value_fcn.train()
            critic_param_before = self._subrtn_cand.critic.value_fcn.param_values.clone(
            )

        # Solve the (approx) stochastic program SP_nc for the sampled physics parameter sets
        print_cbt(f'\nIteration {self._curr_iter} | Candidate solution\n',
                  'c',
                  bright=True)
        pol_param_before = self._subrtn_cand.policy.param_values.clone()
        self._subrtn_cand.train(snapshot_mode='best',
                                meta_info=dict(
                                    prefix=f'iter_{self._curr_iter}',
                                    suffix='cand'))

        if (self._subrtn_cand.policy.param_values == pol_param_before).all():
            warn(
                "The candidate's policy parameters did not change during training!",
                UserWarning)
        if isinstance(self._subrtn_cand, ActorCritic):
            if (self._subrtn_cand.critic.value_fcn.param_values ==
                    critic_param_before).all():
                warn(
                    "The candidate's critic parameters did not change during training!",
                    UserWarning)

        print_cbt('Learned an approx solution for SP_nc.\n', 'y')
Example 14
    def _compute_references(self, nr: int, nG: int):
        """
        Train and save nG reference solutions to pt-files

        :param nr: number of domains used for training the reference solutions
        :param nG: number of reference solutions
        """
        # Loop to compute a distribution of optimality gaps via nG samples
        for k in range(nG):
            print_cbt(
                f'Iteration {self._curr_iter} | Reference solution {k + 1} of {nG}\n',
                'c',
                bright=True)

            # Do a warm start if desired
            self._subrtn_refs.init_modules(
                self.warmstart_refs,
                prefix=f'iter_{self._curr_iter}',
                suffix='cand',
                policy_param_init=self.cand_policy_param_init,
                valuefcn_param_init=self.cand_critic_param_init)

            # Sample new sets of physics params xi_{k,1}, ..., xi_{k,nr}
            self.env_dr.fill_buffer(nr)
            env_params_ref = self.env_dr.randomizer.get_params()
            joblib.dump(
                env_params_ref,
                osp.join(self.save_dir,
                         f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
            print(
                'Randomized parameters for the current reference solution:')
            print_domain_params(env_params_ref)

            # Reset the subroutine algorithm which includes resetting the exploration
            self._cnt_samples += self._subrtn_refs.sample_count
            self._subrtn_refs.reset()
            print_cbt('Reset reference exploration noise.', 'y')

            pol_param_before = self._subrtn_refs.policy.param_values.clone()
            if isinstance(self._subrtn_refs, ActorCritic):
                # Set dropout and batch normalization layers to training mode
                self._subrtn_refs.critic.vfcn.train()
                critic_param_before = self._subrtn_refs.critic.vfcn.param_values.clone(
                )

            # Solve the (approx) stochastic program SP_n for the sampled physics parameter sets
            self._subrtn_refs.train(snapshot_mode='best',
                                    meta_info=dict(
                                        prefix=f'iter_{self._curr_iter}',
                                        suffix=f'ref_{k}'))

            if (self._subrtn_refs.policy.param_values == pol_param_before
                ).all():
                warn(
                    "The reference's policy parameters did not change during training!",
                    UserWarning)
            if isinstance(self._subrtn_refs, ActorCritic):
                if (self._subrtn_refs.critic.vfcn.param_values ==
                        critic_param_before).all():
                    warn(
                        "The reference's critic parameters did not change during training!",
                        UserWarning)

            print_cbt('Learned an approx solution for SP_n.\n', 'y')