Example #1
def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
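A minimal sketch of how this factory might be consumed, e.g. to sanity-check the behavioral policy on the perturbed target domain (the `rollout` call mirrors the other examples):

    env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy = create_qqsu_setup()
    ro = rollout(env_real, behavior_policy, eval=True)
    print(ro.undiscounted_return())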
Example #2
def test_velocity_filter(plot: bool):
    # Set up environment
    env_gt = QQubeSwingUpSim(dt=1 / 500.0, max_steps=350)
    env_gt.init_space = SingularStateSpace(np.array([0.1, np.pi / 2, 3.0, 0]))
    env_filt = ObsVelFiltWrapper(env_gt,
                                 idcs_pos=["theta", "alpha"],
                                 idcs_vel=["theta_dot", "alpha_dot"])

    # Set up policy
    policy = IdlePolicy(env_gt.spec)

    # Simulate
    ro_gt = rollout(env_gt, policy)
    ro_filt = rollout(env_filt, policy)

    # Extract the angular velocity observations from both rollouts
    theta_dot_gt = ro_gt.observations[:, 4]
    alpha_dot_gt = ro_gt.observations[:, 5]
    theta_dot_filt = ro_filt.observations[:, 4]
    alpha_dot_filt = ro_filt.observations[:, 5]

    assert theta_dot_filt[0] != pytest.approx(
        theta_dot_gt[0])  # can't be equal since we set an init vel of 3 rad/s
    assert alpha_dot_filt[0] == pytest.approx(alpha_dot_gt[0], abs=1e-4)

    # Compute the error
    rmse_theta = rmse(theta_dot_gt, theta_dot_filt)
    rmse_alpha = rmse(alpha_dot_gt, alpha_dot_filt)

    if plot:
        from matplotlib import pyplot as plt

        # Plot the filtered signals versus the original observations
        plt.rc("text", usetex=True)
        fig, axs = plt.subplots(2, figsize=(16, 9))
        axs[0].plot(theta_dot_gt, label=r"$\dot{\theta}_{true}$")
        axs[0].plot(theta_dot_filt, label=r"$\dot{\theta}_{filt}$")
        axs[1].plot(alpha_dot_gt, label=r"$\dot{\alpha}_{true}$")
        axs[1].plot(alpha_dot_filt, label=r"$\dot{\alpha}_{filt}$")

        axs[0].set_title(rf"RMSE($\theta$): {rmse_theta}")
        axs[0].set_ylabel(r"$\dot{\theta}$ [rad/s]")
        axs[0].legend()
        axs[1].set_title(rf"RMSE($\alpha$): {rmse_alpha}")
        axs[1].set_xlabel("time steps")
        axs[1].set_ylabel(r"$\dot{\alpha}$ [rad/s]")
        axs[1].legend()
        plt.show()
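The `rmse` helper used above isn't shown; a minimal sketch of what it computes, assuming plain NumPy arrays:

    def rmse(x, y):
        # Root-mean-square error between two equally long signals
        return np.sqrt(np.mean((np.asarray(x) - np.asarray(y)) ** 2))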
    def __init__(self,
                 env_spec: EnvSpec,
                 ref_energy: float,
                 energy_gain: float,
                 th_gain: float,
                 acc_max: float,
                 use_cuda: bool = False):
        """
        Constructor

        :param env_spec: environment specification
        :param ref_energy: reference energy level [J]
        :param energy_gain: P-gain on the energy [m/s/J]
        :param th_gain: P-gain on angle theta
        :param acc_max: maximum linear acceleration of the pendulum pivot [m/s**2]
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        super().__init__(env_spec, use_cuda)

        # Initialize parameters
        self._log_E_ref = nn.Parameter(to.log(to.tensor(ref_energy)), requires_grad=True)
        self._log_E_gain = nn.Parameter(to.log(to.tensor(energy_gain)), requires_grad=True)
        self._th_gain = nn.Parameter(to.tensor(th_gain), requires_grad=True)
        self.acc_max = to.tensor(acc_max)
        self.dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
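Storing the energy terms as log-parameters keeps them strictly positive under gradient updates; wherever the controller needs the physical values, it would recover them by exponentiation, along these lines (a sketch, not the library's forward pass):

    E_ref = to.exp(self._log_E_ref)  # reference energy [J], always > 0
    E_gain = to.exp(self._log_E_gain)  # energy P-gain [m/s/J], always > 0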
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling',
                                                [True, False]),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'),
                 env,
                 policy,
                 **algo_hparam,
                 logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
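Per the docstring's note, the caller would bind the extra arguments with `functools.partial` before handing the objective to Optuna; a sketch, assuming `study_dir` and `seed` are defined by the surrounding script (trial count and n_jobs are placeholders):

    import functools
    import optuna

    study = optuna.create_study(direction='maximize')
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=seed),
                   n_trials=100, n_jobs=16)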
Example #5
def create_qq_setup(factor, dt, max_steps, render_mode):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=render_mode,
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds
    time_500Hz = np.linspace(0, len(act_500Hz) * dt, len(act_500Hz))
    time_100Hz = np.linspace(0, len(act_100Hz) * dt * factor, len(act_100Hz))
    time_500Hz_w = np.linspace(0, len(act_500Hz_w) * dt, len(act_500Hz_w))

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(time_500Hz, act_500Hz, label="500 Hz (original)")
    ax.plot(time_100Hz, act_100Hz, label="100 Hz", ls="--")
    ax.plot(time_500Hz_w, act_500Hz_w, label="500 Hz (wrapped)", ls="--")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time [s]")
Example #6
def create_qq_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions
    act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_wrapped = ro.actions

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(act_500Hz, label="500 Hz (original)")
    ax.plot(act_100Hz_zoh, label="100 Hz (zoh)")
    ax.plot(act_500Hz_wrapped, label="500 Hz (wrapped)")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time steps")
    plt.show()
Example #7
    def __init__(
        self,
        env_spec: EnvSpec,
        ref_energy: float,
        energy_gain: float,
        th_gain: float,
        acc_max: float,
        reset_domain_param: bool = True,
        use_cuda: bool = False,
    ):
        """
        Constructor

        :param env_spec: environment specification
        :param ref_energy: reference energy level [J]
        :param energy_gain: P-gain on the energy [m/s/J]
        :param th_gain: P-gain on angle theta
        :param acc_max: maximum linear acceleration of the pendulum pivot [m/s**2]
        :param reset_domain_param: if `True`, the domain parameters are reset if they are present as an entry in the
                                   kwargs passed to `reset()`; if `False`, they are ignored
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        super().__init__(env_spec, use_cuda)

        # Initial parameters
        self._log_E_ref_init = to.log(to.tensor(ref_energy))
        self._log_E_gain_init = to.log(to.tensor(energy_gain))
        self._th_gain_init = to.tensor(th_gain)

        # Define parameters
        self._log_E_ref = nn.Parameter(to.empty_like(self._log_E_ref_init),
                                       requires_grad=True)
        self._log_E_gain = nn.Parameter(to.empty_like(self._log_E_gain_init),
                                        requires_grad=True)
        self._th_gain = nn.Parameter(to.empty_like(self._th_gain_init),
                                     requires_grad=True)

        self.acc_max = to.tensor(acc_max)
        self._domain_param = QQubeSwingUpSim.get_nominal_domain_param()
        self._reset_domain_param = reset_domain_param

        # Default initialization
        self.init_param(None)
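A plausible reading of the default initialization invoked above; this is a sketch, not the library's actual code. It copies the stored initial values into the trainable parameters created with `to.empty_like()`:

    def init_param(self, init_values=None, **kwargs):
        # Sketch only: restore the constructor's initial values by default
        # (handling of explicit init_values omitted)
        if init_values is None:
            with to.no_grad():
                self._log_E_ref.copy_(self._log_E_ref_init)
                self._log_E_gain.copy_(self._log_E_gain_init)
                self._th_gain.copy_(self._th_gain_init)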
Example #8
def default_qqsu():
    return QQubeSwingUpSim(dt=0.004, max_steps=4000)
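The factory can then be used like any other simulator in this collection, for instance:

    env = default_qqsu()
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)
    ro = rollout(env, policy, render_mode=RenderMode(video=True), eval=True)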
Example #9
import pyrado
from pyrado.environments.mujoco.quanser_qube import QQubeStabMjSim, QQubeSwingUpMjSim
from pyrado.environments.pysim.quanser_qube import QQubeStabSim, QQubeSwingUpSim
from pyrado.policies.special.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import after_rollout_query, rollout
from pyrado.utils.argparser import get_argparser
from pyrado.utils.data_types import RenderMode
from pyrado.utils.input_output import print_cbt

if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    dt = 1 / 500.0
    max_steps = 3500
    if args.env_name == "qq-su":
        env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-mj-su":
        env = QQubeSwingUpMjSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-st":
        env = QQubeStabSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-mj-st":
        env = QQubeStabMjSim(dt=dt, max_steps=max_steps)
    else:
        raise pyrado.ValueErr(
            given_name="--env_name",
            given=args.env_name,
            eq_constraint="'qq-su', 'qq-mj-su', 'qq-st', or 'qq-mj-st'",
        )
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
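The snippet ends at the simulation comment; given the imports, the continuation is presumably pyrado's usual interactive rollout loop, roughly like the following (the render flags here are assumptions):

    done, state, param = False, None, None
    while not done:
        ro = rollout(
            env,
            policy,
            render_mode=RenderMode(text=False, video=True),
            eval=True,
            reset_kwargs=dict(domain_param=param, init_state=state),
        )
        print_cbt(f"Return: {ro.undiscounted_return()}", "g", bright=True)
        done, state, param = after_rollout_query(env, policy, ro)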
Example #10
    parser.add_argument("--train_teachers", action="store_true", default=False)
    parser.add_argument("--num_teachers", type=int, default=2)
    parser.add_argument("--max_iter", type=int, default=500)
    parser.add_argument("--num_epochs", type=int, default=10)

    # Parse command line arguments
    args = parser.parse_args()

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)
    use_cuda = args.device == "cuda"
    descr = f"_{args.max_steps}st_{args.freq}Hz"

    # Environment
    env_hparams = dict(dt=1 / args.freq, max_steps=args.max_steps)
    env_real = QQubeSwingUpSim(**env_hparams)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{PDDR.name}_{QQubeSwingUpAndBalanceCtrl.name}{descr}")

    if args.train_teachers:
        # Teacher policy
        teacher_policy_hparam = dict(hidden_sizes=[64, 64],
                                     hidden_nonlin=to.relu,
                                     output_nonlin=to.tanh,
                                     use_cuda=use_cuda)
        teacher_policy = FNNPolicy(spec=env_real.spec, **teacher_policy_hparam)

        # Reduce the weights of the last layer, as recommended by the paper
        for p in teacher_policy.net.output_layer.parameters():
            with to.no_grad():
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{BayRn.name}-{PoWER.name}_{QQubeSwingUpAndBalanceCtrl.name}",
        f"sim2sim_rand-mass_pend_pole-mass_rot_pole_seed-{args.seed}",
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = DomainRandWrapperLive(env_sim,
                                    create_zero_var_randomizer(env_sim))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSwingUpSim(**env_sim_hparams)
    env_real.domain_param = dict(
        mass_pend_pole=0.024 * 1.1,
        mass_rot_pole=0.095 * 1.1,
    )
    env_real_hparams = env_sim_hparams
    env_real = wrap_like_other_env(env_real, env_sim)

    # PoWER and energy-based controller setup
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.0)
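The snippet is cut off after the hyper-parameters; a sibling script later in this collection passes the same dict straight to the controller, so the continuation is presumably:

    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)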
Example #12
"""
Test predefined energy-based swing-up controller on the Quanser Qube with observation noise.
"""
from scipy.ndimage import gaussian_filter1d
from matplotlib import pyplot as plt

from pyrado.environment_wrappers.observation_noise import GaussianObsNoiseWrapper
from pyrado.environments.pysim.quanser_qube import QQubeSwingUpSim
from pyrado.policies.special.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import rollout
from pyrado.utils.data_types import RenderMode

if __name__ == '__main__':
    plt.rc('text', usetex=True)

    # Set up environment
    env = QQubeSwingUpSim(dt=1 / 500., max_steps=3500)
    env = GaussianObsNoiseWrapper(
        env, noise_std=[0., 0., 0., 0., 2., 0.])  # noise only on theta_dot [rad/s]

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(env,
                 policy,
                 render_mode=RenderMode(text=False, video=False),
                 eval=True)

    # Filter the observations of the last rollout
    theta_dot = ro.observations[:, 4]
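The snippet stops after extracting theta_dot; since `gaussian_filter1d` is imported but not yet used, the filtering and comparison presumably follow, e.g. (the filter width is an ad-hoc assumption):

    theta_dot_filt = gaussian_filter1d(theta_dot, sigma=5)

    plt.plot(theta_dot, label=r'$\dot{\theta}$ (noisy)')
    plt.plot(theta_dot_filt, label=r'$\dot{\theta}$ (filtered)')
    plt.xlabel('time steps')
    plt.ylabel(r'$\dot{\theta}$ [rad/s]')
    plt.legend()
    plt.show()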
Example #13
from pyrado.utils.data_types import RenderMode
from pyrado.utils.input_output import print_cbt

if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()
    dt = args.dt if args.dt is not None else 0.01

    if args.env_name == QCartPoleSwingUpSim.name:
        env = QCartPoleSwingUpSim(dt=dt,
                                  max_steps=int(5 / dt),
                                  wild_init=False)
        state = np.array([0, 87 / 180 * np.pi, 0, 0])

    elif args.env_name == QQubeSwingUpSim.name:
        env = QQubeSwingUpSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([5 / 180 * np.pi, 87 / 180 * np.pi, 0, 0])

    elif args.env_name == QBallBalancerSim.name:
        env = QBallBalancerSim(dt=dt, max_steps=int(5 / dt))
        state = np.array(
            [2 / 180 * np.pi, 2 / 180 * np.pi, 0.1, -0.08, 0, 0, 0, 0])

    elif args.env_name == OneMassOscillatorSim.name:
        env = OneMassOscillatorSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([-0.7, 0])

    elif args.env_name == PendulumSim.name:
        env = PendulumSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([87 / 180 * np.pi, 0])
Example #14
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1,
                                     min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
        args.max_steps = 600
        print_cbt(f'Set maximum number of time steps to {args.max_steps}', 'y')

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir
    dirs = [tmp[0] for tmp in os.walk(ex_dir)][1:]
    num_policies = len(dirs)
    print(f'Found {num_policies} policies.')

    # Specify domain parameters
    param_names = ['Dp', 'Dr', 'Mp', 'Mr', 'Lp', 'Lr']
    num_param = len(param_names)
    num_samples = 10

    # Create one-dim evaluation grid for multiple parameters
    nom_params = QQubeSwingUpSim.get_nominal_domain_param()
    param_values = dict(
        Dp=np.logspace(-8, -4, num_samples),
        Dr=np.logspace(-8, -4, num_samples),
        Mp=np.linspace(0.6 * nom_params['Mp'], 1.5 * nom_params['Mp'],
                       num_samples),
        Mr=np.linspace(0.6 * nom_params['Mr'], 1.5 * nom_params['Mr'],
                       num_samples),
        Lp=np.linspace(0.6 * nom_params['Lp'], 1.5 * nom_params['Lp'],
                       num_samples),
        Lr=np.linspace(0.6 * nom_params['Lr'], 1.5 * nom_params['Lr'],
                       num_samples),
    )

    # Set up the environment
    env = ActNormWrapper(QQubeSwingUpSim(dt=1 / 100.,
                                         max_steps=args.max_steps))
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init',
                                                     1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example #17
        dt = args.dt
    else:
        raise pyrado.ValueErr(
            msg="There was no time field in the loaded rollout to infer the time step size from, "
            "nor has it been specified explicitly! Please provide the time step size using --dt."
        )

    if env_name == QBallBalancerSim.name:
        env = QBallBalancerSim(dt=dt)

    elif env_name == QCartPoleSwingUpSim.name:
        env = QCartPoleSwingUpSim(dt=dt)

    elif env_name == QQubeSwingUpSim.name:
        env = QQubeSwingUpSim(dt=dt)

    elif env_name == "wam-bic":  # avoid loading mujoco
        from pyrado.environments.mujoco.wam_bic import WAMBallInCupSim

        env = WAMBallInCupSim(num_dof=4)
        env.init_space = BoxSpace(-pyrado.inf,
                                  pyrado.inf,
                                  shape=env.init_space.shape)

    elif env_name == "wam-jsc":  # avoid loading mujoco
        from pyrado.environments.mujoco.wam_jsc import WAMJointSpaceCtrlSim

        env = WAMJointSpaceCtrlSim(num_dof=7)
        env.init_space = BoxSpace(-pyrado.inf,
                                  pyrado.inf,
                                  shape=env.init_space.shape)
if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(QQubeSwingUpSim.name,
                              f'{BayRn.name}-{PoWER.name}_{QQubeSwingUpAndBalanceCtrl.name}_sim2sim',
                              f'rand-Mp-Mr_seed-{args.seed}')

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1/100., max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSwingUpSim(**env_sim_hparams)
    env_real.domain_param = dict(
        Mp=0.024*1.1,
        Mr=0.095*1.1,
    )
    env_real_hparams = env_sim_hparams
    env_real = wrap_like_other_env(env_real, env_sim)

    # PoWER + energy-based controller setup
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
        # param_spec['m_pole'] = np.linspace(0.127*0.7, 0.127*1.3, num=11, endpoint=True)
        # param_spec['l_pole'] = np.linspace(0.641/2*0.7, 0.641/2*1.3, num=11, endpoint=True)

        # Get the experiments' directories to load from
        prefixes = [
            osp.join(pyrado.EXP_DIR, 'ENV_NAME', 'ALGO_NAME'),
        ]
        ex_names = [
            '',
        ]
        ex_labels = [
            '',
        ]

    elif args.env_name == QQubeSwingUpSim.name:
        env = QQubeSwingUpSim(dt=args.dt, max_steps=args.max_steps)

        # param_spec['g'] = np.linspace(9.81*0.7, 9.81*1.3, num=11, endpoint=True)
        # param_spec['Rm'] = np.linspace(8.4*0.7, 8.4*1.3, num=11, endpoint=True)
        # param_spec['km'] = np.linspace(0.042*0.7, 0.042*1.3, num=11, endpoint=True)
        # param_spec['Mr'] = np.linspace(0.095*0.7, 0.095*1.3, num=11, endpoint=True)
        # param_spec['Lr'] = np.linspace(0.085*0.7, 0.085*1.3, num=11, endpoint=True)
        # param_spec['Dr'] = np.linspace(5e-6*0.2, 5e-6*5, num=11, endpoint=True)  # 5e-6
        # param_spec['Mp'] = np.linspace(0.024*0.7, 0.024*1.3, num=11, endpoint=True)
        # param_spec['Lp'] = np.linspace(0.129*0.7, 0.129*1.3, num=11, endpoint=True)
        # param_spec['Dp'] = np.linspace(1e-6*0.2, 1e-6*5, num=11, endpoint=True)  # 1e-6

        # Get the experiments' directories to load from
        prefixes = [
            osp.join(pyrado.EXP_DIR, 'ENV_NAME', 'ALGO_NAME'),
        ]
Example #20
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{BayRn.name}-{PPO.name}_{FNNPolicy.name}",
        "rand-mass_pend_pole-mass_rot_pole-length_pend_pole-length_rot_pole_lower-std",
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = ActNormWrapper(env_sim)
    env_sim = DomainRandWrapperLive(env_sim,
                                    create_zero_var_randomizer(env_sim))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real_hparams = dict(dt=1 / 500.0, max_steps=3000)
    env_real = QQubeSwingUpReal(**env_real_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    policy = FNNPolicy(spec=env_sim.spec, **policy_hparam)

    # Critic
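The snippet breaks off at the critic; other scripts in this collection build it from an FNN value function and GAE, so a matching continuation would be (the hyper-parameter values are borrowed from the PPO setups elsewhere in this listing and are assumptions here):

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.0,
    )
    critic = GAE(vfcn, **critic_hparam)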
Example #21
        )
        t_end = 5  # s
    else:
        ex_dir = setup_experiment(
            QQubeSwingUpSim.name,
            f"{NPDR.name}_{TimePolicy.name}",
            num_segs_str + len_seg_str + seed_str,
        )
        t_end = 10  # s

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 250.0, max_steps=int(t_end * 250))
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    # env_sim = ActDelayWrapper(env_sim)

    # Create the ground truth target domain and the behavioral policy
    if ectl:
        env_real = osp.join(pyrado.EVAL_DIR,
                            f"qq-su_ectrl_250Hz_{t_end}s")  # 5s long
        policy = QQubeSwingUpAndBalanceCtrl(
            env_sim.spec
        )  # replaced by the recorded actions if use_rec_act=True
    else:
        env_real = osp.join(pyrado.EVAL_DIR,
                            f"qq_chrip_10to0Hz_+1.5V_250Hz_{t_end}s")
        assert use_rec_act
        policy = DummyPolicy(env_sim.spec)  # replaced by recorded real actions
Example #22
    parser.add_argument("--cov_only", action="store_true")
    args = parser.parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{PPO.name}_{FNNPolicy.name}",
        f"{args.frequency}Hz_{args.max_steps}ROLen_{args.ppo_iterations}PPOIter_{args.sprl_iterations}SPRLIter_cov_only{args.cov_only}_seed_{args.seed}",
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1 / float(args.frequency), max_steps=args.max_steps)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)  # FNN
    # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)  # FNN
    # vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
Example #23
from pyrado.utils.functions import skyline
from pyrado.utils.input_output import print_cbt

if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()
    dt = args.dt or 1 / 500.0
    t_end = 5.5  # s
    max_steps = int(t_end / dt)  # run for t_end seconds
    check_in_sim = False
    # max_amp = 5.0 / 180 * np.pi  # max. amplitude [rad]
    max_amp = -3.5  # max. amplitude [V]

    # Create the simulated and real environments
    if args.env_name == QQubeSwingUpReal.name:
        env_sim = QQubeSwingUpSim(dt, max_steps)
        env_real = QQubeSwingUpReal(dt, max_steps)
    elif args.env_name == QCartPoleSwingUpReal.name:
        env_sim = QCartPoleSwingUpSim(dt, max_steps)
        env_real = QCartPoleSwingUpReal(dt, max_steps)
    elif args.env_name == WAMReal.name:
        env_sim = WAMJointSpaceCtrlSim(frame_skip=4,
                                       num_dof=7,
                                       max_steps=max_steps)
        env_real = WAMJointSpaceCtrlRealStepBased(num_dof=7,
                                                  max_steps=max_steps)
    else:
        raise pyrado.ValueErr(
            given=args.env_name,
            eq_constraint=
            f"{QQubeSwingUpReal.name}, {QCartPoleSwingUpReal.name} or {WAMReal.name}"