Example #1
def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam,
                      vfcn_type, use_cuda):
    pyrado.set_seed(0)

    if use_cuda:
        policy._device = "cuda"
        policy = policy.to(device="cuda")

    # Create value function
    if vfcn_type == "fnn-plain":
        vfcn = FNN(
            input_size=env.obs_space.flat_dim,
            output_size=1,
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    elif vfcn_type == FNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = FNNPolicy(vf_spec,
                         hidden_sizes=[16, 16],
                         hidden_nonlin=to.tanh,
                         use_cuda=use_cuda)
    elif vfcn_type == RNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = RNNPolicy(vf_spec,
                         hidden_size=16,
                         num_recurrent_layers=1,
                         use_cuda=use_cuda)
    else:
        raise NotImplementedError

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    # Add specific hyper parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #2
def create_flipping_task(
    env_spec: EnvSpec,
    obs_labels: Sequence[str],
    des_angle_delta: float = np.pi / 2.0,
    endless: bool = True,
) -> MaskedTask:
    """
    Create a task for rotating an object.

    .. note::
        This task was designed with an RcsPySim environment in mind, but is not restricted to these environments.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['Box_A']. This needs to match the observations' names in RcsPySim
    :param des_angle_delta: desired angle to rotate. If reached, the task is reset, and rotating continues.
    :param endless: if `True`, the task will promote endlessly repeated flipping about the desired angle, else only one
                    flip is desired
    :return: masked task that only considers a subspace of all observations
    """
    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(
            env_spec.state_space.create_mask(obs_labels)),
    )

    # Create a desired state task
    rew_fcn = CosOfOneEleRewFcn(idx=0)
    task = FlippingTask(spec, des_angle_delta, rew_fcn, endless=endless)

    # Return the masked tasks
    return MaskedTask(env_spec, task, obs_labels)
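A minimal usage sketch (hypothetical spaces and labels; it assumes Pyrado's BoxSpace accepts a labels argument and uses the two-argument step_rew call seen in the RadiallySymmDesStateTask test further below):

angle_space = BoxSpace(-np.pi, np.pi, 1, labels=["Box_A"])  # hypothetical 1D state space
env_spec = EnvSpec(angle_space, BoxSpace(-1, 1, 2), angle_space)
task = create_flipping_task(env_spec, obs_labels=["Box_A"])
task.reset(env_spec=env_spec)  # initializes the internal reference angle
rew = task.step_rew(np.array([0.1]), np.zeros(2))  # cosine-shaped reward on the masked angle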
Example #3
def create_home_pos_task(env_spec: EnvSpec, obs_labels: Sequence[str],
                         state_des: np.ndarray) -> MaskedTask:
    """
    Create a task for moving the robot to a safe position.

    .. note::
        This task was designed with an RcsPySim environment in mind, but is not restricted to these environments.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['PowerGrasp_R_Y', 'PowerGrasp_R_Z']. This needs to match the
                       observations' names in RcsPySim
    :param state_des: desired state (depends on the coordinate system). If reached, the task is over.
    :return: masked task that only considers a subspace of all observations
    """
    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(
            env_spec.state_space.create_mask(obs_labels)),
    )

    # Create a desired state task
    Q = 1e1 * np.eye(len(state_des))
    R = 1e-1 * np.eye(spec.act_space.flat_dim)
    rew_fcn = ExpQuadrErrRewFcn(Q, R)
    task = DesStateTask(spec, state_des, rew_fcn)

    # Return the masked tasks
    return MaskedTask(env_spec, task, obs_labels)
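The reward weights Q = 1e1 * I and R = 1e-1 * I are baked in, so a call only needs the labels and the goal; a sketch with hypothetical spaces and a hypothetical home position:

home_space = BoxSpace(-2.0, 2.0, 2, labels=["PowerGrasp_R_Y", "PowerGrasp_R_Z"])  # hypothetical
env_spec = EnvSpec(home_space, BoxSpace(-1, 1, 2), home_space)
task = create_home_pos_task(env_spec, obs_labels=["PowerGrasp_R_Y", "PowerGrasp_R_Z"], state_des=np.array([0.0, 1.2]))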
Example #4
def create_goal_dist_task(env_spec: EnvSpec,
                          ds_label: int,
                          rew_fcn: RewFcn,
                          succ_thold: float = 0.01) -> MaskedTask:
    """
    Create a task that rewards minimizing the `GoalDistance` of dynamical system movement primitives (see RcsPySim).

    .. note::
        This task was designed with an RcsPySim environment in mind, but is not restricted to these environments.

    :param env_spec: environment specification
    :param ds_label: label of the dynamical system (see RcsPySim)
    :param rew_fcn: reward function
    :param succ_thold: once the object of interest is closer than this threshold, the task is considered successfully completed
    :return: masked task that only considers a subspace of all observations
    """
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    obs_labels = [f"GD_DS{ds_label}"]

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(
            env_spec.state_space.create_mask(obs_labels))
        if env_spec.state_space is not EmptySpace else EmptySpace,
    )

    # Create a desired state task with the goal [0, 0]
    task = DesStateTask(
        spec, np.zeros(2), rew_fcn,
        functools.partial(proximity_succeeded, thold_dist=succ_thold))

    # Mask selected goal distance
    return MaskedTask(env_spec, task, obs_labels)
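A usage sketch, assuming an environment env with a Pyrado spec is given; MinusOnePerStepRewFcn needs no dimensions, which sidesteps sizing a weight matrix here:

task = create_goal_dist_task(env.spec, ds_label=3, rew_fcn=MinusOnePerStepRewFcn())  # `env` assumed given
# masks everything except the observation labeled 'GD_DS3'; succeeds below the default threshold of 0.01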
Example #5
def create_forcemin_task(env_spec: EnvSpec, obs_labels: Sequence[str],
                         Q: np.ndarray) -> MaskedTask:
    """
    Create a task which punishes the amount of used force.

    .. note::
        This task was designed with an RcsPySim environment in mind, but is not restricted to these environments.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['WristLoadCellLBR_R_Fy']. This needs to match the observations'
                       names in RcsPySim
    :param Q: weight matrix of dim NxN with N=num_forces for the quadratic force costs
    :return: masked task that only considers a subspace of all observations
    """
    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(
            env_spec.state_space.create_mask(obs_labels)),
    )

    # Create an endlessly running desired state task
    rew_fcn = QuadrErrRewFcn(Q=Q,
                             R=np.zeros((spec.act_space.flat_dim,
                                         spec.act_space.flat_dim)))
    task = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn,
                        never_succeeded)

    # Mask selected collision cost observation
    return MaskedTask(env_spec, task, obs_labels)
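Since Q must match the number of selected force channels, it is convenient to derive it from the label list (sketch; env and the labels are hypothetical):

labels = ["WristLoadCellLBR_R_Fy", "WristLoadCellLBR_R_Fz"]  # hypothetical
task = create_forcemin_task(env.spec, obs_labels=labels, Q=1e-2 * np.eye(len(labels)))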
Example #6
def create_collision_task(env_spec: EnvSpec, factor: float) -> MaskedTask:
    """
    Create a task which punishes collision costs given a collision model with pairs of bodies.
    This task only looks at the instantaneous collision cost.

    .. note::
        This task was designed with an RcsPySim environment in mind, but is not restricted to these environments.

    :param env_spec: environment specification
    :param factor: cost / reward function scaling factor
    :return: masked task that only considers a subspace of all observations
    """
    if not factor >= 0:
        raise pyrado.ValueErr(given=factor, ge_constraint="0")

    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    obs_labels = ["CollCost"]

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(
            env_spec.state_space.create_mask(obs_labels)),
    )

    rew_fcn = AbsErrRewFcn(q=np.array([factor]),
                           r=np.zeros(spec.act_space.shape))

    # Create an endlessly running desired state task (no collision is desired)
    task = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn,
                        never_succeeded)

    # Mask selected collision cost observation
    return MaskedTask(env_spec, task, obs_labels)
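With AbsErrRewFcn, q = [factor], and a zero desired state, the per-step reward amounts to roughly -factor * |CollCost|; for example (hypothetical env):

task = create_collision_task(env.spec, factor=0.1)  # punishes 0.1 per unit of instantaneous collision cost
# create_collision_task(env.spec, factor=-1.0) would raise pyrado.ValueErr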
Example #7
def create_box_lift_task(env_spec: EnvSpec, continuous_rew_fcn: bool,
                         succ_thold: float):
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['Box_Z']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    # Create a desired state task
    # state_des = np.array([0.3])  # box position is measured relative to the table
    state_des = np.array([1.1])  # box position is measured in world coordinates
    if continuous_rew_fcn:
        Q = np.diag([3e1])
        R = 1e0 * np.eye(spec.act_space.flat_dim)
        rew_fcn = ExpQuadrErrRewFcn(Q, R)
    else:
        rew_fcn = MinusOnePerStepRewFcn()
    dst = DesStateTask(
        spec, state_des, rew_fcn,
        functools.partial(proximity_succeeded, thold_dist=succ_thold))

    # Return the masked tasks
    return MaskedTask(env_spec, dst, idcs)
Example #8
def create_task_space_discrepancy_task(env_spec: EnvSpec,
                                       rew_fcn: RewFcn) -> MaskedTask:
    """
    Create a task which punishes the discrepancy between the actual and the commanded state of the observed body.
    The observed body is specified in the associated experiment configuration file in RcsPySim.
    This task only looks at the X and Z coordinates.

    :param env_spec: environment specification
    :param rew_fcn: reward function
    :return: masked task
    """
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = [idx for idx in env_spec.state_space.labels if 'DiscrepTS' in idx]
    if not idcs:
        raise pyrado.ValueErr(
            msg="No state space labels found that contain 'DiscrepTS'")

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    # Create a desired state task (no task space discrepancy is desired and the task never stops because of success)
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn,
                       never_succeeded)

    # Mask selected discrepancy observation
    return MaskedTask(env_spec, dst, idcs)
Example #9
    def reset(self, env_spec: EnvSpec, **kwargs):
        self._env_spec = env_spec

        # Determine the masks
        if self._state_idcs is not None:
            self._state_mask = env_spec.state_space.create_mask(
                self._state_idcs)
        else:
            self._state_mask = np.ones(env_spec.state_space.shape,
                                       dtype=np.bool_)
        if self._action_idcs is not None:
            self._action_mask = env_spec.act_space.create_mask(
                self._action_idcs)
        else:
            self._action_mask = np.ones(env_spec.act_space.shape,
                                        dtype=np.bool_)

        # Pass masked state and masked action
        self._wrapped_task.reset(
            env_spec=EnvSpec(
                env_spec.obs_space,
                env_spec.act_space.subspace(self._action_mask),
                env_spec.state_space.subspace(self._state_mask)
                if env_spec.state_space is not EmptySpace else EmptySpace,
            ),
            **kwargs,
        )
Example #10
def create_box_flip_task(env_spec: EnvSpec, continuous_rew_fcn):
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['Box_A']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs))
    )

    # Create a desired state task
    # state_des = np.array([0.3])  # box position is measured relative to the table
    state_des = np.array([-np.pi / 2])  # box angle is measured in world coordinates
    if continuous_rew_fcn:
        # q = np.array([0./np.pi])
        # r = 1e-6*np.ones(spec.act_space.flat_dim)
        # rew_fcn_act = AbsErrRewFcn(q, r)
        rew_fcn = CosOfOneEleRewFcn(idx=0)
        # rew_fcn = CompoundRewFcn([rew_fcn_act, rew_fcn_box])
    else:
        rew_fcn = MinusOnePerStepRewFcn()
    ef_task = EndlessFlippingTask(spec, rew_fcn, init_angle=0.)

    # Return the masked tasks
    return MaskedTask(env_spec, ef_task, idcs)
Example #11
    def __init__(self,
                 env_spec: EnvSpec,
                 batch_size: int,
                 reward_multiplier: float,
                 lr: float = 3e-3,
                 logger: StepLogger = None,
                 device: str = 'cuda' if to.cuda.is_available() else 'cpu'):
        """
        Constructor

        :param env_spec: environment specification
        :param batch_size: batch size for each update step
        :param reward_multiplier: factor for the predicted probability
        :param lr: learning rate
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        :param device: device to run the discriminator on; defaults to 'cuda' if available, else 'cpu'
        """
        self.device = device
        self.batch_size = batch_size
        self.reward_multiplier = reward_multiplier
        self.lr = lr
        spec = EnvSpec(obs_space=BoxSpace.cat(
            [env_spec.obs_space, env_spec.obs_space, env_spec.act_space]),
                       act_space=BoxSpace(bound_lo=[0], bound_up=[1]))
        self.discriminator = FNNPolicy(spec=spec,
                                       hidden_nonlin=to.tanh,
                                       hidden_sizes=[62],
                                       output_nonlin=to.sigmoid)
        self.loss_fcn = nn.BCELoss()
        self.optimizer = to.optim.Adam(self.discriminator.parameters(),
                                       lr=lr,
                                       eps=1e-5)
        self.logger = logger
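The concatenated observation space suggests the discriminator scores (observation, next observation, action) triples; a sketch of feeding one input to an instance algo of this class, with hypothetical dimensions:

obs_dim, act_dim = 4, 2  # hypothetical
triple = to.cat([to.zeros(obs_dim), to.ones(obs_dim), to.zeros(act_dim)])
prob = algo.discriminator(triple)  # single value in (0, 1) due to the sigmoid output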
Example #12
def create_box_upper_shelve_task(env_spec: EnvSpec, continuous_rew_fcn: bool,
                                 succ_thold: float):
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['Box_X', 'Box_Y', 'Box_Z', 'Box_A', 'Box_B', 'Box_C']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    # Create a desired state task
    state_des = np.zeros(6)  # zeros since we observe the box position relative to the goal
    if continuous_rew_fcn:
        Q = np.diag([5e0, 5e0, 5e0, 1e-1, 1e-1, 1e-1])
        R = 5e-2 * np.eye(spec.act_space.flat_dim)
        rew_fcn = ExpQuadrErrRewFcn(Q, R)
    else:
        rew_fcn = MinusOnePerStepRewFcn()
    dst = DesStateTask(
        spec, state_des, rew_fcn,
        functools.partial(proximity_succeeded, thold_dist=succ_thold))

    # Return the masked tasks
    return MaskedTask(env_spec, dst, idcs)
Example #13
def create_collision_task(env_spec: EnvSpec, factor: float) -> MaskedTask:
    """
    Create a task which punishes collision costs given a collision model with pairs of bodies.
    This task only looks at the instantaneous collision cost.

    :param env_spec: environment specification
    :param factor: cost / reward function scaling factor
    :return: masked task
    """
    if not factor >= 0:
        raise pyrado.ValueErr(given=factor, ge_constraint='0')

    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['CollCost']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    rew_fcn = AbsErrRewFcn(q=np.array([factor]),
                           r=np.zeros(spec.act_space.shape))

    # Create a desired state task (no collision is desired and the task never stops because of success)
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn,
                       never_succeeded)

    # Mask selected collision cost observation
    return MaskedTask(env_spec, dst, idcs)
Example #14
def create_nonrecurrent_policy():
    return LinearPolicy(
        EnvSpec(
            BoxSpace(-1, 1, 4),
            BoxSpace(-1, 1, 3),
        ),
        FeatureStack(const_feat, identity_feat, squared_feat),
    )
Example #15
def create_recurrent_policy():
    return RNNPolicy(
        EnvSpec(
            BoxSpace(-1, 1, 4),
            BoxSpace(-1, 1, 3),
        ),
        hidden_size=32, num_recurrent_layers=1, hidden_nonlin='tanh'
    )
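Both factory functions above are self-contained, so the resulting policies can be probed directly; a sketch (Pyrado's recurrent policies additionally return their hidden state, hence only the feed-forward one is evaluated here):

policy = create_nonrecurrent_policy()
act = policy(to.zeros(4))  # 4D observation in, 3D action out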
Example #16
def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    randomizer = DomainRandomizer(
        *[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=randomizer)

    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )

    algo_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )

    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **subrtn_hparam),
                **algo_hparam)
    algo.train(snapshot_mode="latest")
    assert algo.curr_iter == algo.max_iter
Example #17
def test_modulated_rew_fcn():
    Q = np.eye(4)
    R = np.eye(2)
    s = np.array([1, 2, 3, 4])
    a = np.array([0, 0])

    # Modulo 2 for all selected states
    idcs = [0, 1, 3]
    rew_fcn = QuadrErrRewFcn(Q, R)
    task = RadiallySymmDesStateTask(EnvSpec(None, None, None), np.zeros(4), rew_fcn, idcs, 2)
    r = task.step_rew(s, a)
    assert r == -(1**2 + 3**2)

    # Different modulo factor for the selected states
    idcs = [1, 3]
    task = RadiallySymmDesStateTask(EnvSpec(None, None, None), np.zeros(4), rew_fcn, idcs, np.array([2, 3]))
    r = task.step_rew(s, a)
    assert r == -(1**2 + 3**2 + 1**2)
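To see where the first assertion comes from, wrap the selected entries by the modulo and evaluate the quadratic error by hand (plain numpy, not library code):

s_mod = np.array([1.0, 2.0, 3.0, 4.0])
s_mod[[0, 1, 3]] %= 2              # -> [1., 0., 3., 0.]
r = -(s_mod @ np.eye(4) @ s_mod)   # the action term vanishes since a = 0
assert r == -(1 ** 2 + 3 ** 2)     # = -10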
Example #18
    def _create_main_task(self, task_args: dict) -> Task:
        # Create a DesStateTask that masks everything but the ball position
        idcs = list(
            range(self.state_space.flat_dim - 6,
                  self.state_space.flat_dim - 3))  # Cartesian ball position
        spec = EnvSpec(
            self.spec.obs_space, self.spec.act_space,
            self.spec.state_space.subspace(
                self.spec.state_space.create_mask(idcs)))

        # If we do not use copy(), state_des coming from MuJoCo is a reference and updates automatically at each step.
        # Note: sim.forward() + get_body_xpos() results in wrong output for state_des, as sim has not been updated to
        # init_space.sample(), which is first called in reset()

        if task_args.get('sparse_rew_fcn', False):
            factor = task_args.get('success_bonus', 1)
            # Binary final reward task
            main_task = FinalRewTask(ConditionOnlyTask(
                spec,
                condition_fcn=self.check_ball_in_cup,
                is_success_condition=True),
                                     mode=FinalRewMode(always_positive=True),
                                     factor=factor)
            # Yield -1 on fail after the main task is done (successfully or not)
            dont_fail_after_succ_task = FinalRewTask(
                GoallessTask(spec, ZeroPerStepRewFcn()),
                mode=FinalRewMode(always_negative=True),
                factor=factor)

            # Augment the binary task with an endless dummy task, to avoid early stopping
            task = SequentialTasks((main_task, dont_fail_after_succ_task))

            return MaskedTask(self.spec, task, idcs)

        else:
            state_des = self.sim.data.get_site_xpos(
                'cup_goal')  # this is a reference
            R_default = np.diag([
                0, 0, 1, 1e-2, 1e-2, 1e-1
            ]) if self.num_dof == 7 else np.diag([0, 0, 1e-2, 1e-2])
            rew_fcn = ExpQuadrErrRewFcn(
                Q=task_args.get('Q', np.diag([
                    2e1, 1e-4, 2e1
                ])),  # distance ball - cup; shouldn't move in y-direction
                R=task_args.get('R',
                                R_default)  # last joint is really unreliable
            )
            task = DesStateTask(spec, state_des, rew_fcn)

            # Wrap the masked DesStateTask to add a bonus for the best state in the rollout
            return BestStateFinalRewTask(MaskedTask(self.spec, task, idcs),
                                         max_steps=self.max_steps,
                                         factor=task_args.get(
                                             'final_factor', 0.05))
Example #19
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh))
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )

    algo = PDDR(ex_dir, env, policy, **algo_hparam)

    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
Example #20
def test_arpl(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env = StateAugmentationWrapper(env, domain_param=None)

    policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    algo_hparam = dict(
        max_iter=2,
        min_steps=23 * env.max_steps,
        min_rollouts=None,
        num_epoch=5,
        eps_clip=0.085,
        batch_size=150,
        std_init=0.995,
        lr=2e-4,
        num_workers=1,
    )
    arpl_hparam = dict(
        max_iter=2,
        steps_num=23 * env.max_steps,
        halfspan=0.05,
        dyn_eps=0.07,
        dyn_phi=0.25,
        obs_phi=0.1,
        obs_eps=0.05,
        proc_phi=0.1,
        proc_eps=0.03,
        torch_observation=True,
    )
    ppo = PPO(ex_dir, env, policy, critic, **algo_hparam)
    algo = ARPL(ex_dir, env, ppo, policy, ppo.expl_strat, **arpl_hparam)

    algo.train(snapshot_mode="best")
Example #21
def create_task_space_discrepancy_task(env_spec: EnvSpec, rew_fcn: RewFcn) -> MaskedTask:
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['DiscrepTS_Y', 'DiscrepTS_Z']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs))
    )

    # Create a desired state task (no task space discrepancy is desired and the task never stops because of success)
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    # Mask selected discrepancy observation
    return MaskedTask(env_spec, dst, idcs)
Example #22
    def __init__(
        self,
        mapping: Dict[int, Tuple[str, str]],
        trafo_mask: Union[list, to.Tensor],
        prior: DomainRandomizer = None,
        scale_params: bool = False,
        use_cuda: bool = False,
    ):
        """
        Constructor

        :param mapping: mapping from subsequent integers to domain distribution parameters, where the first string of
                        the value tuple is the name of the domain parameter (e.g. mass, length) and the second string is
                        the name of the distribution parameter (e.g. mean, std).  The integers are indices of the numpy
                        array which come from the algorithm.
        :param trafo_mask: every domain parameter that is set to `True` in this mask will be learned via a 'virtual'
                           parameter, i.e. in sqrt-space, and then finally squared to retrieve the domain parameter.
                           This transformation is useful to avoid setting a negative variance.
        :param prior: prior belief about the distribution parameters in form of a `DomainRandomizer`
        :param scale_params: if `True`, the sqrt-transformed policy parameters are scaled in the range of $[-0.5, 0.5]$.
                             The advantage of this is to make the parameter-based exploration easier.
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        if not isinstance(mapping, dict):
            raise pyrado.TypeErr(given=mapping, expected_type=dict)
        if not len(trafo_mask) == len(mapping):
            raise pyrado.ShapeErr(given=trafo_mask, expected_match=mapping)

        self._mapping = mapping
        self.mask = to.tensor(trafo_mask, dtype=to.bool)
        self.prior = prior
        self._scale_params = scale_params

        # Define the parameter space by using the Policy.env_spec.act_space
        param_spec = EnvSpec(obs_space=EmptySpace(),
                             act_space=BoxSpace(-pyrado.inf,
                                                pyrado.inf,
                                                shape=len(mapping)))

        # Call Policy's constructor
        super().__init__(param_spec, use_cuda)

        self.params = nn.Parameter(to.empty(param_spec.act_space.flat_dim),
                                   requires_grad=True)
        if self._scale_params:
            self.param_scaler = None  # initialized during init_param()
        self.init_param(prior=prior)
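The sqrt-space trick from the docstring can be illustrated in isolation (a generic sketch, not this class's exact read-out code):

virtual = to.tensor([0.5, -0.3])                      # unconstrained, as seen by the optimizer
mask = to.tensor([True, False])                       # True -> parameter lives in sqrt-space
domain_param = to.where(mask, virtual ** 2, virtual)  # squaring keeps e.g. a variance non-negative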
Example #23
    def _create_task(self, task_args: dict) -> Task:
        # Create a DesStateTask that masks everything but the ball position
        idcs = list(
            range(self.state_space.flat_dim - 3,
                  self.state_space.flat_dim))  # Cartesian ball position
        spec = EnvSpec(
            self.spec.obs_space, self.spec.act_space,
            self.spec.state_space.subspace(
                self.spec.state_space.create_mask(idcs)))

        # If we do not use copy(), state_des coming from MuJoCo is a reference and updates automatically at each step.
        # Note: sim.forward() + get_body_xpos() results in wrong output for state_des, as sim has not been updated to
        # init_space.sample(), which is first called in reset()

        if task_args.get('sparse_rew_fcn', False):
            # Binary final reward task
            task = FinalRewTask(ConditionOnlyTask(
                spec,
                condition_fcn=self.check_ball_in_cup,
                is_success_condition=True),
                                mode=FinalRewMode(always_positive=True),
                                factor=1)

            return MaskedTask(self.spec, task, idcs)

        else:
            # If we do not use copy(), state_des is a reference to the passed body and updates automatically at each step
            state_des = self.sim.data.get_site_xpos(
                'cup_goal')  # this is a reference
            rew_fcn = ExpQuadrErrRewFcn(
                Q=task_args.get('Q', np.diag([
                    1e1, 1e5, 2e1
                ])),  # distance ball - cup; shouldn't move in y-direction
                R=task_args.get('R',
                                np.diag([
                                    1e-1, 1e-1, 1e-1, 1e-2, 1e-2, 1e-2
                                ]))  # desired joint angles and velocities
            )
            task = DesStateTask(spec, state_des, rew_fcn)

            # Wrap the masked DesStateTask to add a bonus for the best state in the rollout
            return BestStateFinalRewTask(MaskedTask(self.spec, task, idcs),
                                         max_steps=self.max_steps,
                                         factor=task_args.get(
                                             'final_factor', 1.))
Example #24
def create_extract_slider_task(env_spec: EnvSpec, task_args: dict,
                               des_state: np.ndarray):
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['Slider_Y']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    # Create a desired state task
    Q = task_args.get('Q_slider', np.array([[4e1]]))
    R = task_args.get('R_slider', 1e-6 * np.eye(spec.act_space.flat_dim))
    rew_fcn = ExpQuadrErrRewFcn(Q, R)
    dst_task = DesStateTask(spec, des_state, rew_fcn)

    # Return the masked tasks
    return MaskedTask(env_spec, dst_task, idcs)
Example #25
def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam,
             critic_hparam, adr_hparam):
    # Create the subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic = GAE(value_fcn, **critic_hparam)
    subroutine = PPO(ex_dir, env, actor, critic, **subrtn_hparam)

    # Create algorithm and train
    particle_hparam = dict(actor=actor_hparam,
                           value_fcn=value_fcn_hparam,
                           critic=critic_hparam)
    algo = ADR(ex_dir,
               env,
               subroutine,
               svpg_particle_hparam=particle_hparam,
               **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #26
def create_insert_task(env_spec: EnvSpec, state_des: np.ndarray,
                       rew_fcn: RewFcn, success_fcn: Callable):
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = [
        'Effector_X', 'Effector_Z', 'Effector_B', 'Effector_Xd', 'Effector_Zd',
        'Effector_Bd'
    ]

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)))

    # Create a wrapped desired state task with the goal behind the wall
    fdst = FinalRewTask(DesStateTask(spec, state_des, rew_fcn, success_fcn),
                        mode=FinalRewMode(state_dependent=True,
                                          time_dependent=True))

    # Mask selected states
    return MaskedTask(env_spec, fdst, idcs)
Example #27
def create_goal_dist_distvel_task(env_spec: EnvSpec,
                                  ds_index: int,
                                  rew_fcn: RewFcn,
                                  succ_thold: float = 0.01) -> MaskedTask:
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = [f'GD_DS{ds_index}', f'GD_DS{ds_index}d']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space, env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs))
        if env_spec.state_space is not EmptySpace else EmptySpace)

    # Create a desired state task with the goal [0, 0]
    dst = DesStateTask(
        spec, np.zeros(2), rew_fcn,
        functools.partial(proximity_succeeded, thold_dist=succ_thold))

    # Mask selected goal distance velocities
    return MaskedTask(env_spec, dst, idcs)
Example #28
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam,
                      value_fcn_type, use_cuda):
    # Create value function
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(input_size=env.obs_space.flat_dim,
                        output_size=1,
                        hidden_sizes=[16, 16],
                        hidden_nonlin=to.tanh,
                        use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec,
                                  hidden_sizes=[16, 16],
                                  hidden_nonlin=to.tanh,
                                  use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec,
                                  hidden_size=16,
                                  num_recurrent_layers=1,
                                  use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    # Add specific hyper parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, linear_policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #29
def test_rff_regression(ex_dir, num_feat_per_dim: int, loss_fcn: Callable,
                        algo_hparam: dict):
    # Generate some data
    inputs = to.linspace(-4.0, 4.0, 8001).view(-1, 1)
    targets = noisy_nonlin_fcn(inputs, f=3.0, noise_std=0).view(-1, 1)

    # Create the policy
    rff = RFFeat(inp_dim=1,
                 num_feat_per_dim=num_feat_per_dim,
                 bandwidth=1 / 20)
    policy = LinearPolicy(
        EnvSpec(InfBoxSpace(shape=(1, )), InfBoxSpace(shape=(1, ))),
        FeatureStack(rff))

    # Create the algorithm, and train
    loss_before = loss_fcn(policy(inputs), targets)
    algo = NonlinRegression(ex_dir, inputs, targets, policy, **algo_hparam)
    algo.train()
    loss_after = loss_fcn(policy(inputs), targets)
    assert loss_after < loss_before
    assert algo.curr_iter >= algo_hparam["max_iter_no_improvement"]
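RFFeat implements random Fourier features; the generic construction draws random frequencies and phases so that cosine features approximate an RBF kernel (the exact bandwidth convention below is an assumption, not necessarily RFFeat's):

num_feat, bandwidth = 16, 1 / 20
w = np.random.randn(num_feat) / np.sqrt(bandwidth)      # frequencies ~ N(0, 1/bandwidth)
b = np.random.uniform(0, 2 * np.pi, size=num_feat)      # random phases
feats = np.cos(np.outer(np.linspace(-4, 4, 5), w) + b)  # feature matrix of shape (5, num_feat)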
Example #30
    def _create_deviation_task(self, task_args: dict) -> Task:
        idcs = list(
            range(self.state_space.flat_dim - 3,
                  self.state_space.flat_dim))  # Cartesian cup goal position
        spec = EnvSpec(
            self.spec.obs_space, self.spec.act_space,
            self.spec.state_space.subspace(
                self.spec.state_space.create_mask(idcs)))
        # init cup goal position
        state_des = np.array([
            0.82521, 0, 1.4469
        ]) if self.num_dof == 7 else np.array([0.758, 0, 1.5])
        rew_fcn = QuadrErrRewFcn(
            Q=task_args.get('Q_dev', np.diag(
                [2e-1, 1e-4, 5e0])),  # Cartesian distance from init position
            R=task_args.get(
                'R_dev',
                np.zeros(
                    (self._act_space.shape[0], self._act_space.shape[0]))))
        task = DesStateTask(spec, state_des, rew_fcn)

        return MaskedTask(self.spec, task, idcs)