Example #1
    def __init__(self, dt: float, gamma: float, lr: float, tau: float,
                 optimizer: str, val_function: ParametricFunction,
                 adv_function: ParametricFunction) -> None:
        CompoundStateful.__init__(self)
        self._reference_obs: Tensor = None
        self._val_function = val_function
        self._adv_function = adv_function
        self._target_val_function = copy.deepcopy(val_function)
        self._target_adv_function = copy.deepcopy(adv_function)

        self._adv_optimizer = \
            setup_optimizer(self._adv_function.parameters(),
                            opt_name=optimizer, lr=lr, dt=dt,
                            inverse_gradient_magnitude=1,
                            weight_decay=0)
        self._val_optimizer = \
            setup_optimizer(self._val_function.parameters(),
                            opt_name=optimizer, lr=lr, dt=dt,
                            inverse_gradient_magnitude=dt,
                            weight_decay=0)

        self._dt = dt
        self._gamma = gamma
        self._tau = tau
        info(
            f"setup> using AdvantageCritic, the provided gamma and rewards are scaled,"
            f" actual values: gamma={gamma ** dt}, rewards=original_rewards * {dt}"
        )

        self._device = 'cpu'
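
The log line above spells out the time-step convention used throughout these snippets: with a physical step size dt, the effective discount becomes gamma ** dt and rewards are multiplied by dt. A minimal sketch of that scaling (the numbers are illustrative assumptions, not values from the original code):

# Illustrative only: the gamma/reward scaling reported in the info log.
gamma = 0.99   # per-unit-time discount (assumed value)
dt = 0.02      # simulation time step (assumed value)
reward = 1.0   # an undiscounted per-unit-time reward (assumed value)

effective_gamma = gamma ** dt   # ~0.9998, discount applied per step
scaled_reward = reward * dt     # 0.02, reward integrated over one step
print(f"gamma={effective_gamma:.6f}, reward={scaled_reward}")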
Example #2
    def __init__(self, dt: float, gamma: float, lr: float, optimizer: str,
                 q_function: ParametricFunction, tau: float,
                 noscale: bool) -> None:
        CompoundStateful.__init__(self)
        self._reference_obs: Tensor = None
        self._q_function = q_function
        self._target_q_function = copy.deepcopy(self._q_function)
        self._tau = tau

        ref_dt = 0.02
        if noscale:
            # Rescale the discount to the reference time step, then run the
            # critic as if the time step were ref_dt itself.
            self._gamma = gamma ** (dt / ref_dt)
            dt = ref_dt
            self._dt = ref_dt
        else:
            self._gamma = gamma
            self._dt = dt
        info(
            f"setup> using ValueCritic, the provided gamma and rewards are scaled,"
            f" actual values: gamma={gamma ** self._dt},"
            f" rewards=original_rewards * {self._dt}")

        self._q_optimizer = \
            setup_optimizer(self._q_function.parameters(),
                            opt_name=optimizer, lr=lr, dt=self._dt,
                            inverse_gradient_magnitude=self._dt,
                            weight_decay=0)

        self._device = 'cpu'
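
One way to read the noscale branch: raising gamma to dt / ref_dt and then treating ref_dt as the step size leaves the discount attributed to a real interval of length dt unchanged, since (gamma ** (dt / ref_dt)) ** ref_dt == gamma ** dt. A small check of that identity with assumed values, purely for illustration:

# Illustrative arithmetic for the noscale branch (all values are assumptions).
gamma, dt, ref_dt = 0.99, 0.05, 0.02

rescaled_gamma = gamma ** (dt / ref_dt)   # what the snippet stores in self._gamma
# Pure identity: discounting an interval of physical length dt is the same
# whether expressed with (gamma, dt) or with (rescaled_gamma, ref_dt).
assert abs(rescaled_gamma ** ref_dt - gamma ** dt) < 1e-12
print(rescaled_gamma)                     # ~0.9752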
Example #3
    def __init__(self, T: int, actor: TypeA2CActor, critic: A2CCritic,
                 opt_name: str, lr: float, dt: float, weight_decay: float):
        OnlineAgent.__init__(self, T=T, actor=actor, critic=critic)

        self._optimizer = setup_optimizer(chain(
            self._actor._policy_function.parameters(),
            self._critic._v_function.parameters()),
                                          opt_name=opt_name,
                                          lr=lr,
                                          dt=dt,
                                          inverse_gradient_magnitude=1,
                                          weight_decay=weight_decay)
Example #4
    def __init__(self, T: int, actor: OnlineActor, critic: OnlineCritic,
                 learn_per_step: int, batch_size: int, opt_name: str,
                 lr: float, dt: float, weight_decay: float):
        OnlineAgent.__init__(self, T=T, actor=actor, critic=critic)
        self._learn_per_step = learn_per_step
        self._batch_size = batch_size

        self._optimizer = setup_optimizer(chain(
            self._actor._policy_function.parameters(),
            self._critic._v_function.parameters()),
                                          opt_name=opt_name,
                                          lr=lr,
                                          dt=dt,
                                          inverse_gradient_magnitude=1,
                                          weight_decay=weight_decay)
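
Examples #3 and #4 share the same pattern: the actor's policy network and the critic's value network are trained by a single optimizer whose parameter list is built with itertools.chain. A minimal sketch of that pattern, using a plain torch.optim.Adam in place of the repository's setup_optimizer and two small nn.Linear modules standing in for the policy and value functions:

from itertools import chain

import torch
import torch.nn as nn

# Stand-ins for the actor's policy function and the critic's value function.
policy_function = nn.Linear(8, 2)
v_function = nn.Linear(8, 1)

# One optimizer over both parameter sets, as in the chain(...) calls above.
optimizer = torch.optim.Adam(
    chain(policy_function.parameters(), v_function.parameters()),
    lr=3e-4, weight_decay=0.0)

# A single backward pass then updates actor and critic together.
obs = torch.randn(16, 8)
loss = policy_function(obs).pow(2).mean() + v_function(obs).pow(2).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()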
Example #5
    def __init__(self, policy_function: ParametricFunction, noise: Noise,
                 lr: float, tau: float, opt_name: str, dt: float,
                 weight_decay: float) -> None:
        CompoundStateful.__init__(self)
        self._policy_function = policy_function
        self._target_policy_function = copy.deepcopy(self._policy_function)

        self._optimizer = setup_optimizer(self._policy_function.parameters(),
                                          opt_name=opt_name,
                                          lr=lr,
                                          dt=dt,
                                          inverse_gradient_magnitude=1,
                                          weight_decay=weight_decay)
        self._noise = noise
        self._tau = tau
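
The deep-copied target network and the stored tau in this last example are the usual ingredients of a soft (Polyak) target update. The snippet itself does not show the update step, so the following is only a hedged sketch of how such a tau-weighted update is commonly written, not the repository's actual code:

import copy

import torch
import torch.nn as nn

policy_function = nn.Linear(8, 2)
target_policy_function = copy.deepcopy(policy_function)
tau = 0.005  # assumed value; the snippet only stores it as self._tau

# Soft update: target <- (1 - tau) * target + tau * online.
with torch.no_grad():
    for target_param, param in zip(target_policy_function.parameters(),
                                   policy_function.parameters()):
        target_param.mul_(1.0 - tau).add_(tau * param)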