示例#1
0
    def __init__(self, dt: float, gamma: float, lr: float, optimizer: str,
                 q_function: ParametricFunction, tau: float,
                 noscale: bool) -> None:
        """Store the Q-function, build its target copy and its optimizer.

        :param dt: time step; used as-is unless ``noscale`` is set.
        :param gamma: discount factor as provided by the caller.
        :param lr: learning rate forwarded to ``setup_optimizer``.
        :param optimizer: optimizer name forwarded as ``opt_name``.
        :param q_function: parametric Q-function; a deep copy of it is kept
            as the target function.
        :param tau: stored as ``self._tau`` (presumably the target-network
            update rate -- confirm against the update code).
        :param noscale: when true, the reference step ``0.02`` replaces
            ``dt`` and gamma is stored as ``gamma ** (dt / 0.02)``.
        """
        CompoundStateful.__init__(self)
        self._reference_obs: Tensor = None
        self._q_function = q_function
        self._target_q_function = copy.deepcopy(self._q_function)
        self._tau = tau

        # Reference time step substituted for `dt` in the `noscale` setting.
        ref_dt = 0.02
        if noscale:
            self._gamma = gamma ** (dt / ref_dt)
            self._dt = ref_dt
        else:
            self._gamma = gamma
            self._dt = dt

        # NOTE(review): the logged gamma is computed from the raw `gamma`
        # argument, not from `self._gamma`; confirm this is the intended
        # value to report when `noscale` is set.
        message = (
            f"setup> using ValueCritic, the provided gamma and rewards are scaled,"
            f" actual values: gamma={gamma ** self._dt},"
            f" rewards=original_rewards * {self._dt}")
        info(message)

        self._q_optimizer = setup_optimizer(
            self._q_function.parameters(),
            opt_name=optimizer,
            lr=lr,
            dt=self._dt,
            inverse_gradient_magnitude=self._dt,
            weight_decay=0)

        self._device = 'cpu'
示例#2
0
    def __init__(self, dt: float, gamma: float, lr: float, tau: float,
                 optimizer: str, val_function: ParametricFunction,
                 adv_function: ParametricFunction) -> None:
        """Store value/advantage functions, their targets and optimizers.

        :param dt: time step; forwarded to both optimizers and used as the
            ``inverse_gradient_magnitude`` of the value optimizer.
        :param gamma: discount factor as provided by the caller.
        :param lr: learning rate forwarded to ``setup_optimizer``.
        :param tau: stored as ``self._tau`` (presumably the target-network
            update rate -- confirm against the update code).
        :param optimizer: optimizer name forwarded as ``opt_name``.
        :param val_function: parametric value function; deep-copied to
            create its target.
        :param adv_function: parametric advantage function; deep-copied to
            create its target.
        """
        CompoundStateful.__init__(self)
        self._reference_obs: Tensor = None
        self._val_function = val_function
        self._adv_function = adv_function
        self._target_val_function = copy.deepcopy(val_function)
        self._target_adv_function = copy.deepcopy(adv_function)

        # Keyword arguments common to both optimizers; only
        # `inverse_gradient_magnitude` differs between the two.
        shared_opt_args = dict(opt_name=optimizer, lr=lr, dt=dt,
                               weight_decay=0)
        self._adv_optimizer = setup_optimizer(
            self._adv_function.parameters(),
            inverse_gradient_magnitude=1,
            **shared_opt_args)
        self._val_optimizer = setup_optimizer(
            self._val_function.parameters(),
            inverse_gradient_magnitude=dt,
            **shared_opt_args)

        self._dt = dt
        self._gamma = gamma
        self._tau = tau
        message = (
            f"setup> using AdvantageCritic, the provided gamma and rewards are scaled,"
            f" actual values: gamma={gamma ** dt}, rewards=original_rewards * {dt}"
        )
        info(message)

        self._device = 'cpu'
示例#3
0
 def __init__(self, gamma: float, dt: float,
              v_function: ParametricFunction) -> None:
     """Store the value function and the time-scaled discount factor.

     :param gamma: discount factor as provided by the caller; it is stored
         as ``gamma ** dt``.
     :param dt: time step, stored as ``self._dt``.
     :param v_function: parametric value function, stored as
         ``self._v_function``.
     """
     CompoundStateful.__init__(self)
     self._reference_obs: Tensor = None
     self._v_function = v_function
     # Unlike the raw argument, the stored gamma is already raised to `dt`.
     self._gamma = gamma**dt
     self._device = 'cpu'
     self._dt = dt

     message = (
         f"setup> using OnlineCritic, the provided gamma and rewards are scaled,"
         f" actual values: gamma={gamma ** dt}, rewards=original_rewards * {dt}"
     )
     info(message)
示例#4
0
    def __init__(self, policy_function: ParametricFunction, noise: Noise,
                 lr: float, tau: float, opt_name: str, dt: float,
                 weight_decay: float) -> None:
        """Store the policy, a deep copy as target, its optimizer and noise.

        :param policy_function: parametric policy; a deep copy of it is kept
            as the target policy.
        :param noise: noise object, stored as ``self._noise``.
        :param lr: learning rate forwarded to ``setup_optimizer``.
        :param tau: stored as ``self._tau`` (presumably the target-network
            update rate -- confirm against the update code).
        :param opt_name: optimizer name forwarded to ``setup_optimizer``.
        :param dt: time step forwarded to ``setup_optimizer``.
        :param weight_decay: weight decay forwarded to ``setup_optimizer``.
        """
        CompoundStateful.__init__(self)
        self._policy_function = policy_function
        self._target_policy_function = copy.deepcopy(self._policy_function)

        policy_optimizer = setup_optimizer(
            self._policy_function.parameters(),
            opt_name=opt_name, lr=lr, dt=dt,
            inverse_gradient_magnitude=1,
            weight_decay=weight_decay)
        self._optimizer = policy_optimizer

        self._noise = noise
        self._tau = tau
示例#5
0
    def __init__(self, T: int, actor: OnlineActor,
                 critic: OnlineCritic) -> None:
        """Wire an online actor and critic together and initialise counters.

        :param T: stored as ``self._T`` (presumably the number of steps
            between updates or a trajectory length -- confirm with the
            code that reads ``_T``).
        :param actor: online actor, stored as ``self._actor``.
        :param critic: online critic, stored as ``self._critic``.
        """
        CompoundStateful.__init__(self)

        # set in train mode (reset() is called further down, once the
        # learning components exist)
        self.train()

        # define learning components
        self._actor = actor
        self._critic = critic
        self._count = 0
        self._T = T
        self._device = "cpu"
        # NOTE(review): reset() runs after the attributes above are set and
        # before the two below; keep this ordering unless reset() is checked.
        self.reset()

        # _nb_train_env starts as None; _current_trajectories starts empty
        self._nb_train_env: Optional[int] = None
        self._current_trajectories: List[Trajectory] = []
示例#6
0
    def __init__(self, memory_size: int, batch_size: int, steps_btw_train: int,
                 learn_per_step: int, alpha: Optional[float],
                 beta: Optional[float], actor: Actor, critic: Critic) -> None:
        """Wire an actor, a critic and a memory sampler together.

        :param memory_size: forwarded to ``setup_memory``.
        :param batch_size: forwarded to ``setup_memory``.
        :param steps_btw_train: stored as ``self._steps_btw_train``
            (presumably environment steps between training phases --
            confirm with the code that reads it).
        :param learn_per_step: stored as ``self._learn_per_step``
            (presumably learning iterations per training phase -- confirm).
        :param alpha: forwarded to ``setup_memory``; may be ``None``.
        :param beta: forwarded to ``setup_memory``; may be ``None``.
        :param actor: stored as ``self._actor``.
        :param critic: stored as ``self._critic``.
        """
        CompoundStateful.__init__(self)

        # reset and set in train mode
        # NOTE(review): both calls run before any attribute below exists;
        # keep this ordering unless reset()/train() are checked.
        self.reset()
        self.train()

        # define learning components
        self._actor = actor
        self._critic = critic
        self._sampler = setup_memory(alpha=alpha,
                                     beta=beta,
                                     memory_size=memory_size,
                                     batch_size=batch_size)

        # counter and parameters
        self._count = 0
        self._warm_up = 10  # prevents learning from a near empty buffer
        self._steps_btw_train = steps_btw_train
        self._learn_per_step = learn_per_step
示例#7
0
 def to(self, device):
     """Delegate to ``CompoundStateful.to`` and remember the device.

     :param device: target device, forwarded unchanged to
         ``CompoundStateful.to``.
     :return: ``self``, so that calls can be chained.
     """
     CompoundStateful.to(self, device)
     # Recorded only after the delegated call has returned.
     self._device = device
     return self
示例#8
0
 def to(self, device) -> "OnlineCritic":
     """Delegate to ``CompoundStateful.to`` and remember the device.

     :param device: target device, forwarded unchanged to
         ``CompoundStateful.to``.
     :return: ``self``, so that calls can be chained.
     """
     CompoundStateful.to(self, device)
     # Recorded only after the delegated call has returned.
     self._device = device
     return self
示例#9
0
 def to(self, device):
     """Delegate to ``CompoundStateful.to`` and return its result.

     :param device: target device, forwarded unchanged.
     :return: whatever ``CompoundStateful.to`` returns.
     """
     moved = CompoundStateful.to(self, device)
     return moved
示例#10
0
 def load_state_dict(self, state_dict: StateDict):
     """Restore state from ``state_dict``, including the step counter.

     The base-class loader runs first; the ``"count"`` entry is then read
     back into ``self._count``.

     :param state_dict: state mapping; must contain a ``"count"`` key.
     """
     CompoundStateful.load_state_dict(self, state_dict)
     restored_count = state_dict["count"]
     self._count = restored_count
示例#11
0
 def state_dict(self) -> StateDict:
     """Return the base-class state with the counter added.

     :return: the mapping produced by ``CompoundStateful.state_dict`` with
         an extra ``"count"`` entry holding ``self._count``.
     """
     snapshot = CompoundStateful.state_dict(self)
     snapshot["count"] = self._count
     return snapshot
示例#12
0
    def __init__(self, policy_function: ParametricFunction,
                 dt: float, c_entropy: float) -> None:
        """Store the policy function and the entropy coefficient.

        :param policy_function: parametric policy, stored as
            ``self._policy_function``.
        :param dt: accepted but not used in this constructor.
        :param c_entropy: stored as ``self._c_entropy`` (presumably the
            weight of an entropy term in the loss -- confirm with the
            code that reads it).
        """
        CompoundStateful.__init__(self)
        self._policy_function = policy_function

        self._c_entropy = c_entropy
示例#13
0
 def to(self, device):
     """Delegate to ``CompoundStateful.to`` and move the noise object too.

     :param device: target device, forwarded to ``CompoundStateful.to`` and
         to ``self._noise.to``.
     :return: ``self``, so that calls can be chained.
     """
     CompoundStateful.to(self, device)
     # `_noise.to` returns the object to keep, so rebind the attribute.
     relocated_noise = self._noise.to(device)
     self._noise = relocated_noise
     return self
示例#14
0
 def to(self, device) -> "OnlineAgent":
     """Remember the device, then delegate to ``CompoundStateful.to``.

     :param device: target device, forwarded unchanged to
         ``CompoundStateful.to``.
     :return: ``self``, so that calls can be chained.
     """
     # The device is recorded before the delegated call is made.
     self._device = device
     CompoundStateful.to(self, device)
     return self