Example no. 1
    def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, delay=2, capacity=10000, num_workers=1):
        """
        Initialize the TD3 off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, QValue, QValue]): policy and (at least 2) Q-value function approximators to optimize.
            gamma (float): discount factor (a bias-variance trade-off parameter). It describes how much importance
                is given to the future rewards we get.
            lr (float): learning rate.
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are copied directly.
            delay (int): number of steps to wait between the delayed policy and target (Polyak) updates.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators, (tuple, list)):

            # get the policy and Q-value approximator
            policy, q_values = None, []
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_values.append(approximator)

            # check that the policy and Q-value approximators are not None
            if policy is None:
                raise ValueError("No policy approximator was given to the algorithm.")
            if not q_values:
                raise ValueError("No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError("Expecting a list/tuple of a policy and a Q-value functions.")

        # check that there are at least 2 Q-value function approximators (the user can have more)
        if len(q_values) < 2:
            raise ValueError("Expecting at least 2 Q-value function approximators for the TD3 algorithm.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError("The TD3 assumes that the actions are continuous, however got an action which is not.")

        # create the target Q-value functions and target policy by deep-copying the current approximators
        memo = {}
        q_targets = [copy.deepcopy(q_value, memo=memo) for q_value in q_values]
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        returns = TDQValueReturn(q_value=q_values, policy=policy_target, target_qvalue=q_targets, gamma=gamma)

        # create Q-value loss and policy loss
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create policy and q-value updaters
        params_updaters = [PolyakAveraging(current=policy, target=policy_target, rho=polyak)]
        for q_value, q_target in zip(q_values, q_targets):
            params_updaters.append(PolyakAveraging(current=q_value, target=q_target, rho=polyak))

        # create ticks (number of steps to wait before evaluating the loss / parameter updater)
        # this is used to delay the updates
        ticks = {updater: delay for updater in params_updaters}
        ticks.update({policy_loss: delay})

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task, exploration, storage, num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns, updaters=params_updaters,
                          ticks=ticks)

        # initialize RL algorithm
        super(TD3, self).__init__(explorer, evaluator, updater)
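
The `delay` argument above throttles how often the policy loss and the Polyak parameter updaters are evaluated: the critics are trained at every step, while the actor and the target networks are only updated every `delay` steps. Below is a minimal, framework-free sketch of that interaction (plain Python floats stand in for network weights, and `polyak_update` is an illustrative helper, not part of the library used above):

    import copy

    def polyak_update(current_params, target_params, rho=0.995):
        # soft update: target <- rho * target + (1 - rho) * current
        return [rho * t + (1. - rho) * c for c, t in zip(current_params, target_params)]

    policy_params = [0.5, -1.2]                          # toy "network" parameters
    policy_target_params = copy.deepcopy(policy_params)

    delay = 2
    for step in range(1, 7):
        # (critic updates would happen here at every step)
        if step % delay == 0:
            # delayed actor and target updates, as configured via the `ticks` dict above
            policy_target_params = polyak_update(policy_params, policy_target_params)
            print(step, policy_target_params)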
Example no. 2

    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=0.001,
                 polyak=0.995,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the DDPG off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, QValue]): policy and Q-value function approximator to optimize.
            gamma (float): discount factor (a bias-variance trade-off parameter). It describes how much importance
                is given to the future rewards we get.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are copied directly.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators, (tuple, list)) and len(approximators) == 2:

            # get the policy and Q-value approximator
            policy, q_value = None, None
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_value = approximator

            # check that the policy and Q-value approximators are not None
            if policy is None:
                raise ValueError(
                    "No policy approximator was given to the algorithm.")
            if q_value is None:
                raise ValueError(
                    "No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError(
                "Expecting a list/tuple of a policy and a Q-value function.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError(
                "DDPG assumes that the actions are continuous, however a non-continuous action was given."
            )

        # Set target parameters equal to main parameters
        memo = {}
        q_target = copy.deepcopy(q_value, memo=memo)
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QValueTarget(q_values=q_target, policy=policy_target, gamma=gamma)
        returns = TDQValueReturn(q_value=q_value,
                                 policy=policy_target,
                                 target_qvalue=q_target,
                                 gamma=gamma)

        # create Q-value loss and policy loss
        # q_loss = L2Loss(target=target, predictor=q_value)
        # q_loss = ValueLoss(returns=target, value=q_value)
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_value, policy=policy)
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create q value and policy updaters
        q_value_updater = PolyakAveraging(current=q_value,
                                          target=q_target,
                                          rho=polyak)
        policy_updater = PolyakAveraging(current=policy,
                                         target=policy_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          evaluators=returns,
                          updaters=[q_value_updater, policy_updater])

        # initialize RL algorithm
        super(DDPG, self).__init__(explorer, evaluator, updater)
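
For reference, the quantity that `TDQValueReturn` and `MSBELoss` evaluate in the example above is the standard bootstrapped TD target and the mean-squared Bellman error on a sampled batch. A small NumPy sketch of those two computations (the function names here are illustrative, not the library's API):

    import numpy as np

    def td_q_target(rewards, dones, next_q_target_values, gamma=0.99):
        # y = r + gamma * (1 - done) * Q_target(s', pi_target(s'))
        return rewards + gamma * (1. - dones) * next_q_target_values

    def msbe_loss(q_values, targets):
        # mean-squared Bellman error between the current Q estimates and the targets
        return np.mean((q_values - targets) ** 2)

    rewards = np.array([1.0, 0.0, -0.5])
    dones = np.array([0., 0., 1.])        # last transition is terminal
    next_q = np.array([2.0, 1.5, 3.0])    # Q_target(s', pi_target(s')) from the target networks
    q_pred = np.array([2.5, 1.0, -0.2])   # Q(s, a) from the current critic

    print(msbe_loss(q_pred, td_q_target(rewards, dones, next_q)))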
Example no. 3
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=5e-4,
                 polyak=0.995,
                 alpha=0.2,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the SAC off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, Value, QValue]): approximators to optimize.
            gamma (float): discount factor (a bias-variance trade-off parameter). It describes how much importance
                is given to the future rewards we get.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are copied directly.
            alpha (float): entropy regularization coefficient which controls the tradeoff between exploration and
                exploitation. Higher :attr:`alpha` means more exploration, and lower :attr:`alpha` corresponds to more
                exploitation.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check approximators
        if not isinstance(approximators, collections.abc.Iterable):
            raise TypeError(
                "Expecting the approximators to be a list containing a Policy, a Value, and at least 2 "
                "QValues")
        policy, value, q_values = None, None, []
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, Value):
                value = approximator
            elif isinstance(approximator, ActorCritic):
                policy = approximator.actor
                value = approximator.critic
            elif isinstance(approximator, QValue):
                q_values.append(approximator)

        if policy is None:
            raise TypeError("No policy was given to the algorithm.")
        if value is None:
            raise TypeError(
                "No value function approximator was given to the algorithm.")
        if len(q_values) == 0:
            raise TypeError(
                "No Q-value function approximators were given to the algorithm."
            )

        # set target parameters equal to main parameters for the value function
        value_target = copy.deepcopy(value, memo={})

        # create experience replay
        states, actions = policy.states, policy.actions
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create action exploration
        exploration = ActionExploration(policy)

        # create targets
        q_target = ValueTarget(values=value_target, gamma=gamma)
        v_target = EntropyValueTarget(q_values=q_values,
                                      policy=exploration,
                                      alpha=alpha)

        # create losses
        q_loss = MSBELoss(td_return=q_target)  # Q-value target defined above
        policy_loss = QLoss(
            q_value=q_values[0], policy=policy
        )  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create parameter updater for target value function
        params_updater = PolyakAveraging(current=value,
                                         target=value_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          updaters=[params_updater])

        # initialize RL algorithm
        super(SAC, self).__init__(explorer, evaluator, updater)
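
The `EntropyValueTarget` used above corresponds to SAC's soft value target: the element-wise minimum over the Q-value critics, minus `alpha` times the log-likelihood of the action sampled from the current policy. A minimal NumPy sketch of that computation (illustrative only, assuming two critics and a batch of sampled actions):

    import numpy as np

    def entropy_value_target(q_values, log_probs, alpha=0.2):
        # V(s) = min_i Q_i(s, a~) - alpha * log pi(a~ | s), with a~ sampled from the current policy
        min_q = np.min(q_values, axis=0)   # clipped double-Q: minimum over the critics
        return min_q - alpha * log_probs

    q1 = np.array([1.0, 2.0, 0.5])            # first critic on a batch of 3 sampled actions
    q2 = np.array([1.2, 1.8, 0.7])            # second critic on the same batch
    log_probs = np.array([-0.5, -1.0, -0.2])  # log pi(a~ | s) for the sampled actions

    print(entropy_value_target(np.stack([q1, q2]), log_probs, alpha=0.2))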
Example no. 4
    def __init__(self,
                 task,
                 approximator,
                 gamma=0.99,
                 lr=5e-4,
                 capacity=10000,
                 polyak=0.995,
                 num_workers=1):
        """
        Initialize the DQN reinforcement learning algorithm.

        Args:
            task (RLTask, Env): RL task/env to run.
            approximator (ParametrizedQValueOutput, PolicyFromQValue): approximator to use and update.
            gamma (float): discount factor (a bias-variance trade-off parameter). It describes how much importance
                is given to the future rewards we get.
            lr (float): learning rate.
            capacity (int): capacity of the experience replay storage.
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are copied directly.
            num_workers (int): number of processes / workers to run in parallel.
        """
        # check given approximator
        if isinstance(approximator, ParametrizedQValueOutput):
            policy = PolicyFromQValue(approximator)
            q_value = approximator
        elif isinstance(approximator, PolicyFromQValue):
            policy = approximator
            q_value = approximator.value
        else:
            raise TypeError(
                "Expecting the given approximator to be an instance of `PolicyFromQValue`, or "
                "`ParametrizedQValueOutput`, instead got: {}".format(
                    type(approximator)))

        # create the target Q-value function by deep-copying the current approximator
        q_target = copy.deepcopy(q_value, memo={})

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # create action exploration strategy
        exploration = EpsilonGreedyActionExploration(policy=policy,
                                                     action=actions)

        # create experience replay and sampler
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QLearningTarget(q_values=q_target, gamma=gamma)
        td_return = TDQLearningReturn(q_value=q_value,
                                      target_qvalue=q_target,
                                      gamma=gamma)

        # create loss
        # loss = HuberLoss(L2Loss(target=target, predictor=q_value))
        loss = HuberLoss(MSBELoss(td_return=td_return), delta=1.)

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create target updater
        # target_updater = CopyParameter(current=q_value, target=q_target, sleep_count=100)
        target_updater = PolyakAveraging(current=q_value,
                                         target=q_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(policy,
                          sampler,
                          loss,
                          optimizer,
                          evaluators=[td_return],
                          updaters=[target_updater])

        # initialize RL algorithm
        super(DQN, self).__init__(explorer, evaluator, updater)
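
The DQN example combines epsilon-greedy exploration, the Q-learning target, and a Huber loss on the TD error. The sketch below shows these three pieces in plain NumPy (the function names are illustrative, not the library's API):

    import numpy as np

    rng = np.random.default_rng(0)

    def epsilon_greedy(q_row, epsilon=0.1):
        # pick a random action with probability epsilon, otherwise the greedy one
        if rng.random() < epsilon:
            return int(rng.integers(len(q_row)))
        return int(np.argmax(q_row))

    def q_learning_target(reward, done, next_q_target_row, gamma=0.99):
        # y = r + gamma * (1 - done) * max_a' Q_target(s', a')
        return reward + gamma * (1. - done) * np.max(next_q_target_row)

    def huber(residual, delta=1.0):
        # quadratic for small TD errors, linear for large ones (less sensitive to outliers than MSE)
        a = np.abs(residual)
        return np.where(a <= delta, 0.5 * a ** 2, delta * (a - 0.5 * delta))

    next_q_row = np.array([0.3, 1.7, 0.9])   # Q_target(s', .) over the discrete actions
    y = q_learning_target(reward=1.0, done=0., next_q_target_row=next_q_row)
    print(epsilon_greedy(next_q_row), huber(y - 1.2))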