Example #1
    def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, delay=2, capacity=10000, num_workers=1):
        """
        Initialize the TD3 off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, QValue, QValue]): policy and at least two Q-value function approximators to
                optimize.
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            lr (float): learning rate.
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are simply copied into the target(s).
            delay (int): number of update steps to wait between two consecutive updates of the policy and the
                target parameters.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators, (tuple, list)):

            # get the policy and Q-value approximator
            policy, q_values = None, []
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_values.append(approximator)

            # check that the policy and Q-value approximator are different than None
            if policy is None:
                raise ValueError("No policy approximator was given to the algorithm.")
            if not q_values:
                raise ValueError("No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError("Expecting a list/tuple containing a policy and Q-value function approximators.")

        # check that there are at least 2 Q-value function approximators (the user can have more)
        if len(q_values) < 2:
            raise ValueError("Expecting at least 2 Q-value function approximators for the TD3 algorithm.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError("TD3 assumes that the actions are continuous, but got an action that is not.")

        # create the target Q-value functions and target policy by copying the current approximators
        memo = {}
        q_targets = [copy.deepcopy(q_value, memo=memo) for q_value in q_values]
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        returns = TDQValueReturn(q_value=q_values, policy=policy_target, target_qvalue=q_targets, gamma=gamma)

        # create Q-value loss and policy loss
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create policy and q-value updaters
        params_updaters = [PolyakAveraging(current=policy, target=policy_target, rho=polyak)]
        for q_value, q_target in zip(q_values, q_targets):
            params_updaters.append(PolyakAveraging(current=q_value, target=q_target, rho=polyak))

        # create ticks (number of steps to wait before evaluating the loss / parameter updater)
        # this is used to delay the updates
        ticks = {updater: delay for updater in params_updaters}
        ticks.update({policy_loss: delay})

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task, exploration, storage, num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns, updaters=params_updaters,
                          ticks=ticks)

        # initialize RL algorithm
        super(TD3, self).__init__(explorer, evaluator, updater)
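The `polyak` and `delay` arguments above control how and when the target networks are refreshed. Below is a minimal NumPy sketch of the Polyak (soft) update combined with a delayed update schedule; the `soft_update` helper and the toy parameter lists are illustrative stand-ins, not part of the library.

import numpy as np

def soft_update(target_params, current_params, rho=0.995):
    # Polyak averaging: rho=1 leaves the target unchanged, rho=0 copies the current parameters
    return [rho * t + (1.0 - rho) * c for t, c in zip(target_params, current_params)]

# toy "networks": two parameter arrays each
current = [np.ones(3), np.full(3, 2.0)]
target = [np.zeros(3), np.zeros(3)]

delay = 2
for step in range(1, 7):
    # ... the Q-value approximators would be updated every step here ...
    if step % delay == 0:  # delayed policy/target updates, as in TD3
        target = soft_update(target, current, rho=0.995)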
Example #2
    def __init__(self, task, approximators, gamma=0.99, lr=0.001, num_batches=10, batch_size=10, num_workers=1):
        """
        Initialize the REINFORCE on-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators (Policy, [Policy, Value], ActorCritic): approximators to optimize
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            lr (float): learning rate
            num_batches (int): number of batches to sample from the storage at each update.
            batch_size (int): batch size used by the sampler.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check approximators
        policy, value, actor_critic = None, None, None
        if isinstance(approximators, Policy):
            policy = approximators
            if not policy.is_parametric():
                raise ValueError("The policy should be parametric.")
        elif isinstance(approximators, (tuple, list)):
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, ValueApproximator):
                    value = approximator
            actor_critic = ActorCritic(policy, value)
        elif isinstance(approximators, ActorCritic):
            policy = approximators.actor
            value = approximators.critic
            actor_critic = approximators
        else:
            raise TypeError("Expecting the approximators to be an instance of `Policy`, or `ActorCritic`, instead got:"
                            " {}".format(type(approximators)))

        # create exploration strategy (Boltzmann exploration for discrete actions, Gaussian for continuous actions)
        exploration = ActionExploration(policy)

        # create storage
        states, actions = policy.states, policy.actions
        storage = RolloutStorage(num_steps=1000, state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                                 num_trajectories=1)
        # note: the `batch_size` argument from the signature is not forwarded here; the sampler draws batch
        # sizes within the given bounds instead
        sampler = BatchRandomSampler(storage, num_batches=num_batches, batch_size_bounds=(8, 64))

        # create return: R_t = \sum_{t'=t}^{T} \gamma^{t'-t} r_{t'}
        returns = ActionRewardEstimator(storage, gamma=gamma)

        # create policy evaluator that will compute :math:`a \sim \pi(.|s_t)` and :math:`\pi(.|s_t)` on batch
        policy_evaluator = PolicyEvaluator(policy=exploration)

        # create loss for policy: \mathbb{E}[ \log \pi_{\theta}(a_t | s_t) R_t ]
        loss = PGLoss(returns)

        # create optimizer for policy (and possibly value function)
        optimizer = Adam(learning_rate=lr)

        # if value function, create its loss
        if value is not None:
            approximators = [policy, value]
            value_loss = ValueL2Loss(returns, value)
            loss = [loss, value_loss]
        else:
            approximators = policy

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task, exploration, storage, num_workers=num_workers)
        evaluator = Evaluator(returns)
        updater = Updater(approximators, sampler, loss, optimizer, evaluators=[policy_evaluator])

        # initialize RL algorithm
        super(REINFORCE, self).__init__(explorer, evaluator, updater)
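The `ActionRewardEstimator` above corresponds to the discounted Monte Carlo return R_t = \sum_{t'=t}^{T} \gamma^{t'-t} r_{t'} mentioned in the comment. A minimal pure-Python sketch of that computation (the `discounted_returns` helper is illustrative, not the library's API):

def discounted_returns(rewards, gamma=0.99):
    # compute R_t = sum_{t'=t}^{T} gamma^(t'-t) * r_{t'} by scanning the rewards backwards
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

print(discounted_returns([1.0, 0.0, 2.0], gamma=0.5))  # [1.5, 1.0, 2.0]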
Example #3
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=5e-4,
                 polyak=0.995,
                 alpha=0.2,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the SAC off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, Value, QValue]): approximators to optimize.
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are simply copied into the target(s).
            alpha (float): entropy regularization coefficient which controls the tradeoff between exploration and
                exploitation. Higher :attr:`alpha` means more exploration, and lower :attr:`alpha` corresponds to more
                exploitation.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check approximators
        if not isinstance(approximators, collections.abc.Iterable):
            raise TypeError(
                "Expecting the approximators to be a list containing a Policy, a Value, and at least 2 "
                "QValues")
        policy, value, q_values = None, None, []
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, Value):
                value = approximator
            elif isinstance(approximator, ActorCritic):
                policy = approximator.actor
                value = approximator.critic
            elif isinstance(approximator, QValue):
                q_values.append(approximator)

        if policy is None:
            raise TypeError("No policy was given to the algorithm.")
        if value is None:
            raise TypeError(
                "No value function approximator was given to the algorithm.")
        if len(q_values) == 0:
            raise TypeError(
                "No Q-value function approximators were given to the algorithm."
            )

        # set target parameters equal to main parameters for the value function
        value_target = copy.deepcopy(value, memo={})

        # create experience replay
        states, actions = policy.states, policy.actions
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create action exploration
        exploration = ActionExploration(policy)

        # create targets
        q_target = ValueTarget(values=value_target, gamma=gamma)
        v_target = EntropyValueTarget(q_values=q_values,
                                      policy=exploration,
                                      alpha=alpha)

        # create losses (note: the entropy-regularized `v_target` above would be used to fit the value function,
        # but this snippet does not add a value-function loss to `losses`)
        q_loss = MSBELoss(td_return=q_target)  # assuming the TD target is the `q_target` defined above
        policy_loss = QLoss(
            q_value=q_values[0], policy=policy
        )  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create parameter updater for target value function
        params_updater = PolyakAveraging(current=value,
                                         target=value_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          updaters=params_updater)

        # initialize RL algorithm
        super(SAC, self).__init__(explorer, evaluator, updater)
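For reference, the entropy-regularized target that `EntropyValueTarget` stands for in standard SAC is V(s) ≈ min_i Q_i(s, a) - α log π(a|s) with a ~ π(·|s). A small NumPy sketch of that computation on a toy batch (the `soft_value_target` helper is illustrative, not the library's implementation):

import numpy as np

def soft_value_target(q_estimates, log_probs, alpha=0.2):
    # clipped double-Q: element-wise minimum over the Q estimates, minus the entropy term
    q_min = np.min(np.stack(q_estimates, axis=0), axis=0)
    return q_min - alpha * log_probs

# toy batch of 4 states with two Q estimates and the log-probabilities of the sampled actions
q1 = np.array([1.0, 2.0, 0.5, 3.0])
q2 = np.array([1.2, 1.8, 0.7, 2.5])
log_pi = np.array([-1.0, -0.5, -2.0, -0.1])
print(soft_value_target([q1, q2], log_pi, alpha=0.2))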
Example #4
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=0.001,
                 polyak=0.995,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the DDPG off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, QValue]): policy and Q-value function approximator to optimize.
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are simply copied into the target(s).
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators,
                      (tuple, list)) and len(approximators) == 2:

            # get the policy and Q-value approximator
            policy, q_value = None, None
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_value = approximator

            # check that the policy and Q-value approximator are different than None
            if policy is None:
                raise ValueError(
                    "No policy approximator was given to the algorithm.")
            if q_value is None:
                raise ValueError(
                    "No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError(
                "Expecting a list/tuple of a policy and a Q-value function.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError(
                "DDPG assumes that the actions are continuous, but got an action that is not.")

        # Set target parameters equal to main parameters
        memo = {}
        q_target = copy.deepcopy(q_value, memo=memo)
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QValueTarget(q_values=q_target, policy=policy_target, gamma=gamma)
        returns = TDQValueReturn(q_value=q_value,
                                 policy=policy_target,
                                 target_qvalue=q_target,
                                 gamma=gamma)

        # create Q-value loss and policy loss
        # q_loss = L2Loss(target=target, predictor=q_value)
        # q_loss = ValueLoss(returns=target, value=q_value)
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_value, policy=policy)
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create q value and policy updaters
        q_value_updater = PolyakAveraging(current=q_value,
                                          target=q_target,
                                          rho=polyak)
        policy_updater = PolyakAveraging(current=policy,
                                         target=policy_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          evaluators=returns,
                          updaters=[q_value_updater, policy_updater])

        # initialize RL algorithm
        super(DDPG, self).__init__(explorer, evaluator, updater)
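The `TDQValueReturn` above corresponds to the standard DDPG TD target y = r + γ (1 - done) Q_targ(s', μ_targ(s')). A small NumPy sketch with stand-in target networks (the lambdas below are placeholders, not the library's approximators):

import numpy as np

def ddpg_td_target(rewards, next_states, dones, policy_target, q_target, gamma=0.99):
    # y = r + gamma * (1 - done) * Q_targ(s', mu_targ(s'))
    next_actions = policy_target(next_states)
    return rewards + gamma * (1.0 - dones) * q_target(next_states, next_actions)

# stand-ins for the target policy and target Q-value function
policy_target = lambda s: np.tanh(s)           # deterministic actions in [-1, 1]
q_target = lambda s, a: (s * a).sum(axis=-1)   # dummy Q estimate

rewards = np.array([1.0, 0.0])
next_states = np.array([[0.1, 0.2], [0.3, -0.4]])
dones = np.array([0.0, 1.0])
print(ddpg_td_target(rewards, next_states, dones, policy_target, q_target))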
Example #5
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 tau=0.95,
                 clip=0.2,
                 lr=5e-4,
                 l2_coeff=0.5,
                 entropy_coeff=0.01,
                 num_workers=1,
                 storage=None):
        """
        Initialize the PPO algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators (ActorCritic, [Policy, Value]): approximators to optimize
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            tau (float): trace-decay parameter (a bias-variance tradeoff). If :math:`\tau=1`, this results
                in a Monte Carlo method, while :math:`\tau=0` results in a one-step TD method.
            clip (float): clipping parameter :math:`\epsilon` of the clipped surrogate objective.
            lr (float): learning rate
            l2_coeff (float): coefficient for the squared-error loss between the target and approximated value functions.
            entropy_coeff (float): coefficient for the entropy loss.
            num_workers (int): number of workers (useful when parallelizing the code)
            storage (RolloutStorage, None): rollout storage to use; if None, a default one is created.
        """
        logger.debug('creating PPO algorithm')

        # create actor critic
        actor_critic = approximators
        if isinstance(approximators, (tuple, list)):
            policy, value = None, None
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, ValueApproximator):
                    value = approximator
            actor_critic = ActorCritic(policy, value)
        if not isinstance(actor_critic, ActorCritic):
            raise TypeError(
                "Expecting 'actor_critic' to be an instance of ActorCritic")

        # get policy and value
        policy = actor_critic.actor
        value = actor_critic.critic

        # create exploration strategy (wrap the original policy and specify how to explore)
        # By default, for discrete actions it will use a Categorical distribution and for continuous actions, it will
        # use a Gaussian with a diagonal covariance matrix.
        logger.debug(
            'creating the action exploration strategies for each action')
        exploration = ActionExploration(policy)

        # create storage and sampler
        states, actions = policy.states, policy.actions
        logger.debug('create rollout storage')
        if storage is None:
            storage = RolloutStorage(num_steps=1000,
                                     state_shapes=states.merged_shape,
                                     action_shapes=actions.merged_shape,
                                     num_trajectories=num_workers)
        logger.debug('create storage sampler')
        sampler = BatchRandomSampler(storage)

        # create estimator
        logger.debug('create return estimator (GAE)')
        estimator = GAE(storage, value, gamma=gamma, tau=tau)

        # create policy evaluator that will compute :math:`a \sim \pi(.|s_t)` and :math:`\pi(.|s_t)` on batch
        policy_evaluator = PolicyEvaluator(policy=exploration)

        # create loss
        logger.debug('create loss')
        loss = CLIPLoss(estimator, clip=clip) + l2_coeff * ValueL2Loss(
            estimator, value) + entropy_coeff * EntropyLoss()

        # create optimizer
        logger.debug('create Adam optimizer')
        optimizer = Adam(learning_rate=lr)

        # define the 3 main steps in RL: explore, evaluate, and update
        logger.debug('create explorer, evaluator, and updater')
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(estimator)
        # update both the policy and the value function (the loss above includes a value L2 term)
        updater = Updater(actor_critic,
                          sampler,
                          loss,
                          optimizer,
                          evaluators=[policy_evaluator])

        # initialize RL algorithm
        super(PPO, self).__init__(explorer, evaluator, updater)
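For reference, `GAE` and `CLIPLoss` above stand for the generalized advantage estimate and the clipped surrogate objective of standard PPO. A hedged NumPy sketch of both quantities (the helpers below are illustrative, not the library's API):

import numpy as np

def gae(rewards, values, gamma=0.99, tau=0.95):
    # A_t = sum_l (gamma * tau)^l * delta_{t+l}, with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
    # `values` holds one extra entry for the bootstrap value of the last state
    advantages, running = np.zeros(len(rewards)), 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * tau * running
        advantages[t] = running
    return advantages

def clipped_surrogate(ratios, advantages, clip=0.2):
    # PPO objective: mean of min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)
    return np.mean(np.minimum(ratios * advantages,
                              np.clip(ratios, 1.0 - clip, 1.0 + clip) * advantages))

adv = gae(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.3, 0.0]))
print(adv, clipped_surrogate(np.array([1.1, 0.7, 1.4]), adv))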
Example #6
    def __init__(self,
                 task,
                 approximator,
                 gamma=0.99,
                 lr=5e-4,
                 capacity=10000,
                 polyak=0.995,
                 num_workers=1):
        """
        Initialize the DQN reinforcement learning algorithm.

        Args:
            task (RLTask, Env): RL task/env to run.
            approximator (ParametrizedQValueOutput, PolicyFromQValue): approximator to use and update.
            gamma (float): discount factor (a bias-variance tradeoff); it describes how much importance is given
                to future rewards.
            lr (float): learning rate.
            capacity (int): capacity of the experience replay storage.
            polyak (float): coefficient (between 0 and 1) used in the Polyak averaging when updating the target
                approximators. If 1, the target parameter(s) are left unchanged; if 0, the current parameter(s)
                are simply copied into the target(s).
            num_workers (int): number of processes / workers to run in parallel.
        """
        # check given approximator
        if isinstance(approximator, ParametrizedQValueOutput):
            policy = PolicyFromQValue(approximator)
            q_value = approximator
        elif isinstance(approximator, PolicyFromQValue):
            policy = approximator
            q_value = approximator.value
        else:
            raise TypeError(
                "Expecting the given approximator to be an instance of `PolicyFromQValue`, or "
                "`ParametrizedQValueOutput`, instead got: {}".format(
                    type(approximator)))

        # create the target Q-value function by copying the Q-value function approximator
        q_target = copy.deepcopy(q_value, memo={})

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # create action exploration strategy
        exploration = EpsilonGreedyActionExploration(policy=policy,
                                                     action=actions)

        # create experience replay and sampler
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QLearningTarget(q_values=q_target, gamma=gamma)
        td_return = TDQLearningReturn(q_value=q_value,
                                      target_qvalue=q_target,
                                      gamma=gamma)

        # create loss
        # loss = HuberLoss(L2Loss(target=target, predictor=q_value))
        loss = HuberLoss(MSBELoss(td_return=td_return), delta=1.)

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create target updater
        # target_updater = CopyParameter(current=q_value, target=q_target, sleep_count=100)
        target_updater = PolyakAveraging(current=q_value,
                                         target=q_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(policy,
                          sampler,
                          loss,
                          optimizer,
                          evaluators=[td_return],
                          updaters=[target_updater])

        # initialize RL algorithm
        super(DQN, self).__init__(explorer, evaluator, updater)
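For reference, the epsilon-greedy exploration and the Q-learning TD target fed to the Huber loss above are the standard DQN ingredients. A minimal NumPy sketch with a stand-in vector of Q-values rather than the library's approximators:

import numpy as np

def epsilon_greedy(q_values, epsilon=0.1, rng=np.random):
    # with probability epsilon pick a random action, otherwise the greedy one
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))

def dqn_td_target(reward, next_q_values, done, gamma=0.99):
    # y = r + gamma * (1 - done) * max_a Q_targ(s', a)
    return reward + gamma * (1.0 - done) * np.max(next_q_values)

q_next = np.array([0.2, 1.5, -0.3])
print(epsilon_greedy(q_next, epsilon=0.1))
print(dqn_td_target(1.0, q_next, done=0.0))  # 1.0 + 0.99 * 1.5 = 2.485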