def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, delay=2, capacity=10000,
             num_workers=1):
    """
    Initialize the TD3 off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        approximators (list/tuple of Policy and QValue): approximators to optimize. It must contain a policy and
            at least 2 Q-value function approximators. If several policies are given, the last one is used.
        gamma (float): discount factor (which is a bias-variance tradeoff). This parameter describes how much
            importance has the future rewards we get.
        lr (float): learning rate.
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will let the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        delay (int): number of steps to wait before performing an update. It is applied to the policy loss and
            to the target parameter updaters.
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel

    Raises:
        TypeError: if `approximators` is not a list/tuple.
        ValueError: if no policy is given, if fewer than 2 Q-value approximators are given, or if the policy
            actions are not continuous.
    """
    # check given approximators
    if not isinstance(approximators, (tuple, list)):
        raise TypeError("Expecting a list/tuple of a policy and a Q-value functions.")

    # get the policy and Q-value approximators
    policy, q_values = None, []
    for approximator in approximators:
        # a Q-value approximator must not be mistaken for the policy: test `Policy` alone here, otherwise the
        # `QValue` branch below can never be reached
        if isinstance(approximator, Policy):
            policy = approximator
        elif isinstance(approximator, QValue):
            q_values.append(approximator)

    # check that the policy and Q-value approximators were found
    if policy is None:
        raise ValueError("No policy approximator was given to the algorithm.")
    if not q_values:
        raise ValueError("No Q-value approximator was given to the algorithm.")

    # check that there is at least 2 Q-value function approximators (the user can have more)
    if len(q_values) < 2:
        raise ValueError("Expecting at least 2 Q-value function approximators for the TD3 algorithm.")

    # get states and actions from policy
    states, actions = policy.states, policy.actions

    # check that the actions are continuous
    if not actions.is_continuous():
        raise ValueError("The TD3 assumes that the actions are continuous, however got an action which is not.")

    # create the target approximators by copying the current ones; the same memo dict is shared between the
    # deep copies so that objects referenced by several approximators are copied only once
    memo = {}
    q_targets = [copy.deepcopy(q_value, memo=memo) for q_value in q_values]
    policy_target = copy.deepcopy(policy, memo=memo)

    # create action exploration strategy
    exploration = ActionExploration(policy=policy, action=actions)

    # create experience replay
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create target return estimator
    returns = TDQValueReturn(q_value=q_values, policy=policy_target, target_qvalue=q_targets, gamma=gamma)

    # create Q-value loss and policy loss
    q_loss = MSBELoss(td_return=returns)
    policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first q-value is used to train the policy
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create policy and q-value target updaters
    params_updaters = [PolyakAveraging(current=policy, target=policy_target, rho=polyak)]
    for q_value, q_target in zip(q_values, q_targets):
        params_updaters.append(PolyakAveraging(current=q_value, target=q_target, rho=polyak))

    # create ticks (number of steps to wait before evaluating the loss / parameter updater)
    # this is used to delay the updates
    ticks = {updater: delay for updater in params_updaters}
    ticks.update({policy_loss: delay})

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns, updaters=params_updaters,
                      ticks=ticks)

    # initialize RL algorithm
    super(TD3, self).__init__(explorer, evaluator, updater)
def __init__(self, task, approximators, gamma=0.99, lr=0.001, num_batches=10, batch_size=10, num_workers=1):
    """
    Initialize the REINFORCE on-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        approximators (Policy, [Policy, Value], ActorCritic): approximators to optimize
        gamma (float): discount factor (which is a bias-variance tradeoff). This parameter describes how much
            importance has the future rewards we get.
        lr (float): learning rate
        num_batches (int): number of batches given to the batch sampler.
        batch_size (int): currently unused; the sampler is created with fixed batch size bounds of (8, 64).
        num_workers (int): number of processes / workers to run in parallel

    Raises:
        TypeError: if `approximators` is not a `Policy`, a list/tuple, or an `ActorCritic`.
        ValueError: if a single policy is given and is not parametric, or if a list/tuple is given that does
            not contain any policy.
    """
    # check approximators
    policy, value, actor_critic = None, None, None
    if isinstance(approximators, Policy):
        policy = approximators
        if not policy.is_parametric():
            raise ValueError("The policy should be parametric.")
    elif isinstance(approximators, (tuple, list)):
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, ValueApproximator):
                value = approximator
        # fail early with a clear message instead of an attribute error on `None` further down
        if policy is None:
            raise ValueError("No policy approximator was given to the algorithm.")
        # NOTE(review): `actor_critic` is not used afterwards in this constructor; the construction is kept
        # in case `ActorCritic` performs validation or linking - confirm and remove if it does not.
        actor_critic = ActorCritic(policy, value)
    elif isinstance(approximators, ActorCritic):
        policy = approximators.actor
        value = approximators.critic
        actor_critic = approximators
    else:
        raise TypeError("Expecting the approximators to be an instance of `Policy`, or `ActorCritic`, instead got:"
                        " {}".format(type(approximators)))

    # create exploration strategy (if action is discrete, boltzmann exploration. If action is continuous, gaussian)
    exploration = ActionExploration(policy)

    # create storage
    states, actions = policy.states, policy.actions
    storage = RolloutStorage(num_steps=1000, state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                             num_trajectories=1)
    # use the user-provided number of batches (the default matches the previously hard-coded value of 10)
    sampler = BatchRandomSampler(storage, num_batches=num_batches, batch_size_bounds=(8, 64))

    # create return: R_t = \sum_{t'=t}^{T} \gamma^{t'-t} r_{t'}
    returns = ActionRewardEstimator(storage, gamma=gamma)

    # create policy evaluator that will compute :math:`a \sim \pi(.|s_t)` and :math:`\pi(.|s_t)` on batch
    policy_evaluator = PolicyEvaluator(policy=exploration)

    # create loss for policy: \mathbb{E}[ \log \pi_{\theta}(a_t | s_t) R_t ]
    loss = PGLoss(returns)

    # create optimizer for policy (and possibly value function)
    optimizer = Adam(learning_rate=lr)

    # if value function, create its loss and optimize both approximators
    if value is not None:
        approximators = [policy, value]
        value_loss = ValueL2Loss(returns, value)
        loss = [loss, value_loss]
    else:
        approximators = policy

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(returns)
    updater = Updater(approximators, sampler, loss, optimizer, evaluators=[policy_evaluator])

    # initialize RL algorithm
    super(REINFORCE, self).__init__(explorer, evaluator, updater)
def __init__(self, task, approximators, gamma=0.99, lr=5e-4, polyak=0.995, alpha=0.2, capacity=10000,
             num_workers=1):
    """
    Initialize the SAC off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        approximators ([Policy, Value, QValue]): approximators to optimize. An `ActorCritic` item is also
            accepted and provides both the policy (its actor) and the value function (its critic).
        gamma (float): discount factor (which is a bias-variance tradeoff). This parameter describes how much
            importance has the future rewards we get.
        lr (float): learning rate
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will let the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        alpha (float): entropy regularization coefficient which controls the tradeoff between exploration and
            exploitation. Higher :attr:`alpha` means more exploration, and lower :attr:`alpha` corresponds to more
            exploitation.
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel

    Raises:
        TypeError: if `approximators` is not iterable, or if it does not contain a policy, a value function
            approximator, and at least one Q-value function approximator.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in `collections.abc` (with a fallback for
    # Python 2 where only `collections.Iterable` exists)
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    # check approximators
    if not isinstance(approximators, Iterable):
        raise TypeError("Expecting the approximators to be a list containing a Policy, a Value, and at least 2 "
                        "QValues")

    policy, value, q_values = None, None, []
    for approximator in approximators:
        if isinstance(approximator, Policy):
            policy = approximator
        elif isinstance(approximator, Value):
            value = approximator
        elif isinstance(approximator, ActorCritic):
            policy = approximator.actor
            value = approximator.critic
        elif isinstance(approximator, QValue):
            q_values.append(approximator)

    if policy is None:
        raise TypeError("No policy was given to the algorithm.")
    if value is None:
        raise TypeError("No value function approximator was given to the algorithm.")
    if len(q_values) == 0:
        raise TypeError("No Q-value function approximators were given to the algorithm.")

    # set target parameters equal to main parameters for the value function
    value_target = copy.deepcopy(value, memo={})

    # create experience replay
    states, actions = policy.states, policy.actions
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create action exploration
    exploration = ActionExploration(policy)

    # create targets
    q_target = ValueTarget(values=value_target, gamma=gamma)
    # NOTE(review): `v_target` is built but not consumed by any loss below; a value-function loss using it
    # appears to be missing - confirm the intended loss before relying on the value function being trained.
    v_target = EntropyValueTarget(q_values=q_values, policy=exploration, alpha=alpha)

    # create losses
    # The original code passed an undefined name (`estimator`) here, which raised a NameError on every call.
    # NOTE(review): `q_target` is presumably the intended target for the Q-value loss - confirm that `MSBELoss`
    # accepts it as its `td_return`.
    q_loss = MSBELoss(td_return=q_target)
    policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first q-value is used to train the policy
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create parameter updater for target value function
    params_updater = PolyakAveraging(current=value, target=value_target, rho=polyak)

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, updaters=params_updater)

    # initialize RL algorithm
    super(SAC, self).__init__(explorer, evaluator, updater)
def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, capacity=10000, num_workers=1):
    """
    Initialize the DDPG off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        approximators ([Policy, QValue]): policy and Q-value function approximator to optimize. It must be a
            list/tuple of exactly 2 items (in any order).
        gamma (float): discount factor (which is a bias-variance trade-off). This parameter describes how much
            importance has the future rewards we get.
        lr (float): learning rate
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will let the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel

    Raises:
        TypeError: if `approximators` is not a list/tuple of length 2.
        ValueError: if the policy or the Q-value approximator is missing, or if the policy actions are not
            continuous.
    """
    # check given approximators: exactly one policy and one Q-value function are expected
    if not (isinstance(approximators, (tuple, list)) and len(approximators) == 2):
        raise TypeError("Expecting a list/tuple of a policy and a Q-value function.")

    # get the policy and Q-value approximator
    policy, q_value = None, None
    for approximator in approximators:
        # test `Policy` alone: matching `(Policy, QValue)` here would treat the Q-value function as the policy
        # and leave `q_value` unset
        if isinstance(approximator, Policy):
            policy = approximator
        elif isinstance(approximator, QValue):
            q_value = approximator

    # check that the policy and Q-value approximator are different than None
    if policy is None:
        raise ValueError("No policy approximator was given to the algorithm.")
    if q_value is None:
        raise ValueError("No Q-value approximator was given to the algorithm.")

    # get states and actions from policy
    states, actions = policy.states, policy.actions

    # check that the actions are continuous
    if not actions.is_continuous():
        raise ValueError("The DDPG assumes that the actions are continuous, however got an action which is not.")

    # set target parameters equal to main parameters; the memo dict is shared between the two deep copies so
    # that objects referenced by both approximators are copied only once
    memo = {}
    q_target = copy.deepcopy(q_value, memo=memo)
    policy_target = copy.deepcopy(policy, memo=memo)

    # create action exploration strategy
    exploration = ActionExploration(policy=policy, action=actions)

    # create experience replay
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create target return estimator
    returns = TDQValueReturn(q_value=q_value, policy=policy_target, target_qvalue=q_target, gamma=gamma)

    # create Q-value loss and policy loss
    q_loss = MSBELoss(td_return=returns)
    policy_loss = QLoss(q_value=q_value, policy=policy)
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create q value and policy target updaters
    q_value_updater = PolyakAveraging(current=q_value, target=q_target, rho=polyak)
    policy_updater = PolyakAveraging(current=policy, target=policy_target, rho=polyak)

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns,
                      updaters=[q_value_updater, policy_updater])

    # initialize RL algorithm
    super(DDPG, self).__init__(explorer, evaluator, updater)
def __init__(self, task, approximators, gamma=0.99, tau=0.95, clip=0.2, lr=5e-4, l2_coeff=0.5,
             entropy_coeff=0.01, num_workers=1, storage=None):
    r"""
    Initialize the PPO algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        approximators (ActorCritic, [Policy, Value]): approximators to optimize
        gamma (float): discount factor (which is a bias-variance tradeoff). This parameter describes how much
            importance has the future rewards we get.
        tau (float): trace-decay parameter (which is a bias-variance tradeoff). If :math:`\tau=1`, this results
            in a Monte Carlo method, while :math:`\tau=0` results in a one-step TD methods.
        clip (float): clip parameter
        lr (float): learning rate
        l2_coeff (float): coefficient for squared-error loss between the target and approximated value functions.
        entropy_coeff (float): coefficient for entropy loss.
        num_workers (int): number of workers (useful when parallelizing the code)
        storage (RolloutStorage, None): storage to use. If None, a rollout storage of 1000 steps with one
            trajectory per worker is created.

    Raises:
        TypeError: if the approximators are neither an `ActorCritic` nor a list/tuple.
        ValueError: if a list/tuple is given that does not contain both a policy and a value approximator.
    """
    logger.debug('creating PPO algorithm')

    # create actor critic
    actor_critic = approximators
    if isinstance(approximators, (tuple, list)):
        policy, value = None, None
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, ValueApproximator):
                value = approximator
        # fail early with a clear message instead of an attribute error on `None` further down
        if policy is None:
            raise ValueError("No policy approximator was given to the algorithm.")
        if value is None:
            raise ValueError("No value function approximator was given to the algorithm.")
        actor_critic = ActorCritic(policy, value)

    if not isinstance(actor_critic, ActorCritic):
        raise TypeError("Expecting 'actor_critic' to be an instance of ActorCritic")

    # get policy and value
    policy = actor_critic.actor
    value = actor_critic.critic

    # create exploration strategy (wrap the original policy and specify how to explore)
    # By default, for discrete actions it will use a Categorical distribution and for continuous actions, it will
    # use a Gaussian with a diagonal covariance matrix.
    logger.debug('creating the action exploration strategies for each action')
    exploration = ActionExploration(policy)

    # create storage and sampler; a storage given by the user is used as is instead of being overwritten
    states, actions = policy.states, policy.actions
    if storage is None:
        logger.debug('create rollout storage')
        storage = RolloutStorage(num_steps=1000, state_shapes=states.merged_shape,
                                 action_shapes=actions.merged_shape, num_trajectories=num_workers)
    logger.debug('create storage sampler')
    sampler = BatchRandomSampler(storage)

    # create estimator
    logger.debug('create return estimator (GAE)')
    estimator = GAE(storage, value, gamma=gamma, tau=tau)

    # create policy evaluator that will compute :math:`a \sim \pi(.|s_t)` and :math:`\pi(.|s_t)` on batch
    policy_evaluator = PolicyEvaluator(policy=exploration)

    # create loss
    logger.debug('create loss')
    loss = CLIPLoss(estimator, clip=clip) + l2_coeff * ValueL2Loss(estimator, value) + \
        entropy_coeff * EntropyLoss()

    # create optimizer
    logger.debug('create Adam optimizer')
    optimizer = Adam(learning_rate=lr)

    # define the 3 main steps in RL: explore, evaluate, and update
    logger.debug('create explorer, evaluator, and updater')
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(estimator)
    # the loss contains a value-function term, so the value approximator is given to the updater along with
    # the policy (previously only the policy was given)
    updater = Updater([policy, value], sampler, loss, optimizer, evaluators=[policy_evaluator])

    # initialize RL algorithm
    super(PPO, self).__init__(explorer, evaluator, updater)
def __init__(self, task, approximator, gamma=0.99, lr=5e-4, capacity=10000, polyak=0.995, num_workers=1):
    """
    Set up the DQN reinforcement learning algorithm.

    The constructor derives a policy and a Q-value function from the given approximator, deep-copies the
    Q-value function to obtain a target, and wires together the explorer, evaluator, and updater that are
    handed to the parent class.

    Args:
        task (RLTask, Env): RL task/env to run.
        approximator (ParametrizedQValueOutput, PolicyFromQValue): approximator to use and update. A
            `ParametrizedQValueOutput` is wrapped in a `PolicyFromQValue`; for a `PolicyFromQValue`, its
            `value` attribute is used as the Q-value function.
        gamma (float): discount factor, given to the TD return estimator.
        lr (float): learning rate of the Adam optimizer.
        capacity (int): capacity of the experience replay storage.
        polyak (float): coefficient given as `rho` to the polyak averaging that updates the target Q-value
            function.
        num_workers (int): number of processes / workers given to the explorer.

    Raises:
        TypeError: if the approximator is neither a `ParametrizedQValueOutput` nor a `PolicyFromQValue`.
    """
    # reject unsupported approximators up front
    if not isinstance(approximator, (ParametrizedQValueOutput, PolicyFromQValue)):
        raise TypeError("Expecting the given approximator to be an instance of `PolicyFromQValue`, or "
                        "`ParametrizedQValueOutput`, instead got: {}".format(type(approximator)))

    # split the approximator into a policy and the Q-value function it relies on
    if isinstance(approximator, ParametrizedQValueOutput):
        policy = PolicyFromQValue(approximator)
        q_function = approximator
    else:
        policy = approximator
        q_function = approximator.value

    # the target Q-value function starts as a deep copy of the current one
    q_function_target = copy.deepcopy(q_function, memo={})

    # state and action spaces come from the policy
    state_space, action_space = policy.states, policy.actions

    # epsilon-greedy exploration over the policy actions
    explore_strategy = EpsilonGreedyActionExploration(policy=policy, action=action_space)

    # experience replay buffer and its random batch sampler
    replay = ExperienceReplay(
        state_shapes=state_space.merged_shape,
        action_shapes=action_space.merged_shape,
        capacity=capacity,
    )
    batch_sampler = BatchRandomSampler(replay)

    # TD return estimator, and the Huber loss wrapping the MSBE loss built on it
    td_estimator = TDQLearningReturn(q_value=q_function, target_qvalue=q_function_target, gamma=gamma)
    huber_loss = HuberLoss(MSBELoss(td_return=td_estimator), delta=1.)

    # optimizer
    adam = Adam(learning_rate=lr)

    # polyak averaging from the current Q-value function to its target
    polyak_updater = PolyakAveraging(current=q_function, target=q_function_target, rho=polyak)

    # the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, explore_strategy, replay, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(policy, batch_sampler, huber_loss, adam, evaluators=[td_estimator],
                      updaters=[polyak_updater])

    # hand the three steps over to the parent RL algorithm
    super(DQN, self).__init__(explorer, evaluator, updater)