def __init__(self,
             inner_algo,
             env,
             policy,
             baseline,
             meta_optimizer,
             meta_batch_size=40,
             inner_lr=0.1,
             outer_lr=1e-3,
             num_grad_updates=1):
    """Set up MAML-style meta-learning around an inner adaptation algorithm.

    Args:
        inner_algo: Adaptation (inner-loop) algorithm; its
            ``max_path_length`` is re-exported on this object.
        env: Environment the tasks are drawn from.
        policy: Policy to meta-train; ``policy.vectorized`` selects the
            sampler class.
        baseline: Value-function baseline (stored, used by the inner algo —
            presumably during adaptation; confirm against subclasses).
        meta_optimizer: Optimizer spec handed to ``make_optimizer`` for the
            outer (meta) update.
        meta_batch_size (int): Number of tasks sampled per meta update.
        inner_lr (float): Step size of the differentiable inner update.
        outer_lr (float): Step size of the meta optimizer.
        num_grad_updates (int): Adaptation gradient steps per task.
    """
    # Vectorized policies can use the faster vectorized on-policy sampler.
    self.sampler_cls = (OnPolicyVectorizedSampler
                        if policy.vectorized else BatchSampler)
    self.max_path_length = inner_algo.max_path_length

    self._env = env
    self._baseline = baseline
    self._inner_algo = inner_algo
    self._meta_batch_size = meta_batch_size
    self._num_grad_updates = num_grad_updates

    self._policy = policy
    # The inner step uses differentiable SGD so the meta-gradient can flow
    # through the adaptation update.
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
def __init__(self,
             env_spec,
             policy,
             baseline,
             optimizer=torch.optim.Adam,
             policy_lr=_Default(1e-2),
             max_path_length=500,
             num_train_per_epoch=1,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             training_batch_size=None,
             training_epochs=1):
    """Construct a vanilla-policy-gradient algorithm instance.

    Args:
        env_spec: Environment specification.
        policy: Policy to optimize.
        baseline: Value-function baseline.
        optimizer: Optimizer class/spec for ``make_optimizer``.
        policy_lr: Policy learning rate (wrapped in ``_Default``).
        max_path_length (int): Maximum rollout length.
        num_train_per_epoch (int): Training iterations per epoch
            (forwarded to the base class as ``n_samples``).
        discount (float): Reward discount factor.
        gae_lambda (float): GAE lambda for advantage estimation.
        center_adv (bool): Mean-center the advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Pass entropy through softplus.
        stop_entropy_gradient (bool): Detach the entropy term.
        entropy_method (str): One of 'no_entropy', 'max', 'regularized'.
        training_batch_size: Minibatch size for the optimizer loop —
            TODO confirm semantics against the training code.
        training_epochs (int): Optimizer epochs per training call.
    """
    self._env_spec = env_spec
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._policy_ent_coeff = policy_ent_coeff
    self._use_softplus_entropy = use_softplus_entropy
    self._stop_entropy_gradient = stop_entropy_gradient
    self._entropy_method = entropy_method
    self._training_batch_size = training_batch_size
    self._training_epochs = training_epochs
    # Small constant, presumably used to avoid division by zero downstream.
    self._eps = 1e-8

    # Derived entropy-mode flags. NOTE(review): the attribute name below
    # keeps its original (misspelled) spelling on purpose — other methods
    # may read it by that exact name.
    self._maximum_entropy = (entropy_method == 'max')
    self._entropy_regularzied = (entropy_method == 'regularized')
    self._check_entropy_configuration(entropy_method, center_adv,
                                      stop_entropy_gradient,
                                      policy_ent_coeff)

    # Rolling window of recent episode rewards for logging.
    self._episode_reward_mean = collections.deque(maxlen=100)
    self._optimizer = make_optimizer(optimizer,
                                     policy,
                                     lr=policy_lr,
                                     eps=_Default(1e-5))

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     baseline=baseline,
                     discount=discount,
                     max_path_length=max_path_length,
                     n_samples=num_train_per_epoch)

    # Frozen snapshot of the pre-update policy (after the base class has
    # set ``self.policy``).
    self._old_policy = copy.deepcopy(self.policy)
def __init__(self,
             env,
             policy,
             baseline,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1):
    """MAML with a KL-constrained (TRPO-style) meta update over VPG.

    Builds a VPG inner algorithm for task adaptation and a conjugate-
    gradient meta optimizer constrained by ``max_kl_step``, then defers
    to the MAML base class.

    Args:
        env: Task environment; ``env.spec`` configures the inner VPG.
        policy: Policy to meta-train.
        baseline: Value-function baseline.
        inner_lr: Adaptation-step learning rate.
        outer_lr (float): Meta-step learning rate.
        max_kl_step (float): KL constraint on the meta update.
        max_path_length (int): Maximum rollout length.
        discount (float): Reward discount factor.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Mean-center advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Pass entropy through softplus.
        stop_entropy_gradient (bool): Detach the entropy term.
        entropy_method (str): Entropy handling mode.
        meta_batch_size (int): Tasks per meta update.
        num_grad_updates (int): Adaptation steps per task.
    """
    # Inner-loop algorithm used for per-task adaptation.
    adaptation_algo = VPG(env.spec,
                          policy,
                          baseline,
                          optimizer=torch.optim.Adam,
                          policy_lr=inner_lr,
                          max_path_length=max_path_length,
                          num_train_per_epoch=1,
                          discount=discount,
                          gae_lambda=gae_lambda,
                          center_adv=center_adv,
                          positive_adv=positive_adv,
                          policy_ent_coeff=policy_ent_coeff,
                          use_softplus_entropy=use_softplus_entropy,
                          stop_entropy_gradient=stop_entropy_gradient,
                          entropy_method=entropy_method)

    # (optimizer class, kwargs) spec understood by make_optimizer in the
    # base class; the KL limit makes the outer step trust-region style.
    cg_spec = (ConjugateGradientOptimizer,
               dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=adaptation_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=cg_spec,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates)
def __init__(self,
             env,
             policy,
             baseline,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             lr_clip_range=5e-1,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1.0,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1):
    """MAML with a PPO inner algorithm and an Adam meta optimizer.

    Args:
        env: Task environment; ``env.spec`` configures the inner PPO.
        policy: Policy to meta-train.
        baseline: Value-function baseline.
        inner_lr: Adaptation-step learning rate.
        outer_lr (float): Meta-step learning rate.
        lr_clip_range (float): PPO likelihood-ratio clip range.
        max_path_length (int): Maximum rollout length.
        discount (float): Reward discount factor.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Mean-center advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Pass entropy through softplus.
        stop_entropy_gradient (bool): Detach the entropy term.
        entropy_method (str): Entropy handling mode.
        meta_batch_size (int): Tasks per meta update.
        num_grad_updates (int): Adaptation steps per task.
    """
    # Inner-loop algorithm used for per-task adaptation.
    adaptation_algo = PPO(env.spec,
                          policy,
                          baseline,
                          optimizer=torch.optim.Adam,
                          policy_lr=inner_lr,
                          lr_clip_range=lr_clip_range,
                          max_path_length=max_path_length,
                          num_train_per_epoch=1,
                          discount=discount,
                          gae_lambda=gae_lambda,
                          center_adv=center_adv,
                          positive_adv=positive_adv,
                          policy_ent_coeff=policy_ent_coeff,
                          use_softplus_entropy=use_softplus_entropy,
                          stop_entropy_gradient=stop_entropy_gradient,
                          entropy_method=entropy_method)

    super().__init__(inner_algo=adaptation_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates)
def __init__(self,
             env_spec,
             policy,
             baseline,
             optimizer=torch.optim.Adam,
             policy_lr=_Default(3e-4),
             max_path_length=500,
             lr_clip_range=2e-1,
             num_train_per_epoch=1,
             discount=0.99,
             gae_lambda=0.97,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             training_batch_size=None,
             training_epochs=1):
    """Proximal Policy Optimization constructor.

    Thin wrapper over the base-class constructor that additionally
    records the PPO likelihood-ratio clip range.

    Args:
        env_spec: Environment specification.
        policy: Policy to optimize.
        baseline: Value-function baseline.
        optimizer: Optimizer class/spec.
        policy_lr: Policy learning rate (wrapped in ``_Default``).
        max_path_length (int): Maximum rollout length.
        lr_clip_range (float): Likelihood-ratio clip range.
        num_train_per_epoch (int): Training iterations per epoch.
        discount (float): Reward discount factor.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Mean-center advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Pass entropy through softplus.
        stop_entropy_gradient (bool): Detach the entropy term.
        entropy_method (str): Entropy handling mode.
        training_batch_size: Minibatch size for the optimizer loop —
            TODO confirm semantics against the training code.
        training_epochs (int): Optimizer epochs per training call.
    """
    super().__init__(env_spec=env_spec,
                     policy=policy,
                     baseline=baseline,
                     optimizer=optimizer,
                     policy_lr=policy_lr,
                     max_path_length=max_path_length,
                     num_train_per_epoch=num_train_per_epoch,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method,
                     training_batch_size=training_batch_size,
                     training_epochs=training_epochs)
    # The only PPO-specific piece of state.
    self._lr_clip_range = lr_clip_range
def __init__(self,
             env_spec,
             policy,
             qf,
             replay_buffer,
             steps_per_epoch=20,
             n_train_steps=50,
             max_path_length=None,
             buffer_batch_size=64,
             min_buffer_size=int(1e4),
             rollout_batch_size=1,
             exploration_strategy=None,
             target_update_tau=0.01,
             discount=0.99,
             policy_weight_decay=0,
             qf_weight_decay=0,
             policy_optimizer=torch.optim.Adam,
             qf_optimizer=torch.optim.Adam,
             policy_lr=_Default(1e-4),
             qf_lr=_Default(1e-3),
             clip_pos_returns=False,
             clip_return=np.inf,
             max_action=None,
             reward_scale=1.,
             smooth_return=True):
    """Deep Deterministic Policy Gradient constructor.

    Args:
        env_spec: Environment specification; its action-space upper bound
            is the default action cap.
        policy: Deterministic policy to train.
        qf: Q-function (critic) to train.
        replay_buffer: Experience replay buffer.
        steps_per_epoch (int): Training steps per epoch.
        n_train_steps (int): Gradient steps per training call.
        max_path_length: Maximum rollout length.
        buffer_batch_size (int): Samples drawn per gradient step.
        min_buffer_size (int): Transitions required before training.
        rollout_batch_size (int): Parallel rollouts per sample step.
        exploration_strategy: Action-noise/exploration strategy, if any.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount factor.
        policy_weight_decay (float): Actor weight decay.
        qf_weight_decay (float): Critic weight decay.
        policy_optimizer: Optimizer class/spec for the actor.
        qf_optimizer: Optimizer class/spec for the critic.
        policy_lr: Actor learning rate (wrapped in ``_Default``).
        qf_lr: Critic learning rate (wrapped in ``_Default``).
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        max_action: Action magnitude cap; defaults to the action-space
            upper bound when None.
        reward_scale (float): Reward multiplier.
        smooth_return (bool): Forwarded to the base class — presumably
            smooths logged returns; confirm against the base algorithm.
    """
    upper_bound = env_spec.action_space.high

    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    # Fall back to the action-space bound when no explicit cap is given.
    self._max_action = max_action if max_action is not None else upper_bound
    self._evaluate = False

    # Logging/diagnostic accumulators.
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     qf=qf,
                     n_train_steps=n_train_steps,
                     steps_per_epoch=steps_per_epoch,
                     max_path_length=max_path_length,
                     buffer_batch_size=buffer_batch_size,
                     min_buffer_size=min_buffer_size,
                     rollout_batch_size=rollout_batch_size,
                     exploration_strategy=exploration_strategy,
                     replay_buffer=replay_buffer,
                     use_target=True,
                     discount=discount,
                     reward_scale=reward_scale,
                     smooth_return=smooth_return)

    # Target networks start as deep copies of the online networks set up
    # by the base class; they are soft-updated with tau elsewhere.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self.qf)
    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer, self.qf, lr=qf_lr)