def __init__(self,
             input_shape,
             output_dim,
             name='BernoulliMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.sigmoid,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             tr_optimizer=None,
             tr_optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             normalize_inputs=True,
             layer_normalization=False):
    super().__init__(input_shape, output_dim, name)
    self._use_trust_region = use_trust_region
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        optimizer_args = optimizer_args or dict()
        tr_optimizer_args = tr_optimizer_args or dict()

        if optimizer is None:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

        if tr_optimizer is None:
            self._tr_optimizer = make_optimizer(ConjugateGradientOptimizer,
                                                **tr_optimizer_args)
        else:
            self._tr_optimizer = make_optimizer(tr_optimizer,
                                                **tr_optimizer_args)
        self._first_optimized = False

    self.model = NormalizedInputMLPModel(
        input_shape,
        output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        layer_normalization=layer_normalization)

    self._dist = Bernoulli(output_dim)
    self._network = None

    self._initialize()

def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    with tf.name_scope(self._name):
        self._target = target
        tf_optimizer = make_optimizer(self._tf_optimizer,
                                      **self._learning_rate)
        self._train_op = tf_optimizer.minimize(
            loss, var_list=target.get_params())

        if extra_inputs is None:
            extra_inputs = list()
        self._input_vars = inputs + extra_inputs
        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss), )

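# The LazyDict above defers compiling the TensorFlow loss function until it is
# first looked up. A rough illustration of that pattern follows; this toy
# _LazyDictSketch and its sample entry are stand-ins for illustration only,
# not the classes imported by the optimizer above.
class _LazyDictSketch:
    """Dict whose values are zero-argument factories, evaluated on first use."""

    def __init__(self, **factories):
        self._factories = factories
        self._cache = {}

    def __getitem__(self, key):
        # Build and memoize the value lazily, so expensive work (such as
        # compiling a graph function) happens only when first requested.
        if key not in self._cache:
            self._cache[key] = self._factories[key]()
        return self._cache[key]


# Usage mirroring self._opt_fun['f_loss']: the lambda body runs only here.
opt_fun = _LazyDictSketch(f_loss=lambda: (lambda xs: sum(xs)**2))
assert opt_fun['f_loss']([1, 2]) == 9
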
def __init__(self,
             inner_algo,
             env,
             policy,
             sampler,
             task_sampler,
             meta_optimizer,
             meta_batch_size=40,
             inner_lr=0.1,
             outer_lr=1e-3,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    self._sampler = sampler

    self.max_episode_length = inner_algo.max_episode_length

    self._meta_evaluator = meta_evaluator
    self._policy = policy
    self._env = env
    self._task_sampler = task_sampler
    self._value_function = copy.deepcopy(inner_algo._value_function)
    self._initial_vf_state = self._value_function.state_dict()
    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size
    self._inner_algo = inner_algo
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          module=policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
    self._evaluate_every_n_epochs = evaluate_every_n_epochs

def __init__(self,
             optimizer,
             module,
             max_optimization_epochs=1,
             minibatch_size=None):
    self._optimizer = make_optimizer(optimizer, module=module)
    self._max_optimization_epochs = max_optimization_epochs
    self._minibatch_size = minibatch_size

def test_torch_make_optimizer_with_tuple(self):
    """Test make_optimizer function with tuple as first argument."""
    optimizer_type = (torch.optim.Adam, {'lr': 0.1})
    module = torch.nn.Linear(2, 1)
    optimizer = make_optimizer(optimizer_type, module=module)
    # pylint: disable=isinstance-second-argument-not-valid-type
    assert isinstance(optimizer, optimizer_type)
    assert optimizer.defaults['lr'] == optimizer_type[1]['lr']

def test_torch_make_optimizer_with_type(self):
    """Test make_optimizer function with type as first argument."""
    optimizer_type = torch.optim.Adam
    module = torch.nn.Linear(2, 1)
    lr = 0.123
    optimizer = make_optimizer(optimizer_type, module=module, lr=lr)
    assert isinstance(optimizer, optimizer_type)
    assert optimizer.defaults['lr'] == lr

def test_tf_make_optimizer_raise_value_error(self):
    """Test make_optimizer raises value error."""
    lr = 0.123
    optimizer_type = (tf.compat.v1.train.AdamOptimizer, {
        'learning_rate': lr
    })
    with pytest.raises(ValueError):
        _ = make_optimizer(optimizer_type, learning_rate=lr)

def __init__(self,
             env_spec,
             policy,
             baseline,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             epsilon=0.5,
             l2_reg_dual=0.,
             l2_reg_loss=0.,
             optimizer=LbfgsOptimizer,
             optimizer_args=None,
             dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
             dual_optimizer_args=None,
             name='REPS'):
    optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
    dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

    self.policy = policy
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon

    self._name = name
    self._name_scope = tf.name_scope(self._name)
    self._old_policy = policy.clone('old_policy')
    self._old_policy.parameters = self.policy.parameters

    self._feat_diff = None
    self._param_eta = None
    self._param_v = None
    self._f_dual = None
    self._f_dual_grad = None
    self._f_policy_kl = None
    self._policy_network = None
    self._old_policy_network = None

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._dual_optimizer = dual_optimizer
    self._dual_optimizer_args = dual_optimizer_args
    self._epsilon = float(epsilon)
    self._l2_reg_dual = float(l2_reg_dual)
    self._l2_reg_loss = float(l2_reg_loss)

    self._episode_reward_mean = collections.deque(maxlen=100)

    self.sampler_cls = RaySampler

    self.init_opt()

def __init__(self,
             env_spec,
             num_seq_inputs=1,
             name='ContinuousMLPBaseline',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             normalize_inputs=True):
    self._env_spec = env_spec
    self._normalize_inputs = normalize_inputs
    self._name = name

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        self._optimizer = make_optimizer(LbfgsOptimizer, **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(input_shape=(env_spec.observation_space.flat_dim *
                                  num_seq_inputs, ),
                     output_dim=1,
                     name=name,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init)

    self._x_mean = None
    self._x_std = None
    self._y_hat = None

    self._initialize()

def __init__(self,
             input_shape,
             output_dim,
             name='ContinuousMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             normalize_inputs=True):
    super().__init__(input_shape, output_dim, name)
    self._normalize_inputs = normalize_inputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer is None:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

    self.model = NormalizedInputMLPModel(
        input_shape=input_shape,
        output_dim=output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init)

    self._network = None

    self._initialize()

def test_tf_make_optimizer_with_tuple(self):
    """Test make_optimizer function with tuple as first argument."""
    lr = 0.123
    optimizer_type = (tf.compat.v1.train.AdamOptimizer, {
        'learning_rate': lr
    })
    optimizer = make_optimizer(optimizer_type)
    # pylint: disable=isinstance-second-argument-not-valid-type
    assert isinstance(optimizer, optimizer_type)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    # Adam holds the value of learning rate in private variable self._lr
    assert np.allclose(optimizer._lr, lr)

def test_tf_make_optimizer_with_type(self):
    """Test make_optimizer function with type as first argument."""
    optimizer_type = tf.compat.v1.train.AdamOptimizer
    lr = 0.123
    optimizer = make_optimizer(optimizer_type,
                               learning_rate=lr,
                               name='testOptimizer')
    assert isinstance(optimizer, optimizer_type)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    assert optimizer._name == 'testOptimizer'
    # Adam holds the value of learning rate in private variable self._lr
    assert np.allclose(optimizer._lr, lr)

def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        sampler=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
):
    self._source = source
    self.learner = learner
    self._optimizer = make_optimizer(policy_optimizer,
                                     module=self.learner,
                                     lr=policy_lr)
    if loss not in ('log_prob', 'mse'):
        raise ValueError('Loss should be either "log_prob" or "mse".')
    self._loss = loss
    self._minibatches_per_epoch = minibatches_per_epoch
    self._eval_env = None
    self._batch_size = batch_size
    self._name = name

    # For plotting
    self.policy = self.learner

    # Public fields for sampling.
    self._env_spec = env_spec
    self.exploration_policy = None
    self.max_episode_length = env_spec.max_episode_length
    self._sampler = sampler
    if isinstance(self._source, Policy):
        self.exploration_policy = self._source
        self._source = source
        if not isinstance(self._sampler, Sampler):
            raise TypeError('Source is a policy. Missing a sampler.')
    else:
        self._source = itertools.cycle(iter(source))

def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        max_path_length=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
):
    self._source = source
    self.learner = learner
    self._optimizer = make_optimizer(policy_optimizer,
                                     module=self.learner,
                                     lr=policy_lr)
    if loss not in ('log_prob', 'mse'):
        raise ValueError('Loss should be either "log_prob" or "mse".')
    self._loss = loss
    self._minibatches_per_epoch = minibatches_per_epoch
    self._eval_env = None
    self._batch_size = batch_size
    self._name = name

    # Public fields for sampling.
    self.env_spec = env_spec
    self.policy = None
    self.max_path_length = max_path_length
    self.sampler_cls = None
    if isinstance(self._source, Policy):
        if max_path_length is None:
            raise ValueError('max_path_length must be passed if the '
                             'source is a policy')
        self.policy = self._source
        self.sampler_cls = RaySampler
        self._source = source
    else:
        self._source = itertools.cycle(iter(source))

def init_opt(self):
    """Initialize the networks and Ops.

    Assume discrete space for dqn, so action dimension
    will always be action_space.n
    """
    action_dim = self.env_spec.action_space.n

    # build q networks
    with tf.name_scope(self._name):
        action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                               None,
                                               name='action')
        reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                               None,
                                               name='reward')
        done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

        with tf.name_scope('update_ops'):
            target_update_op = tensor_utils.get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars())

        self._qf_update_ops = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('td_error'):
            # Q-value of the selected action
            action = tf.one_hot(action_t_ph,
                                action_dim,
                                on_value=1.,
                                off_value=0.)
            q_selected = tf.reduce_sum(
                self._qf.q_vals * action,  # yapf: disable
                axis=1)

            # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
            if self._double_q:
                target_qval_with_online_q = self._qf.build(
                    self._target_qf.input, self._qf.name)
                future_best_q_val_action = tf.argmax(
                    target_qval_with_online_q, 1)
                future_best_q_val = tf.reduce_sum(
                    self._target_qf.q_vals *
                    tf.one_hot(future_best_q_val_action,
                               action_dim,
                               on_value=1.,
                               off_value=0.),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                  axis=1)

            q_best_masked = (1.0 - done_t_ph) * future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            target_q_values = (reward_t_ph +
                               self._discount * q_best_masked)

            # td_error = q_selected - tf.stop_gradient(target_q_values)
            loss = tf.compat.v1.losses.huber_loss(
                q_selected, tf.stop_gradient(target_q_values))
            loss = tf.reduce_mean(loss)

        with tf.name_scope('optimize_ops'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr)
            if self._grad_norm_clipping is not None:
                gradients = qf_optimizer.compute_gradients(
                    loss, var_list=self._qf.get_trainable_vars())
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, self._grad_norm_clipping), var)
                optimize_loss = qf_optimizer.apply_gradients(gradients)
            else:
                optimize_loss = qf_optimizer.minimize(
                    loss, var_list=self._qf.get_trainable_vars())

        self._train_qf = tensor_utils.compile_function(
            inputs=[
                self._qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
            outputs=[loss, optimize_loss])

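# A small numpy sketch of the double-DQN target built in init_opt above, with
# illustrative values only: the online network picks the argmax action, the
# target network evaluates it, and `done` masks out the bootstrap term.
import numpy as np

discount = 0.99
reward = np.array([1.0, 0.5])
done = np.array([0.0, 1.0])  # 1.0 marks a terminal transition
online_q_next = np.array([[0.2, 0.8], [0.6, 0.4]])  # Q(s', .) from online net
target_q_next = np.array([[0.1, 0.9], [0.7, 0.3]])  # Q'(s', .) from target net

best_action = online_q_next.argmax(axis=1)  # argmax_a Q(s', a)
future_best_q_val = target_q_next[np.arange(len(reward)), best_action]
q_best_masked = (1.0 - done) * future_best_q_val
target_q_values = reward + discount * q_best_masked  # approx. [1.891, 0.5]
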
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        max_path_length,
        steps_per_epoch=20,
        n_train_steps=50,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True):
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._max_action = action_bound if max_action is None else max_action
    self._steps_per_epoch = steps_per_epoch

    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    self._policy = policy
    self._qf = qf
    self._n_train_steps = n_train_steps
    self._min_buffer_size = min_buffer_size
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale
    self._smooth_return = smooth_return
    self.max_path_length = max_path_length
    self._max_eval_path_length = max_eval_path_length

    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy

    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self._qf)
    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self._qf,
                                        lr=qf_lr)
    self._eval_env = None

    self.sampler_cls = LocalSampler

def __init__(self,
             env_spec,
             num_seq_inputs=1,
             name='GaussianMLPBaseline',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.0):
    self._env_spec = env_spec
    self._num_seq_inputs = num_seq_inputs
    self._use_trust_region = use_trust_region
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._subsample_factor = subsample_factor

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        if use_trust_region:
            self._optimizer = make_optimizer(PenaltyLbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(name=name,
                     input_shape=(env_spec.observation_space.flat_dim *
                                  num_seq_inputs, ),
                     output_dim=1,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=None,
                     max_std=None,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_nonlinearity,
                     std_output_nonlinearity=None,
                     std_parameterization='exp',
                     layer_normalization=layer_normalization)

    # model for old distribution, used when trust region is on
    self._old_model = self.clone_model(name=name + '_old_model')
    self._x_mean = None
    self._x_std = None
    self._y_mean = None
    self._y_std = None
    self._old_network = None

    self._initialize()

def __init__(self,
             env_spec,
             policy,
             baseline,
             scope=None,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             pg_loss='surrogate',
             lr_clip_range=0.01,
             max_kl_step=0.01,
             optimizer=None,
             optimizer_args=None,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             use_neg_logli_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             flatten_input=True,
             name='NPO'):
    self.policy = policy
    self.scope = scope
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon
    self._flatten_input = flatten_input
    self._name = name
    self._name_scope = tf.name_scope(self._name)
    self._old_policy = policy.clone('old_policy')
    self._use_softplus_entropy = use_softplus_entropy
    self._use_neg_logli_entropy = use_neg_logli_entropy
    self._stop_entropy_gradient = stop_entropy_gradient
    self._pg_loss = pg_loss

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        optimizer = LbfgsOptimizer

    self._check_entropy_configuration(entropy_method, center_adv,
                                      stop_entropy_gradient,
                                      use_neg_logli_entropy,
                                      policy_ent_coeff)

    if pg_loss not in ['vanilla', 'surrogate', 'surrogate_clip']:
        raise ValueError('Invalid pg_loss')

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._lr_clip_range = float(lr_clip_range)
    self._max_kl_step = float(max_kl_step)
    self._policy_ent_coeff = float(policy_ent_coeff)

    self._f_rewards = None
    self._f_returns = None
    self._f_policy_kl = None
    self._f_policy_entropy = None

    self._episode_reward_mean = collections.deque(maxlen=100)

    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler

    self.init_opt()

def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        sampler,
        exploration_policy=None,
        eval_env=None,
        double_q=True,
        qf_optimizer=torch.optim.Adam,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_episode_length_eval=None,
        deterministic_eval=False,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        num_eval_episodes=10,
        discount=0.99,
        qf_lr=_Default(1e-3),
        clip_rewards=None,
        clip_gradient=10,
        target_update_freq=5,
        reward_scale=1.):
    self._clip_reward = clip_rewards
    self._clip_grad = clip_gradient
    self._steps_per_epoch = steps_per_epoch
    self._target_update_freq = target_update_freq
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    self._policy = policy
    self._qf = qf
    self._n_train_steps = n_train_steps
    self._min_buffer_size = min_buffer_size
    self._buffer_batch_size = buffer_batch_size
    self._double_q = double_q
    self._discount = discount
    self._reward_scale = reward_scale
    self.max_episode_length = env_spec.max_episode_length
    self._max_episode_length_eval = (max_episode_length_eval
                                     or self.max_episode_length)
    self._episode_reward_mean = collections.deque(maxlen=100)
    self._num_eval_episodes = num_eval_episodes
    self._deterministic_eval = deterministic_eval

    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy

    self._target_qf = copy.deepcopy(self._qf)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self._qf,
                                        lr=qf_lr)
    self._eval_env = eval_env
    self._sampler = sampler

def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy and qf network
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            input_y = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)

        # Set up target init and update function
        with tf.name_scope('setup_target'):
            ops = get_target_ops(self.policy.get_global_vars(),
                                 self._target_policy.get_global_vars(),
                                 self._tau)
            policy_init_ops, policy_update_ops = ops
            qf_init_ops, qf_update_ops = get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            target_init_op = policy_init_ops + qf_init_ops
            target_update_op = policy_update_ops + qf_update_ops

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self._qf.build(obs,
                                   next_action,
                                   name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)
            if self._policy_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._policy_weight_decay)
                for var in self.policy.get_regularizable_vars():
                    policy_reg = regularizer(var)
                    action_loss += policy_reg

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self._qf.build(obs, actions, name='q_value')
        with tf.name_scope('qval_loss'):
            qval_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(input_y, qval))
            if self._qf_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._qf_weight_decay)
                for var in self._qf.get_regularizable_vars():
                    qf_reg = regularizer(var)
                    qval_loss += qf_reg

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval_loss, var_list=self._qf.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[input_y, obs, actions],
            outputs=[qf_train_op, qval_loss, qval])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target

def test_torch_make_optimizer_raise_value_error(self):
    """Test make_optimizer raises value error."""
    optimizer_type = (torch.optim.Adam, {'lr': 0.1})
    module = torch.nn.Linear(2, 1)
    with pytest.raises(ValueError):
        _ = make_optimizer(optimizer_type, module=module, lr=0.123)

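# A minimal sketch of the contract the make_optimizer tests above imply: the
# first argument is either an optimizer type (extra keyword arguments allowed)
# or a (type, kwargs) tuple (extra keyword arguments rejected with ValueError),
# and for torch optimizers `module` supplies the parameters. This illustrates
# the inferred behaviour only, not garage's implementation; it also ignores the
# _Default wrapping used elsewhere in these snippets.
import torch


def make_optimizer_sketch(optimizer_type, module=None, **kwargs):
    """Instantiate an optimizer from a type or a (type, kwargs) tuple."""
    if isinstance(optimizer_type, tuple):
        if kwargs:
            raise ValueError('Mixing a (type, kwargs) tuple with extra '
                             'keyword arguments is ambiguous.')
        optimizer_cls, optimizer_kwargs = optimizer_type
    else:
        optimizer_cls, optimizer_kwargs = optimizer_type, kwargs
    if module is not None:
        # torch optimizers take an iterable of parameters as first argument.
        return optimizer_cls(module.parameters(), **optimizer_kwargs)
    return optimizer_cls(**optimizer_kwargs)


# Usage mirroring test_torch_make_optimizer_with_type above.
opt = make_optimizer_sketch(torch.optim.Adam,
                            module=torch.nn.Linear(2, 1),
                            lr=0.123)
assert opt.defaults['lr'] == 0.123
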
def __init__(
        self,
        env_spec,
        policy,
        qf1,
        qf2,
        replay_buffer,
        *,  # Everything after this is numbers.
        max_episode_length_eval=None,
        grad_steps_per_env_step,
        exploration_policy,
        uniform_random_policy=None,
        max_action=None,
        target_update_tau=0.005,
        discount=0.99,
        reward_scaling=1.,
        update_actor_interval=2,
        buffer_batch_size=64,
        replay_buffer_size=1e6,
        min_buffer_size=1e4,
        exploration_noise=0.1,
        policy_noise=0.2,
        policy_noise_clip=0.5,
        clip_return=np.inf,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        num_evaluation_episodes=10,
        steps_per_epoch=20,
        start_steps=10000,
        update_after=1000,
        use_deterministic_evaluation=False):
    self._env_spec = env_spec
    action_bound = self._env_spec.action_space.high[0]
    self._max_action = action_bound if max_action is None else max_action
    self._action_dim = self._env_spec.action_space.shape[0]
    self._tau = target_update_tau
    self._discount = discount
    self._reward_scaling = reward_scaling
    self._exploration_noise = exploration_noise
    self._policy_noise = policy_noise
    self._policy_noise_clip = policy_noise_clip
    self._clip_return = clip_return
    self._replay_buffer_size = replay_buffer_size
    self._min_buffer_size = min_buffer_size
    self._buffer_batch_size = buffer_batch_size
    self._grad_steps_per_env_step = grad_steps_per_env_step
    self._update_actor_interval = update_actor_interval
    self._steps_per_epoch = steps_per_epoch
    self._start_steps = start_steps
    self._update_after = update_after
    self._num_evaluation_episodes = num_evaluation_episodes
    self.max_episode_length = env_spec.max_episode_length
    self._max_episode_length_eval = env_spec.max_episode_length
    if max_episode_length_eval is not None:
        self._max_episode_length_eval = max_episode_length_eval
    self._use_deterministic_evaluation = use_deterministic_evaluation

    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []
    self._eval_env = None
    self.exploration_policy = exploration_policy
    self._uniform_random_policy = uniform_random_policy
    self.worker_cls = FragmentWorker
    self.sampler_cls = LocalSampler

    self._replay_buffer = replay_buffer
    self.policy = policy
    self._qf_1 = qf1
    self._qf_2 = qf2
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf_1 = copy.deepcopy(self._qf_1)
    self._target_qf_2 = copy.deepcopy(self._qf_2)

    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer_1 = make_optimizer(qf_optimizer,
                                          module=self._qf_1,
                                          lr=qf_lr)
    self._qf_optimizer_2 = make_optimizer(qf_optimizer,
                                          module=self._qf_2,
                                          lr=qf_lr)
    self._actor_loss = torch.zeros(1)

def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy (actor) and qf (critic) networks
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            y = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
        target_qf2_outputs = self._target_qf2.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)
        self._target_qf2_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf2_outputs)

        # Set up target init and update functions
        with tf.name_scope('setup_target'):
            policy_init_op, policy_update_op = get_target_ops(
                self.policy.get_global_vars(),
                self._target_policy.get_global_vars(), self._tau)
            qf_init_ops, qf_update_ops = get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            qf2_init_ops, qf2_update_ops = get_target_ops(
                self.qf2.get_global_vars(),
                self._target_qf2.get_global_vars(), self._tau)
            target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
            target_update_op = (policy_update_op + qf_update_ops +
                                qf2_update_ops)

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self.qf.build(obs,
                                  next_action,
                                  name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self.qf.build(obs, actions, name='q_value')
        q2val = self.qf2.build(obs, actions, name='q2_value')
        with tf.name_scope('qval1_loss'):
            qval1_loss = tf.reduce_mean(
                tf.math.squared_difference(y, qval))
        with tf.name_scope('qval2_loss'):
            qval2_loss = tf.reduce_mean(
                tf.math.squared_difference(y, q2val))

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval1_loss, var_list=self.qf.get_trainable_vars())
            qf2_train_op = qf_optimizer.minimize(
                qval2_loss, var_list=self.qf2.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf_train_op, qval1_loss, qval])
        f_train_qf2 = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf2_train_op, qval2_loss, q2val])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target
        self._f_train_qf2 = f_train_qf2

def __init__(self,
             input_shape,
             output_dim,
             name='GaussianMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.0):
    super().__init__(input_shape, output_dim, name)
    self._use_trust_region = use_trust_region
    self._subsample_factor = subsample_factor
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer is None:
            if use_trust_region:
                self._optimizer = make_optimizer(PenaltyLbfgsOptimizer,
                                                 **optimizer_args)
            else:
                self._optimizer = make_optimizer(LbfgsOptimizer,
                                                 **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

    self.model = GaussianMLPRegressorModel(
        input_shape=input_shape,
        output_dim=self._output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        learn_std=learn_std,
        adaptive_std=adaptive_std,
        std_share_network=std_share_network,
        init_std=init_std,
        min_std=None,
        max_std=None,
        std_hidden_sizes=std_hidden_sizes,
        std_hidden_nonlinearity=std_nonlinearity,
        std_output_nonlinearity=None,
        std_parameterization='exp',
        layer_normalization=layer_normalization)

    # model for old distribution, used when trust region is on
    self._old_model = self.model.clone(name='model_for_old_dist')

    self._initialize()

def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_path_length=None,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        rollout_batch_size=1,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True):
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._max_action = action_bound if max_action is None else max_action

    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     qf=qf,
                     n_train_steps=n_train_steps,
                     steps_per_epoch=steps_per_epoch,
                     max_path_length=max_path_length,
                     max_eval_path_length=max_eval_path_length,
                     buffer_batch_size=buffer_batch_size,
                     min_buffer_size=min_buffer_size,
                     rollout_batch_size=rollout_batch_size,
                     exploration_policy=exploration_policy,
                     replay_buffer=replay_buffer,
                     use_target=True,
                     discount=discount,
                     reward_scale=reward_scale,
                     smooth_return=smooth_return)

    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self.qf)
    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self.qf,
                                        lr=qf_lr)

def __init__(self,
             env_spec,
             filters,
             strides,
             padding,
             hidden_sizes,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             name='GaussianCNNBaseline',
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_filters=(),
             std_strides=(),
             std_padding='SAME',
             std_hidden_sizes=(),
             std_hidden_nonlinearity=None,
             std_output_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.,
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01):

    if not isinstance(env_spec.observation_space, akro.Box) or \
            not len(env_spec.observation_space.shape) in (2, 3):
        raise ValueError(
            '{} can only process 2D, 3D akro.Image or'
            ' akro.Box observations, but received an env_spec with '
            'observation_space of type {} and shape {}'.format(
                type(self).__name__,
                type(env_spec.observation_space).__name__,
                env_spec.observation_space.shape))

    self._env_spec = env_spec
    self._use_trust_region = use_trust_region
    self._subsample_factor = subsample_factor
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        if use_trust_region:
            self._optimizer = make_optimizer(PenaltyLbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(input_shape=env_spec.observation_space.shape,
                     output_dim=1,
                     filters=filters,
                     strides=strides,
                     padding=padding,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=None,
                     max_std=None,
                     std_filters=std_filters,
                     std_strides=std_strides,
                     std_padding=std_padding,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_hidden_nonlinearity,
                     std_output_nonlinearity=std_output_nonlinearity,
                     std_parameterization='exp',
                     layer_normalization=layer_normalization,
                     name=name)

    # model for old distribution, used when trust region is on
    self._old_model = self.clone_model(name=name + '_old_model')
    self._old_network = None
    self._x_mean = None
    self._x_std = None
    self._y_mean = None
    self._y_std = None

    self._initialize()

def __init__(self,
             input_shape,
             output_dim,
             name='CategoricalMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             tr_optimizer=None,
             tr_optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             normalize_inputs=True,
             layer_normalization=False):
    super().__init__(input_shape, output_dim, name)
    self._use_trust_region = use_trust_region
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if tr_optimizer_args is None:
            tr_optimizer_args = dict()

        if optimizer is None:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

        if tr_optimizer is None:
            self._tr_optimizer = make_optimizer(ConjugateGradientOptimizer,
                                                **tr_optimizer_args)
        else:
            self._tr_optimizer = make_optimizer(tr_optimizer,
                                                **tr_optimizer_args)
        self._first_optimized = False

    self.model = CategoricalMLPRegressorModel(
        input_shape,
        output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        layer_normalization=layer_normalization)

    # model for old distribution, used when trust region is on
    self._old_model = self.model.clone(name='model_for_old_dist')

    self._network = None
    self._old_network = None

    self._initialize()