def new_tensor_variable(self, name, extra_dims):
    """
    Create a tensor variable in Theano.

    :param name: name of the variable
    :param extra_dims: extra dimensions to be prepended
    :return: the created tensor variable
    """
    return tensor_utils.new_tensor(
        name=name, ndim=extra_dims + 1, dtype=theano.config.floatX)
def new_tensor_variable(self, name, extra_dims):
    """
    Create a tensor variable in Theano.

    :param name: name of the variable
    :param extra_dims: extra dimensions to be prepended
    :return: the created tensor variable
    """
    if self.n <= 2**8:
        return tensor_utils.new_tensor(
            name=name, ndim=extra_dims + 1, dtype='uint8')
    elif self.n <= 2**16:
        return tensor_utils.new_tensor(
            name=name, ndim=extra_dims + 1, dtype='uint16')
    else:
        return tensor_utils.new_tensor(
            name=name, ndim=extra_dims + 1, dtype='uint32')
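# The two new_tensor_variable implementations above differ only in dtype: the
# Box variant uses theano.config.floatX, while the Discrete variant picks the
# smallest unsigned integer type that can index all n values. A minimal,
# self-contained sketch of that dtype selection (the helper name is
# hypothetical, not part of garage):
import numpy as np


def smallest_uint_dtype(n):
    """Smallest unsigned integer dtype that can represent values in [0, n)."""
    if n <= 2**8:
        return 'uint8'
    elif n <= 2**16:
        return 'uint16'
    return 'uint32'


assert smallest_uint_dtype(256) == 'uint8'     # 0..255 fits in one byte
assert smallest_uint_dtype(1000) == 'uint16'
assert np.dtype(smallest_uint_dtype(70000)) == np.uint32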
def init_opt(self):
    observations_var = self.env.observation_space.new_tensor_variable(
        'observations', extra_dims=1)
    actions_var = self.env.action_space.new_tensor_variable(
        'actions', extra_dims=1)
    advantages_var = tensor_utils.new_tensor(
        'advantage', ndim=1, dtype=theano.config.floatX)

    dist = self.policy.distribution
    dist_info_vars = self.policy.dist_info_sym(observations_var)
    old_dist_info_vars = self.backup_policy.dist_info_sym(observations_var)

    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    mean_kl = TT.mean(kl)
    max_kl = TT.max(kl)

    pos_eps_dist_info_vars = self.pos_eps_policy.dist_info_sym(
        observations_var)
    neg_eps_dist_info_vars = self.neg_eps_policy.dist_info_sym(
        observations_var)
    mix_dist_info_vars = self.mix_policy.dist_info_sym(observations_var)

    surr = TT.sum(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)
    surr_pos_eps = TT.sum(
        dist.log_likelihood_sym(actions_var, pos_eps_dist_info_vars) *
        advantages_var)
    surr_neg_eps = TT.sum(
        dist.log_likelihood_sym(actions_var, neg_eps_dist_info_vars) *
        advantages_var)
    surr_mix = TT.sum(
        dist.log_likelihood_sym(actions_var, mix_dist_info_vars) *
        advantages_var)
    surr_loglikelihood = TT.sum(
        dist.log_likelihood_sym(actions_var, mix_dist_info_vars))

    params = self.policy.get_params(trainable=True)
    mix_params = self.mix_policy.get_params(trainable=True)
    pos_eps_params = self.pos_eps_policy.get_params(trainable=True)
    neg_eps_params = self.neg_eps_policy.get_params(trainable=True)
    backup_params = self.backup_policy.get_params(trainable=True)

    grads = theano.grad(surr, params)
    grad_pos_eps = theano.grad(surr_pos_eps, pos_eps_params)
    grad_neg_eps = theano.grad(surr_neg_eps, neg_eps_params)
    grad_mix = theano.grad(surr_mix, mix_params)
    grad_mix_lh = theano.grad(surr_loglikelihood, mix_params)

    self.f_surr = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=surr)
    self.f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=grads)
    self.f_pos_grad = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=grad_pos_eps)
    self.f_neg_grad = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=grad_neg_eps)
    self.f_mix_grad = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=grad_mix)
    self.f_mix_lh = theano.function(
        inputs=[observations_var, actions_var],
        outputs=grad_mix_lh)

    # self.f_update = theano.function(
    #     inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5,
    #             eval_grad6, eval_grad7],
    #     outputs=None,
    #     updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4,
    #                  eval_grad5, eval_grad6, eval_grad7], params,
    #                 learning_rate=self.learning_rate)
    # )

    self.f_kl = tensor_utils.compile_function(
        inputs=[observations_var],
        outputs=[mean_kl, max_kl],
    )

    return dict()
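# The commented-out f_update above hints at feeding the numeric gradients
# returned by f_train back in as inputs and applying an SGD step to the policy
# parameters. A minimal, self-contained sketch of that pattern, assuming the
# sgd helper is lasagne.updates.sgd (which accepts a list of gradient
# expressions in place of a loss); all names here are illustrative:
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import sgd

w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
grad_in = TT.vector('grad_in')  # gradient computed elsewhere (e.g. by f_train)
f_update = theano.function(
    inputs=[grad_in],
    outputs=None,
    updates=sgd([grad_in], [w], learning_rate=0.01))

f_update(np.ones(3, dtype=theano.config.floatX))  # w <- w - 0.01 * grad_in
print(w.get_value())  # [-0.01, -0.01, -0.01]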
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)

    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tensor_utils.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    state_info_vars = {
        k: tensor_utils.new_tensor(
            k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.truncate_local_is_ratio is not None:
        lr = TT.minimum(self.truncate_local_is_ratio, lr)

    if is_recurrent:
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(
            lr * advantage_var * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(kl)
        surr_loss = -TT.mean(lr * advantage_var)

    input_list = [
        obs_var,
        action_var,
        advantage_var,
    ] + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl")

    return dict()
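# A small standalone sketch of the truncated importance-ratio surrogate built
# above. The likelihood ratio is formed from new/old log-likelihood vectors
# (dist.likelihood_ratio_sym does this symbolically per distribution); the
# variable names below are illustrative only:
import numpy as np
import theano
import theano.tensor as TT

new_logli = TT.vector('new_logli')
old_logli = TT.vector('old_logli')
advantage = TT.vector('advantage')
truncate_local_is_ratio = 2.0  # cap on the per-sample importance weight

lr = TT.exp(new_logli - old_logli)            # pi_new(a|s) / pi_old(a|s)
lr = TT.minimum(truncate_local_is_ratio, lr)  # clip large importance weights
surr_loss = -TT.mean(lr * advantage)          # minimized; gradient ascends return

f_surr = theano.function([new_logli, old_logli, advantage], surr_loss)
adv = np.array([1.0, -0.5, 2.0], dtype=theano.config.floatX)
print(f_surr(adv * 0, adv * 0, adv))          # ratio == 1 everywhere -> -mean(advantage)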
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    # Init dual param values
    self.param_eta = 15.
    # Adjust for linear feature vector.
    self.param_v = np.random.rand(
        self.env.observation_space.flat_dim * 2 + 4)

    # Theano vars
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    rewards = theano_tensor_utils.new_tensor(
        'rewards',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX,
    )
    # Feature difference variable representing the difference in feature
    # value of the next observation and the current observation
    # \phi(s') - \phi(s).
    feat_diff = theano_tensor_utils.new_tensor(
        'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)
    param_v = TT.vector('param_v')
    param_eta = TT.scalar('eta')
    valid_var = TT.matrix('valid')

    state_info_vars = {
        k: theano_tensor_utils.new_tensor(
            k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    # Policy-related symbolics
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    dist = self.policy.distribution
    # log of the policy dist
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)

    # Symbolic sample Bellman error
    delta_v = rewards + TT.dot(feat_diff, param_v)

    # Policy loss (negative because we minimize)
    if is_recurrent:
        loss = -TT.sum(
            logli * TT.exp(delta_v / param_eta -
                           TT.max(delta_v / param_eta)) *
            valid_var) / TT.sum(valid_var)
    else:
        loss = -TT.mean(
            logli * TT.exp(delta_v / param_eta - TT.max(delta_v / param_eta)))

    # Add regularization to loss.
    reg_params = self.policy.get_params(regularizable=True)
    loss += self.L2_reg_loss * TT.sum(
        [TT.mean(TT.square(param)) for param in reg_params]) / len(reg_params)

    # Policy loss gradient.
    loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

    if is_recurrent:
        recurrent_vars = [valid_var]
    else:
        recurrent_vars = []

    input = [
        rewards, obs_var, feat_diff, action_var
    ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
    # if is_recurrent:
    #     input +=
    f_loss = theano_tensor_utils.compile_function(
        inputs=input,
        outputs=loss,
    )
    f_loss_grad = theano_tensor_utils.compile_function(
        inputs=input,
        outputs=loss_grad,
    )

    # Debug prints
    old_dist_info_vars = {
        k: theano_tensor_utils.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    if is_recurrent:
        mean_kl = TT.sum(
            dist.kl_sym(old_dist_info_vars, dist_info_vars) *
            valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

    f_kl = theano_tensor_utils.compile_function(
        inputs=[obs_var, action_var] + state_info_vars_list +
        old_dist_info_vars_list + recurrent_vars,
        outputs=mean_kl,
    )

    # Dual-related symbolics
    # Symbolic dual
    if is_recurrent:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.sum(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    ) * valid_var
                ) / TT.sum(valid_var)
            ) + param_eta * TT.max(delta_v / param_eta)
    else:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.mean(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    )
                )
            ) + param_eta * TT.max(delta_v / param_eta)
    # Add L2 regularization.
    dual += self.L2_reg_dual * \
        (TT.square(param_eta) + TT.square(1 / param_eta))

    # Symbolic dual gradient
    dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

    # Eval functions.
    f_dual = theano_tensor_utils.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars +
        [param_eta, param_v],
        outputs=dual)
    f_dual_grad = theano_tensor_utils.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars +
        [param_eta, param_v],
        outputs=dual_grad)

    self.opt_info = dict(
        f_loss_grad=f_loss_grad,
        f_loss=f_loss,
        f_dual=f_dual,
        f_dual_grad=f_dual_grad,
        f_kl=f_kl)
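# In REPS, the compiled f_dual / f_dual_grad above are typically handed to a
# bound-constrained optimizer over (eta, v), keeping eta positive. A
# self-contained sketch of that outer step using a stand-in dual over eta only
# (the real code would call f_dual with the sampled rewards and feature
# differences); names and values here are illustrative:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

epsilon = 0.1
delta_v = np.random.randn(100)  # stand-in for rewards + feat_diff.dot(v)


def dual(x):
    eta = x[0]
    # stand-in for f_dual(rewards, feat_diff, ..., eta, v)
    max_d = np.max(delta_v / eta)
    return eta * epsilon + eta * np.log(
        np.mean(np.exp(delta_v / eta - max_d))) + eta * max_d


def dual_grad(x):
    # finite differences keep this sketch short; the real code uses f_dual_grad
    eps = 1e-6
    return np.array([(dual(x + eps) - dual(x - eps)) / (2 * eps)])


x0 = np.array([15.0])             # initial eta, as in init_opt
bounds = [(1e-12, None)]          # eta must stay positive
eta_opt, dual_val, _ = fmin_l_bfgs_b(dual, x0, fprime=dual_grad, bounds=bounds)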
# NOTE: the garage import paths below (other than run_experiment) are assumed
# and may differ across garage versions.
import theano
import theano.tensor as TT

from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.misc.instrument import run_experiment
from garage.theano.envs import TheanoEnv
from garage.theano.misc import tensor_utils
from garage.theano.policies import GaussianMLPPolicy

env_name = "Swimmer"
hidden_sizes = (32, 32)

env = TheanoEnv(normalize(SwimmerEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
rewards_var = tensor_utils.new_tensor(
    'rewards', ndim=1, dtype=theano.config.floatX)

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)
old_dist_info_vars = backup_policy.dist_info_sym(observations_var)

kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
mean_kl = TT.mean(kl)
max_kl = TT.max(kl)

# for test
surr_ll = dist.log_likelihood_sym(actions_var, dist_info_vars)
surr_ll_cumsum = dist.log_likelihood_sym_cumsum(actions_var, dist_info_vars)
surr = TT.sum(surr_ll_cumsum * rewards_var)

f_surr_ll = theano.function(
    inputs=[observations_var, actions_var],
    outputs=surr_ll)  # assumed completion of a call that was cut off here
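# Hedged usage sketch for the compiled function above: evaluate the per-step
# log-likelihood on a small batch of (observation, action) pairs. The shapes
# rely on the spaces' flat_dim attributes, and the zero arrays are placeholders
# for sampled rollout data.
import numpy as np

obs = np.zeros((5, env.observation_space.flat_dim), dtype=theano.config.floatX)
acts = np.zeros((5, env.action_space.flat_dim), dtype=theano.config.floatX)
per_step_logli = f_surr_ll(obs, acts)  # one log-likelihood per time step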
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)

    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tensor_utils.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    state_info_vars = {
        k: tensor_utils.new_tensor(
            k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # Formulate as a minimization problem.
    # The gradient of the surrogate objective is the policy gradient.
    if is_recurrent:
        surr_obj = -TT.sum(
            logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = -TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(
        surr_obj, target=self.policy, inputs=input_list)

    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl)
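# The comment above notes that the gradient of the surrogate
# -mean(logli * advantage) is the policy gradient, because the advantage enters
# only as a constant weight on the log-likelihood. A self-contained
# illustration with a one-parameter Gaussian policy (names are illustrative,
# not garage API):
import numpy as np
import theano
import theano.tensor as TT

theta = theano.shared(np.asarray(0.0, dtype=theano.config.floatX), name='theta')
actions = TT.vector('actions')
advantages = TT.vector('advantages')

# log N(a | theta, 1), up to an additive constant
logli = -0.5 * TT.sqr(actions - theta)
surr_obj = -TT.mean(logli * advantages)
policy_grad = theano.grad(surr_obj, theta)   # = -mean((a - theta) * A)

f_grad = theano.function([actions, advantages], policy_grad)
a = np.array([1.0, -1.0], dtype=theano.config.floatX)
adv = np.array([1.0, 0.0], dtype=theano.config.floatX)
print(f_grad(a, adv))  # -0.5: descent on surr_obj moves theta toward the
                       # positively-advantaged action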