def init_opt(self):
    self.start_time = time.time()
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k,
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        )
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    if is_recurrent:
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(kl)
        surr_loss = -TT.mean(lr * advantage_var)

    input_list = [
        obs_var,
        action_var,
        advantage_var,
    ] + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl"
    )
    return dict()
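# A minimal numeric sketch (assumptions: diagonal-Gaussian policy, plain numpy; the
# helpers gaussian_log_likelihood / diag_gaussian_kl are illustrative, not rllab API)
# of the quantities the symbolic graph above builds: the likelihood ratio pi_new/pi_old,
# the surrogate loss -E[lr * A] that is minimized, and the mean KL that update_opt
# constrains to self.step_size.
import numpy as np

def gaussian_log_likelihood(actions, mean, log_std):
    # log N(a | mean, exp(log_std)^2), summed over action dimensions
    zs = (actions - mean) / np.exp(log_std)
    return (-0.5 * np.sum(zs ** 2, axis=-1)
            - np.sum(log_std, axis=-1)
            - 0.5 * actions.shape[-1] * np.log(2 * np.pi))

def diag_gaussian_kl(old_mean, old_log_std, new_mean, new_log_std):
    # KL(old || new) for diagonal Gaussians, summed over action dimensions
    old_var, new_var = np.exp(2 * old_log_std), np.exp(2 * new_log_std)
    return np.sum(new_log_std - old_log_std
                  + (old_var + (old_mean - new_mean) ** 2) / (2 * new_var)
                  - 0.5, axis=-1)

# toy batch: 4 timesteps, 2-D actions
actions = np.random.randn(4, 2)
adv = np.random.randn(4)
old = dict(mean=np.zeros((4, 2)), log_std=np.zeros((4, 2)))
new = dict(mean=0.1 * np.ones((4, 2)), log_std=-0.05 * np.ones((4, 2)))

lr = np.exp(gaussian_log_likelihood(actions, new['mean'], new['log_std'])
            - gaussian_log_likelihood(actions, old['mean'], old['log_std']))
surr_loss = -np.mean(lr * adv)  # minimized by the optimizer
mean_kl = np.mean(diag_gaussian_kl(old['mean'], old['log_std'],
                                   new['mean'], new['log_std']))  # constrained <= step_size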
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k,
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        )
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = -TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var]
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

    f_kl = ext.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(
        f_kl=f_kl,
    )
def __init__(self, env, policy, baseline, max_kl):
    """
    env = only structural info of env is used here; you need to pass
          the 'mode' to functions of this class
    max_kl = constraint for determining step-size (suggested: 1e-2 or 5e-3)
    """
    self.policy = policy
    self.env = env
    self.baseline = baseline
    self.optimizer = ConjugateGradientOptimizer(**dict())

    # Define symbolic variables
    self.observations_var = self.env.observation_space.new_tensor_variable(
        'observations', extra_dims=1)
    self.actions_var = self.env.action_space.new_tensor_variable(
        'actions', extra_dims=1)
    self.advantages_var = TT.vector('advantages')
    self.dist = self.policy.distribution
    self.old_dist_info_vars = {
        k: ext.new_tensor('old_%s' % k, ndim=2, dtype=theano.config.floatX)
        for k in self.dist.dist_info_keys
    }
    self.old_dist_info_vars_list = [
        self.old_dist_info_vars[k] for k in self.dist.dist_info_keys
    ]
    self.state_info_vars = {
        k: ext.new_tensor(k, ndim=2, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    self.state_info_vars_list = [
        self.state_info_vars[k] for k in self.policy.state_info_keys
    ]

    # distribution info variable (symbolic) -- interpret as pi
    self.dist_info_vars = self.policy.dist_info_sym(
        self.observations_var, self.state_info_vars)
    self.KL = self.dist.kl_sym(self.old_dist_info_vars, self.dist_info_vars)
    self.LR = self.dist.likelihood_ratio_sym(
        self.actions_var, self.old_dist_info_vars, self.dist_info_vars)
    self.mean_KL = TT.mean(self.KL)
    self.surr = -TT.mean(self.LR * self.advantages_var)

    self.input_list = [self.observations_var, self.actions_var, self.advantages_var] \
        + self.state_info_vars_list + self.old_dist_info_vars_list

    self.optimizer.update_opt(loss=self.surr, target=self.policy,
                              leq_constraint=(self.mean_KL, max_kl),
                              inputs=self.input_list, constraint_name="mean_kl")
def new_tensor_variable(self, name, extra_dims):
    if self.n <= 2 ** 8:
        return ext.new_tensor(name=name, ndim=extra_dims + 1, dtype='uint8')
    elif self.n <= 2 ** 16:
        return ext.new_tensor(name=name, ndim=extra_dims + 1, dtype='uint16')
    else:
        return ext.new_tensor(name=name, ndim=extra_dims + 1, dtype='uint32')
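# Hedged sketch of the dtype tiering above: pick the smallest unsigned integer type
# that can represent all self.n categories of the discrete space. `choose_dtype` is
# an illustrative stand-in, not a method of the original class.
def choose_dtype(n):
    if n <= 2 ** 8:
        return 'uint8'
    elif n <= 2 ** 16:
        return 'uint16'
    return 'uint32'

assert choose_dtype(10) == 'uint8'      # e.g. a 10-action discrete space
assert choose_dtype(70000) == 'uint32'  # beyond the uint16 range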
def __init__(self, env, policy, baseline, max_kl):
    """
    env = only structural info of env is used here; you need to pass
          the 'mode' to functions of this class
    max_kl = constraint for determining step-size (suggested: 1e-2 or 5e-3)
    """
    self.policy = policy
    self.env = env
    self.baseline = baseline
    self.optimizer = FirstOrderOptimizer(**dict())

    # Define symbolic variables
    self.observations_var = self.env.observation_space.new_tensor_variable(
        'observations', extra_dims=1)
    self.actions_var = self.env.action_space.new_tensor_variable(
        'actions', extra_dims=1)
    self.advantages_var = TT.vector('advantages')
    self.dist = self.policy.distribution
    self.old_dist_info_vars = {
        k: ext.new_tensor('old_%s' % k, ndim=2, dtype=theano.config.floatX)
        for k in self.dist.dist_info_keys
    }
    self.old_dist_info_vars_list = [
        self.old_dist_info_vars[k] for k in self.dist.dist_info_keys
    ]
    self.state_info_vars = {
        k: ext.new_tensor(k, ndim=2, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    self.state_info_vars_list = [
        self.state_info_vars[k] for k in self.policy.state_info_keys
    ]

    self.dist_info_vars = self.policy.dist_info_sym(
        self.observations_var, self.state_info_vars)
    self.logli = self.dist.log_likelihood_sym(self.actions_var, self.dist_info_vars)
    self.surr = -TT.mean(self.logli * self.advantages_var)

    self.input_list = [
        self.observations_var, self.actions_var, self.advantages_var
    ] + self.state_info_vars_list

    self.optimizer.update_opt(self.surr, target=self.policy, inputs=self.input_list)
def init_opt(self):
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )
    advantage_var = ext.new_tensor(
        'advantage',
        ndim=1,
        dtype=theano.config.floatX
    )
    mean_var = ext.new_tensor(
        'mean',
        ndim=2,
        dtype=theano.config.floatX
    )
    log_std_var = ext.new_tensor(
        'log_std',
        ndim=2,
        dtype=theano.config.floatX
    )

    old_dist_info_vars = dict(mean=mean_var, log_std=log_std_var)
    dist_info_vars = self.policy.dist_info_sym(obs_var)
    lr = self.policy.distribution.likelihood_ratio_sym(
        action_var, old_dist_info_vars, dist_info_vars)

    surr_loss_vector = TT.minimum(
        lr * advantage_var,
        TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
    surr_loss = -TT.mean(surr_loss_vector)

    input_list = [obs_var, action_var, advantage_var, mean_var, log_std_var]

    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        inputs=input_list
    )
    return dict()
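# Small numpy illustration (standalone, hedged) of the clipped surrogate defined above:
# per-sample objective min(lr * A, clip(lr, 1 - eps, 1 + eps) * A), negated so it can
# be minimized.
import numpy as np

epsilon = 0.2
lr = np.array([0.5, 1.0, 1.5, 3.0])     # likelihood ratios pi_new / pi_old
adv = np.array([1.0, -1.0, 2.0, 2.0])   # advantage estimates

unclipped = lr * adv
clipped = np.clip(lr, 1 - epsilon, 1 + epsilon) * adv
surr_loss = -np.mean(np.minimum(unclipped, clipped))
# The last two samples are capped at lr = 1.2, so a very large ratio cannot make the
# objective arbitrarily attractive.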
def new_tensor_variable(self, name, extra_dims):
    return ext.new_tensor(
        name=name,
        ndim=extra_dims + 1,
        dtype=self._common_dtype,
    )
def init_opt(self): obs_var = ext.new_tensor( 'obs', ndim=2, dtype=theano.config.floatX) # todo: check the dtype manager_obs_var = ext.new_tensor('manager_obs', ndim=2, dtype=theano.config.floatX) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) # this will have to be the advantage every time the manager makes a decision manager_advantage_var = ext.new_tensor('manager_advantage', ndim=1, dtype=theano.config.floatX) skill_advantage_var = ext.new_tensor('skill_advantage', ndim=1, dtype=theano.config.floatX) latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX) latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX) mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX) log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX) manager_prob_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX) assert isinstance(self.policy, HierarchicalPolicy) ############################################################# ### calculating the manager portion of the surrogate loss ### ############################################################# # i, j should contain the probability of latent j at time step self.period*i # should be a len(obs)//self.period by len(self.latent) tensor latent_probs = self.policy.manager.dist_info_sym( manager_obs_var)['prob'] # old_latent_probs = self.old_policy.manager.dist_info_sym(manager_obs_var)['prob'] actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1) old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse, axis=1) lr = TT.exp( TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs)) manager_surr_loss_vector = TT.minimum( lr * manager_advantage_var, TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * manager_advantage_var) manager_surr_loss = -TT.mean(manager_surr_loss_vector) ############################################################ ### calculating the skills portion of the surrogate loss ### ############################################################ dist_info_var = self.policy.low_policy.dist_info_sym( obs_var, state_info_var=latent_var) old_dist_info_var = dict(mean=mean_var, log_std=log_std_var) skill_lr = self.diagonal.likelihood_ratio_sym(action_var, old_dist_info_var, dist_info_var) skill_surr_loss_vector = TT.minimum( skill_lr * skill_advantage_var, TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * skill_advantage_var) skill_surr_loss = -TT.mean(skill_surr_loss_vector) surr_loss = manager_surr_loss / self.average_period + skill_surr_loss input_list = [ obs_var, manager_obs_var, action_var, manager_advantage_var, skill_advantage_var, latent_var, latent_var_sparse, mean_var, log_std_var, manager_prob_var ] self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list) return dict()
def init_opt(self): is_recurrent = int(self.policy.recurrent) # Init dual param values self.param_eta = 15. # Adjust for linear feature vector. self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4) # Theano vars obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) rewards = ext.new_tensor( 'rewards', ndim=1 + is_recurrent, dtype=theano.config.floatX, ) # Feature difference variable representing the difference in feature # value of the next observation and the current observation \phi(s') - # \phi(s). feat_diff = ext.new_tensor( 'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX ) param_v = TT.vector('param_v') param_eta = TT.scalar('eta') valid_var = TT.matrix('valid') state_info_vars = { k: ext.new_tensor( k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in self.policy.state_info_keys } state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] # Policy-related symbolics dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) dist = self.policy.distribution # log of the policy dist logli = dist.log_likelihood_sym(action_var, dist_info_vars) # Symbolic sample Bellman error delta_v = rewards + TT.dot(feat_diff, param_v) # Policy loss (negative because we minimize) if is_recurrent: loss = - TT.sum(logli * TT.exp( delta_v / param_eta - TT.max(delta_v / param_eta) ) * valid_var) / TT.sum(valid_var) else: loss = - TT.mean(logli * TT.exp( delta_v / param_eta - TT.max(delta_v / param_eta) )) # Add regularization to loss. reg_params = self.policy.get_params(regularizable=True) loss += self.L2_reg_loss * TT.sum( [TT.mean(TT.square(param)) for param in reg_params] ) / len(reg_params) # Policy loss gradient. loss_grad = TT.grad( loss, self.policy.get_params(trainable=True)) if is_recurrent: recurrent_vars = [valid_var] else: recurrent_vars = [] input = [rewards, obs_var, feat_diff, action_var] + state_info_vars_list + recurrent_vars + [param_eta, param_v] # if is_recurrent: # input += f_loss = ext.compile_function( inputs=input, outputs=loss, ) f_loss_grad = ext.compile_function( inputs=input, outputs=loss_grad, ) # Debug prints old_dist_info_vars = { k: ext.new_tensor( 'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in dist.dist_info_keys } old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] if is_recurrent: mean_kl = TT.sum(dist.kl_sym(old_dist_info_vars, dist_info_vars) * valid_var) / TT.sum(valid_var) else: mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars)) f_kl = ext.compile_function( inputs=[obs_var, action_var] + state_info_vars_list + old_dist_info_vars_list + recurrent_vars, outputs=mean_kl, ) # Dual-related symbolics # Symbolic dual if is_recurrent: dual = param_eta * self.epsilon + \ param_eta * TT.log( TT.sum( TT.exp( delta_v / param_eta - TT.max(delta_v / param_eta) ) * valid_var ) / TT.sum(valid_var) ) + param_eta * TT.max(delta_v / param_eta) else: dual = param_eta * self.epsilon + \ param_eta * TT.log( TT.mean( TT.exp( delta_v / param_eta - TT.max(delta_v / param_eta) ) ) ) + param_eta * TT.max(delta_v / param_eta) # Add L2 regularization. dual += self.L2_reg_dual * \ (TT.square(param_eta) + TT.square(1 / param_eta)) # Symbolic dual gradient dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v]) # Eval functions. 
f_dual = ext.compile_function( inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v], outputs=dual ) f_dual_grad = ext.compile_function( inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v], outputs=dual_grad ) self.opt_info = dict( f_loss_grad=f_loss_grad, f_loss=f_loss, f_dual=f_dual, f_dual_grad=f_dual_grad, f_kl=f_kl )
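# Hedged numpy sketch of the dual evaluated by f_dual above (non-recurrent branch):
# g(eta, v) = eta * epsilon + eta * log E[exp(delta_v / eta)] plus the L2 term, written
# with the same max-subtraction trick the symbolic expression uses for numerical
# stability. `reps_dual` is an illustrative helper, not part of the class.
import numpy as np

def reps_dual(eta, v, rewards, feat_diff, epsilon, l2_reg_dual=0.0):
    delta_v = rewards + feat_diff.dot(v)  # sample Bellman errors
    z = delta_v / eta
    dual = eta * epsilon + eta * (np.log(np.mean(np.exp(z - z.max()))) + z.max())
    dual += l2_reg_dual * (eta ** 2 + (1.0 / eta) ** 2)
    return dual

rewards = np.array([1.0, 0.5, -0.2])
feat_diff = np.random.randn(3, 4)
print(reps_dual(eta=15.0, v=np.zeros(4), rewards=rewards,
                feat_diff=feat_diff, epsilon=0.1))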
def init_opt(self): self.start_time = time.time() is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) if self.safety_constraint: safety_var = ext.new_tensor('safety_vals', ndim=1 + is_recurrent, dtype=theano.config.floatX) weights_var = ext.new_tensor('weights', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) self.dist_info_vars_func = ext.compile_function( inputs=[obs_var] + state_info_vars_list, outputs=dist_info_vars, log_name="dist_info_vars") ent = dist.entropy_sym(dist_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if is_recurrent: mean_ent = TT.sum( weights_var * ent * valid_var) / TT.sum(valid_var) max_kl = TT.max(kl * valid_var) mean_kl = TT.sum(weights_var * kl * valid_var) / TT.sum(valid_var) surr_loss = -TT.sum(lr * weights_var * advantage_var * valid_var) / TT.sum(valid_var) if self.safety_constraint: f_safety = TT.sum(lr * weights_var * safety_var * valid_var) / TT.sum(valid_var) else: mean_ent = TT.mean(weights_var * ent) max_kl = TT.max(kl) mean_kl = TT.mean(weights_var * kl) surr_loss = -TT.mean(lr * weights_var * advantage_var) if self.safety_constraint: f_safety = TT.mean(lr * weights_var * safety_var) if self.entropy_regularize: self.entropy_beta = theano.shared(self.entropy_coeff) surr_loss -= self.entropy_beta * mean_ent if self.safety_constraint: self.safety_gradient_rescale = theano.shared(1.) f_safety = self.safety_gradient_rescale * f_safety input_list = [ obs_var, action_var, advantage_var, weights_var, ] if self.safety_constraint: input_list.append(safety_var) input_list = input_list + state_info_vars_list + old_dist_info_vars_list if is_recurrent: input_list.append(valid_var) if not (self.safety_constrained_optimizer): self.optimizer.update_opt(loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl") else: self.optimizer.update_opt( loss=surr_loss, target=self.policy, quad_leq_constraint=(mean_kl, self.step_size), lin_leq_constraint=(f_safety, self.safety_step_size), inputs=input_list, constraint_name_1="mean_kl", constraint_name_2="safety", using_surrogate=False, precompute=True, attempt_feasible_recovery=self.attempt_feasible_recovery, attempt_infeasible_recovery=self.attempt_infeasible_recovery, revert_to_last_safe_point=self.revert_to_last_safe_point) f_kl = ext.compile_function( inputs=input_list, outputs=[mean_kl, max_kl], ) self.opt_info = dict(f_kl=f_kl, )
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None task_obs_var = [] task_old_dist_info_vars_list = [] task_kls = [] for i in range(self.task_num): task_obs_var.append( self.env.observation_space.new_tensor_variable( 'obs_task%d' % (i), extra_dims=1 + is_recurrent, )) temp_dist_info_var = self.policy.dist_info_sym( task_obs_var[-1], state_info_vars) temp_old_dist_info_vars = { k: ext.new_tensor('task%d_old_%s' % (i, k), ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } task_old_dist_info_vars_list += [ temp_old_dist_info_vars[k] for k in dist.dist_info_keys ] task_kls.append( dist.kl_sym(temp_old_dist_info_vars, temp_dist_info_var)) dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) kl_weight_var = ext.new_tensor('kl_weight', ndim=1, dtype=theano.config.floatX) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) surr_loss = -TT.sum( lr * advantage_var * valid_var) / TT.sum(valid_var) else: weighted_kls = [] '''for i, one_task_kl in enumerate(task_kls): weighted_kls.append(TT.mean(one_task_kl * kl_weight_var[i])) mean_kl = TT.mean(weighted_kls)''' for i, one_task_kl in enumerate(task_kls): weighted_kls.append((one_task_kl * kl_weight_var[i])) mean_kl = TT.mean(TT.concatenate(weighted_kls)) surr_loss = -TT.mean(lr * advantage_var) input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list + task_obs_var + task_old_dist_info_vars_list + [ kl_weight_var ] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt(loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl") self.f_constraints = [] self.f_constraints.append( ext.compile_function( inputs=input_list, outputs=TT.mean(kl), log_name="kl_div_task", )) for i in range(self.task_num): self.f_constraints.append( ext.compile_function( inputs=input_list, outputs=TT.mean(task_kls[i]), log_name="kl_div_task%d" % i, )) return dict()
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor( 'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX ) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor( 'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in dist.dist_info_keys } old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] state_info_vars = { k: ext.new_tensor( k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in self.policy.state_info_keys } state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None task_obs_var = [] task_action_var = [] task_advantage_var = [] task_old_dist_info_vars_list = [] task_old_dist_info_vars_list_per_task = [] lrs = [] for i in range(self.task_num): task_obs_var.append(self.env.observation_space.new_tensor_variable( 'obs_task%d'%(i), extra_dims=1 + is_recurrent, )) task_action_var.append(self.env.action_space.new_tensor_variable( 'action_task%d'%(i), extra_dims=1 + is_recurrent, )) task_advantage_var.append(ext.new_tensor( 'advantage_task%d'%(i), ndim=1 + is_recurrent, dtype=theano.config.floatX )) temp_dist_info_var = self.policy.dist_info_sym(task_obs_var[-1], state_info_vars) temp_old_dist_info_vars = { k: ext.new_tensor( 'task%d_old_%s' % (i,k), ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in dist.dist_info_keys } task_old_dist_info_vars_list += [temp_old_dist_info_vars[k] for k in dist.dist_info_keys] task_old_dist_info_vars_list_per_task.append([temp_old_dist_info_vars[k] for k in dist.dist_info_keys]) lrs.append(dist.likelihood_ratio_sym(task_action_var[i], temp_old_dist_info_vars, temp_dist_info_var)) dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) surr_loss = 0 task_sur_losses = [] for i, one_lr in enumerate(lrs): task_sur_losses.append(-TT.mean(one_lr * task_advantage_var[i])) surr_loss += task_sur_losses[-1] input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list + task_obs_var + task_action_var + task_advantage_var + task_old_dist_info_vars_list if is_recurrent: input_list.append(valid_var) mean_kl = TT.mean(kl) self.optimizer.update_opt( loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl" ) self.f_constraints=[] self.f_constraints.append(ext.compile_function( inputs=input_list, outputs=TT.mean(kl), log_name="kl_div_task", )) self.f_task_grads = [] for i in range(self.task_num): task_grads = theano.grad(task_sur_losses[i], wrt=self.policy.get_params(trainable=True), disconnected_inputs='warn') self.f_task_grads.append(ext.compile_function( inputs=[ task_obs_var[i], task_action_var[i], task_advantage_var[i], ] + task_old_dist_info_vars_list_per_task[i] + state_info_vars_list, outputs=task_grads, log_name="f_task_grads", )) return dict()
def init_opt(self): """ Same as normal NPO, except for setting MKL_NUM_THREADS. """ # Set BEFORE Theano compiling; make equal to number of cores per worker. os.environ['MKL_NUM_THREADS'] = str(self.mkl_num_threads) is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) surr_loss = -TT.sum( lr * advantage_var * valid_var) / TT.sum(valid_var) else: mean_kl = TT.mean(kl) surr_loss = -TT.mean(lr * advantage_var) if self.entropy_bonus > 0: surr_loss -= self.entropy_bonus * TT.mean( self.policy.distribution.entropy_sym(dist_info_vars)) input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt(loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl") return dict()
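# Minimal illustration of the MKL note above: the environment variable has to be set
# before the MKL-backed BLAS is initialized (i.e. before heavy Theano compilation),
# otherwise it is typically ignored. The value here is just an example.
import os
os.environ['MKL_NUM_THREADS'] = '4'  # e.g. cores available to this worker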
def init_opt(self): # obs_var_raw = self.env.observation_space.new_tensor_variable( # 'obs', # extra_dims=1, # ) obs_var_raw = ext.new_tensor( 'obs', ndim=3, dtype=theano.config.floatX) # todo: check the dtype action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) # this will have to be the advantage every self.period timesteps advantage_var_sparse = ext.new_tensor('sparse_advantage', ndim=1, dtype=theano.config.floatX) advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX) obs_var_sparse = ext.new_tensor( 'sparse_obs', ndim=2, dtype=theano.config. floatX # todo: check this with carlos, refer to discrete.py in rllab.spaces ) latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX) latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX) assert isinstance(self.policy, HierarchicalPolicy) # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid # undoing the reshape, so that batch sampling is ok obs_var = TT.reshape(obs_var_raw, [ obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2] ]) # obs_var = obs_var_raw ############################################################# ### calculating the manager portion of the surrogate loss ### ############################################################# # i, j should contain the probability of latent j at time step self.period*i # should be a len(obs)//self.period by len(self.latent) tensor latent_probs = self.policy.manager.dist_info_sym( obs_var_sparse)['prob'] actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1) if self.trainable_manager: manager_surr_loss = -TT.mean( TT.log(actual_latent_probs) * advantage_var_sparse) else: manager_surr_loss = 0 ############################################################ ### calculating the skills portion of the surrogate loss ### ############################################################ # get the distribution parameters # dist_info_vars = [] # for latent in self.latents: # self.policy.low_policy.set_latent_train(latent) # dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var)) # hopefully the above line takes multiple samples, and state_info_vars not needed as input dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents( obs_var) probs = TT.stack([ self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in dist_info_vars ], axis=1) # todo: verify that dist_info_vars is in order actual_action_log_probs = TT.sum(probs * latent_var, axis=1) skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var) surr_loss = manager_surr_loss / self.period + skill_surr_loss # so that the relative magnitudes are correct input_list = [ obs_var_raw, obs_var_sparse, action_var, advantage_var, advantage_var_sparse, latent_var, latent_var_sparse ] # input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var] # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list) return dict()
def new_tensor_variable(self, name, extra_dims):
    return ext.new_tensor(
        name=name,
        ndim=extra_dims,
        dtype=self.dtype,
    )
def init_opt(self): # obs_var_raw = self.env.observation_space.new_tensor_variable( # 'obs', # extra_dims=1, # ) obs_var_raw = ext.new_tensor( 'obs', ndim=3, dtype=theano.config.floatX) # todo: check the dtype action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) # this will have to be the advantage every self.period timesteps advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX) obs_var_sparse = ext.new_tensor( 'sparse_obs', ndim=2, dtype=theano.config. floatX # todo: check this with carlos, refer to discrete.py in rllab.spaces ) assert isinstance(self.policy, HierarchicalPolicy) # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid # undoing the reshape, so that batch sampling is ok obs_var = TT.reshape(obs_var_raw, [ obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2] ]) # obs_var = obs_var_raw # i, j should contain the probability of latent j at time step self.period*i # should be a len(obs)//self.period by len(self.latent) tensor latent_probs = self.policy.manager.dist_info_sym( obs_var_sparse)['prob'] # get the distribution parameters # dist_info_vars = [] # for latent in self.latents: # self.policy.low_policy.set_latent_train(latent) # dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var)) # hopefully the above line takes multiple samples, and state_info_vars not needed as input dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents( obs_var) probs = [ TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info)) for dist_info in dist_info_vars ] # need to reshape at the end reshaped_probs = [ TT.reshape(prob, [obs_var.shape[0] // self.period, self.period]) for prob in probs ] # now, multiply out each row and concatenate subtrajectory_probs = TT.stack([ TT.prod(reshaped_prob, axis=1) for reshaped_prob in reshaped_probs ], axis=1) # shape error might come out of here # elementwise multiplication, then sum up each individual row and take log likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1)) surr_loss = -TT.mean(likelihood * advantage_var) input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var] # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list) return dict()
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor( 'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX ) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor( 'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in dist.dist_info_keys } old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] state_info_vars = { k: ext.new_tensor( k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in self.policy.state_info_keys } state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) surr_loss = - TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var) else: mean_kl = TT.mean(kl) surr_loss = - TT.mean(lr * advantage_var) normal_loss = surr_loss # symmetry loss mirrored_obs_var = self.env.observation_space.new_tensor_variable( 'mirrored_obs', extra_dims=1 + is_recurrent, ) mean_act_collected = L.get_output(self.policy._l_mean, obs_var) mean_act_mirrored = L.get_output(self.policy._l_mean, mirrored_obs_var) sym_loss = self.sym_loss_weight * TT.mean(TT.square(TT.dot(mean_act_collected, self.act_per_mat.T)-mean_act_mirrored)) surr_loss += sym_loss action_loss = self.action_reg_weight * (TT.mean(TT.abs_(mean_act_collected)) + 5.0*TT.mean(TT.clip(TT.abs_(mean_act_collected)-1.0, 0.0, 100.0))) #surr_loss += action_loss input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list+ [mirrored_obs_var] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt( loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl" ) self._f_sym_loss = ext.compile_function( inputs=[obs_var, mirrored_obs_var], outputs=[sym_loss] ) self._f_act_loss = ext.compile_function( inputs = [obs_var], outputs=[action_loss] ) return dict()
def init_opt(self):
    assert not self.policy.recurrent
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    # print("env.observation_space", self.env.observation_space)
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    latent_var = self.policy.latent_space.new_tensor_variable(
        'latents',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX
    )
    dist = self.policy.distribution  # this can still be the dist P(a|s,__h__)
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k,  # define tensors old_mean and old_log_std
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        )
        for k in dist.dist_info_keys
    }
    # put the 2 tensors above in a list
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, latent_var)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    if is_recurrent:
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(kl)
        surr_loss = -TT.mean(lr * advantage_var)
    loss = surr_loss

    # these are sym vars; the inputs in optimize_policy have to be in the same order!
    input_list = [
        obs_var,
        action_var,
        advantage_var,
        latent_var,
    ] + old_dist_info_vars_list  # provide old mean and var, for the new states as they were sampled from it!
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(
        loss=loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl"
    )
    return dict()
def init_grad_approx_infos(self): # variables obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX) obs_var_sparse = ext.new_tensor('sparse_obs', ndim=2, dtype=theano.config.floatX) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) # this is 5k? # this will have to be the advantage every self.period timesteps advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX) advantage_var_sparse = ext.new_tensor( 'sparse_advantage', ndim=1, dtype=theano.config.floatX) # this is 5000 latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX) latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX) # this is 5000 obs_var = TT.reshape(obs_var_raw, [ obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2] ]) matrix = TT.eye(self.num_latents) latent_vectors = [matrix[i:i + 1, :] for i in range(self.num_latents)] # should be a len(obs)//self.period by len(self.latent) tensor latent_probs = self.policy.manager.dist_info_sym( obs_var_sparse)['prob'] dist_info_vars = [ self.policy.low_policy.dist_info_sym(obs_var, state_info_var=latent.repeat( obs_var.shape[0], axis=0)) for latent in latent_vectors ] logprobs = [ self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in dist_info_vars ] # need to reshape at the end reshaped_logprobs = [ TT.reshape(prob, [obs_var.shape[0] // self.period, self.period]) for prob in logprobs ] # now, multiply out each row and concatenate subtrajectory_logprobs = TT.stack([ TT.sum(reshaped_prob, axis=1) for reshaped_prob in reshaped_logprobs ], axis=1) # exact loss subtrajectory_probs = TT.exp(subtrajectory_logprobs) likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1)) surr_loss_exact = -TT.mean(likelihood * advantage_var_sparse) # approximate actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1) manager_surr_loss = -TT.mean( TT.log(actual_latent_probs) * advantage_var_sparse) dist_info_approx = self.policy.low_policy.dist_info_sym( obs_var, state_info_var=latent_var) actual_action_log_probs = self.diagonal.log_likelihood_sym( action_var, dist_info_approx) skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var) surr_loss_approx = manager_surr_loss / self.period + skill_surr_loss input_list = [ obs_var_raw, obs_var_sparse, action_var, advantage_var, advantage_var_sparse, latent_var, latent_var_sparse ] grad_exact = theano.grad(surr_loss_exact, self.policy.get_params(trainable=True), disconnected_inputs='ignore') grad_approx = theano.grad(surr_loss_approx, self.policy.get_params(trainable=True), disconnected_inputs='ignore') grad_exact = [grad.flatten() for grad in grad_exact] grad_approx = [grad.flatten() for grad in grad_approx] v1 = TT.concatenate(grad_exact, axis=0) + 1e-8 v2 = TT.concatenate(grad_approx, axis=0) + 1e-8 v1 = v1 / TT.sqrt(TT.sum(TT.sqr(v1))) v2 = v2 / TT.sqrt(TT.sum(TT.sqr(v2))) cosine_distance = TT.sum(v1 * v2) actual_subtrajectory_prob = TT.sum(subtrajectory_probs * latent_var_sparse, axis=1) proportion = TT.mean(actual_subtrajectory_prob / TT.sum(subtrajectory_probs, axis=1)) self.get_dist_infos = ext.compile_function( inputs=input_list, outputs=dist_info_vars[0]['mean']) self.get_logprobs = ext.compile_function(inputs=input_list, outputs=logprobs[0]) self.get_subprobs = ext.compile_function( inputs=input_list, outputs=[subtrajectory_probs, actual_subtrajectory_prob]) self.get_likelihood = ext.compile_function(inputs=input_list, outputs=[likelihood]) 
self.get_surr_loss_exact = ext.compile_function( inputs=input_list, outputs=[surr_loss_exact]) self.get_surr_loss_approx = ext.compile_function( inputs=input_list, outputs=[surr_loss_approx]) self.get_vs = ext.compile_function(inputs=input_list, outputs=[v1, v2]) self.get_gradient_infos = ext.compile_function( inputs=input_list, outputs=[cosine_distance, proportion]) return dict()
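# Hedged numpy sketch of the diagnostic compiled above: flatten the exact and the
# approximate gradients, normalize each, and take their inner product as a cosine
# similarity (1.0 when the two gradient directions agree). `cosine_similarity` is an
# illustrative helper, not part of the class.
import numpy as np

def cosine_similarity(grads_a, grads_b, eps=1e-8):
    v1 = np.concatenate([g.ravel() for g in grads_a]) + eps
    v2 = np.concatenate([g.ravel() for g in grads_b]) + eps
    v1 = v1 / np.sqrt(np.sum(v1 ** 2))
    v2 = v2 / np.sqrt(np.sum(v2 ** 2))
    return np.sum(v1 * v2)

grad_exact = [np.ones((2, 3)), np.array([1.0, -1.0])]
grad_approx = [np.ones((2, 3)), np.array([1.0, -2.0])]
print(cosine_similarity(grad_exact, grad_approx))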
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: surr_loss = -TT.sum( lr * advantage_var * valid_var) / TT.sum(valid_var) else: std_advar = (advantage_var - TT.mean(advantage_var)) / TT.std(advantage_var) surr_loss = -TT.mean( TT.min([ lr * std_advar, TT.clip(lr, 1 - self.clip_param, 1 + self.clip_param) * std_advar ])) # symmetry loss mirrored_obs_var = self.env.observation_space.new_tensor_variable( 'mirrored_obs', extra_dims=1 + is_recurrent, ) mean_act_collected = L.get_output(self.policy._l_mean, obs_var) mean_act_mirrored = L.get_output(self.policy._l_mean, mirrored_obs_var) sym_loss = self.sym_loss_weight * TT.mean( TT.square( TT.dot(mean_act_collected, self.act_per_mat.T) - mean_act_mirrored)) surr_loss += sym_loss input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list + [ mirrored_obs_var ] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt( loss=surr_loss, target=self.policy, inputs=input_list, ) self._f_sym_loss = ext.compile_function( inputs=[obs_var, mirrored_obs_var], outputs=[sym_loss]) grad = theano.grad(surr_loss, wrt=self.policy.get_params(trainable=True), disconnected_inputs='warn') self._f_grad = ext.compile_function( inputs=input_list, outputs=grad, ) self._f_loss = ext.compile_function(input_list + list(), surr_loss) self.m_prev = [] self.v_prev = [] for i in range(len(self.policy.get_params(trainable=True))): self.m_prev.append( np.zeros(self.policy.get_params( trainable=True)[i].get_value().shape, dtype=self.policy.get_params( trainable=True)[i].get_value().dtype)) self.v_prev.append( np.zeros(self.policy.get_params( trainable=True)[i].get_value().shape, dtype=self.policy.get_params( trainable=True)[i].get_value().dtype)) self.t_prev = 0 self.optimizer.update_opt(surr_loss, self.policy, input_list) return dict()
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) surr_loss = -TT.sum( lr * advantage_var * valid_var) / TT.sum(valid_var) else: mean_kl = TT.mean(kl) surr_loss = -TT.mean(lr * advantage_var) input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list # guiding net if len(self.guiding_policies) != 0: guiding_obs_var = self.policy._aux_pred_network.input_layer.input_var guiding_action_var = self.env.action_space.new_tensor_variable( 'guiding_action', extra_dims=1 + is_recurrent, ) prediction = self.policy._aux_pred_network._output surr_loss += self.guiding_policy_weight * TT.mean( TT.square(guiding_action_var - prediction)) input_list += [guiding_obs_var, guiding_action_var] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt(loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl") return dict()
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] ## different policies should have different loss logli_list = [] dist_info_vars_list = [] kl_list = [] for id in range(self.num_of_agents): dist_info_vars = self.policy_list[id].dist_info_sym( obs_var, state_info_vars) logli = dist.log_likelihood_sym(action_var, dist_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) logli_list.append(logli) dist_info_vars_list.append(dist_info_vars) kl_list.append(kl) # formulate as a minimization problem # The gradient of the surrogate objective is the policy gradient mean_kl_list = [] max_kl_list = [] surr_obj_list = [] if is_recurrent: for id in range(self.num_of_agents): surr_obj_raw = -TT.mean(logli_list[id] * advantage_var) policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum( [ TT.sum(TT.square(param)) for param in self.policy_list[id].get_params( regularizable=True) ]) surr_obj = surr_obj_raw + policy_weight_decay_term mean_kl = TT.sum(kl_list[id] * valid_var) / TT.sum(valid_var) max_kl = TT.max(kl_list[id] * valid_var) mean_kl_list.append(mean_kl) max_kl_list.append(max_kl) surr_obj_list.append(surr_obj) else: for id in range(self.num_of_agents): surr_obj_raw = -TT.mean(logli_list[id] * advantage_var) policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum( [ TT.sum(TT.square(param)) for param in self.policy_list[id].get_params( regularizable=True) ]) surr_obj = surr_obj_raw + policy_weight_decay_term mean_kl = TT.mean(kl_list[id]) max_kl = TT.max(kl_list[id]) mean_kl_list.append(mean_kl) max_kl_list.append(max_kl) surr_obj_list.append(surr_obj) input_list = [obs_var, action_var, advantage_var ] + state_info_vars_list if is_recurrent: input_list.append(valid_var) for id in range(self.num_of_agents): self.optimizer_list[id].update_opt(surr_obj_list[id], target=self.policy_list[id], inputs=input_list) f_kl_list = [] for id in range(self.num_of_agents): f_kl = ext.compile_function( inputs=input_list + old_dist_info_vars_list, outputs=[mean_kl_list[id], max_kl_list[id]], ) f_kl_list.append(f_kl) self.opt_info = dict(f_kl_list=f_kl_list, ) self.stein_m = None self.stein_v = None self.stein_epsilon = 1e-8 self.stein_beta1 = 0.9 self.stein_beta2 = 0.999 self.stein_t = 0
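# The stein_m / stein_v / stein_t state initialized above matches the moment estimates
# of an Adam-style update. A minimal sketch of such an update on a flat parameter
# vector, under that assumption; `adam_step` is illustrative, not a method of the class.
import numpy as np

def adam_step(theta, grad, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    t += 1
    m = beta1 * m + (1 - beta1) * grad        # first-moment estimate
    v = beta2 * v + (1 - beta2) * grad ** 2   # second-moment estimate
    m_hat = m / (1 - beta1 ** t)              # bias correction
    v_hat = v / (1 - beta2 ** t)
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)
    return theta, m, v, t

theta, m, v, t = np.zeros(3), np.zeros(3), np.zeros(3), 0
theta, m, v, t = adam_step(theta, np.array([0.1, -0.2, 0.3]), m, v, t)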
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor('advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in dist.dist_info_keys } old_dist_info_vars_list = [ old_dist_info_vars[k] for k in dist.dist_info_keys ] state_info_vars = { k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX) for k in self.policy.state_info_keys } state_info_vars_list = [ state_info_vars[k] for k in self.policy.state_info_keys ] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None entropy_input_var = TT.matrix('entropy_inputs') dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) if self.truncate_local_is_ratio is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr) if is_recurrent: mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) surr_loss = -TT.sum( lr * advantage_var * valid_var) / TT.sum(valid_var) else: mean_kl = TT.mean(kl) surr_loss = -TT.mean(lr * advantage_var) # entropy of blended weights surr_loss += 1.0 * self.policy.bw_entropy( entropy_input_var) - 1.0 * self.policy.bw_choice_entropy( entropy_input_var) input_list = [ obs_var, action_var, advantage_var, ] + state_info_vars_list + old_dist_info_vars_list + [ entropy_input_var ] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt(loss=surr_loss, target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, constraint_name="mean_kl") return dict()
def init_opt(self): assert isinstance(self.policy, HierarchicalPolicy) assert not self.freeze_manager and not self.freeze_skills manager_surr_loss = 0 # skill_surr_loss = 0 obs_var_sparse = ext.new_tensor('sparse_obs', ndim=2, dtype=theano.config.floatX) obs_var_raw = ext.new_tensor( 'obs', ndim=3, dtype=theano.config.floatX) # todo: check the dtype action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX) # latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX) mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX) log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX) # undoing the reshape, so that batch sampling is ok obs_var = TT.reshape(obs_var_raw, [ obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2] ]) ############################################################ ### calculating the skills portion of the surrogate loss ### ############################################################ latent_var_sparse = self.policy.manager.dist_info_sym( obs_var_sparse)['mean'] latent_var = TT.extra_ops.repeat(latent_var_sparse, self.period, axis=0) #.dimshuffle(0, 'x') dist_info_var = self.policy.low_policy.dist_info_sym( obs_var, state_info_var=latent_var) old_dist_info_var = dict(mean=mean_var, log_std=log_std_var) skill_lr = self.diagonal.likelihood_ratio_sym(action_var, old_dist_info_var, dist_info_var) skill_surr_loss_vector = TT.minimum( skill_lr * advantage_var, TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var) skill_surr_loss = -TT.mean(skill_surr_loss_vector) surr_loss = skill_surr_loss # so that the relative magnitudes are correct if self.freeze_skills and not self.freeze_manager: raise NotImplementedError elif self.freeze_manager and not self.freeze_skills: raise NotImplementedError else: assert (not self.freeze_manager) or (not self.freeze_skills) input_list = [ obs_var_raw, obs_var_sparse, action_var, advantage_var, mean_var, log_std_var ] self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list) return dict()
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    # Init dual param values
    self.param_eta = 15.
    # Adjust for linear feature vector.
    self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4)

    # Theano vars
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    rewards = ext.new_tensor(
        'rewards',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX,
    )
    # Feature difference variable representing the difference in feature
    # value of the next observation and the current observation:
    # \phi(s') - \phi(s).
    feat_diff = ext.new_tensor(
        'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)
    param_v = TT.vector('param_v')
    param_eta = TT.scalar('eta')
    valid_var = TT.matrix('valid')

    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    # Policy-related symbolics
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    dist = self.policy.distribution
    # log of the policy dist
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)

    # Symbolic sample Bellman error
    delta_v = rewards + TT.dot(feat_diff, param_v)

    # Policy loss (negative because we minimize)
    if is_recurrent:
        loss = -TT.sum(
            logli * TT.exp(delta_v / param_eta - TT.max(delta_v / param_eta)) *
            valid_var) / TT.sum(valid_var)
    else:
        loss = -TT.mean(
            logli * TT.exp(delta_v / param_eta - TT.max(delta_v / param_eta)))

    # Add regularization to loss.
    reg_params = self.policy.get_params(regularizable=True)
    loss += self.L2_reg_loss * TT.sum(
        [TT.mean(TT.square(param)) for param in reg_params]) / len(reg_params)

    # Policy loss gradient.
    loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

    if is_recurrent:
        recurrent_vars = [valid_var]
    else:
        recurrent_vars = []
    input = [rewards, obs_var, feat_diff, action_var
             ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
    f_loss = ext.compile_function(
        inputs=input,
        outputs=loss,
    )
    f_loss_grad = ext.compile_function(
        inputs=input,
        outputs=loss_grad,
    )

    # Debug prints
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    if is_recurrent:
        mean_kl = TT.sum(
            dist.kl_sym(old_dist_info_vars, dist_info_vars) *
            valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))
    f_kl = ext.compile_function(
        inputs=[obs_var, action_var] + state_info_vars_list +
        old_dist_info_vars_list + recurrent_vars,
        outputs=mean_kl,
    )

    # Dual-related symbolics
    # Symbolic dual
    if is_recurrent:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.sum(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    ) * valid_var
                ) / TT.sum(valid_var)
            ) + param_eta * TT.max(delta_v / param_eta)
    else:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.mean(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    )
                )
            ) + param_eta * TT.max(delta_v / param_eta)
    # Add L2 regularization.
    dual += self.L2_reg_dual * \
        (TT.square(param_eta) + TT.square(1 / param_eta))

    # Symbolic dual gradient
    dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

    # Eval functions.
    f_dual = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars +
        [param_eta, param_v],
        outputs=dual)
    f_dual_grad = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars +
        [param_eta, param_v],
        outputs=dual_grad)

    self.opt_info = dict(
        f_loss_grad=f_loss_grad,
        f_loss=f_loss,
        f_dual=f_dual,
        f_dual_grad=f_dual_grad,
        f_kl=f_kl)
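The compiled f_dual and f_dual_grad are meant to be handed to an off-the-shelf optimizer that minimizes the REPS dual over (eta, v). The sketch below shows one plausible wiring with scipy; it assumes a non-recurrent policy with no extra state_info inputs (so the compiled functions take exactly rewards, feat_diff, eta, v), and all names are stand-ins rather than this codebase's own optimize step.

import numpy as np
from scipy.optimize import minimize

def make_dual_objective(f_dual, f_dual_grad, rewards, feat_diff):
    """Wrap the compiled dual and its gradient into a scipy-compatible objective."""
    def dual_and_grad(x):
        # x packs [eta, v]; eta must stay strictly positive for the dual.
        eta, v = max(x[0], 1e-6), x[1:]
        dual_val = f_dual(rewards, feat_diff, eta, v)
        grad_eta, grad_v = f_dual_grad(rewards, feat_diff, eta, v)
        return dual_val, np.concatenate([[grad_eta], grad_v])
    return dual_and_grad

# Hypothetical usage inside the training step:
# x0 = np.concatenate([[param_eta], param_v])
# result = minimize(make_dual_objective(f_dual, f_dual_grad, rewards, feat_diff),
#                   x0, jac=True, method='L-BFGS-B')
# param_eta, param_v = result.x[0], result.x[1:]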
def new_tensor_variable(self, name, extra_dims):
    return ext.new_tensor(
        name=name, ndim=extra_dims + 1, dtype=theano.config.floatX)
def init_opt(self, policy_name):
    is_recurrent = int(self.policies[policy_name].recurrent)
    # 'extra_dims' is the number of leading batch dimensions of the tensor;
    # recurrent policies need one extra dimension to store sequences.
    # We have two options:
    # - either re-use the observation vars from the policy, or
    # - create the observation vars again (this currently errors out:
    #   it would probably require duplicating the variables).
    reuse_obs_vars = True
    if reuse_obs_vars:
        obs_vars = self.policies[policy_name].input_vars
    else:
        obs_vars = []
        for idx, obs_shape in enumerate(
                self.policies[policy_name].obs_shapes):
            # name = 'obs_%d' % (idx)
            name = 'obs'
            obs_var_cur = self.env.observation_space.new_tensor_variable(
                name,
                extra_dims=1 + is_recurrent,
            )
            obs_vars.append(obs_var_cur)
    print('NPO: Observation vars are created for policy %s' % policy_name,
          obs_vars)
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    dist = self.policies[policy_name].distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None
    # dist_info_sym builds the symbolic outputs of the policy graph from its
    # input variables (typically observations, sometimes actions as well).
    dist_info_vars = self.policies[policy_name].dist_info_sym(
        obs_vars, action_var)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if is_recurrent:
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(kl)
        surr_loss = -TT.mean(lr * advantage_var)
    # Form the input list for the optimizer.
    input_list = obs_vars + [action_var, advantage_var
                             ] + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    # print('NPO: Policy Input list: ', [var for var in input_list])
    # theano.printing.pydotprint(surr_loss, outfile="loss.png",
    #                            var_with_name_simple=True)
    self.optimizers[policy_name].update_opt(
        loss=surr_loss,
        target=self.policies[policy_name],
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl")
    return dict()
def new_tensor_variable(self, name, extra_dims):
    return ext.new_tensor(
        name=name, ndim=extra_dims + 1, dtype=theano.config.floatX)
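This helper maps extra_dims (the number of leading batch axes) to the variable's total ndim: extra_dims batch axes plus one trailing axis for the flattened space contents. A hedged usage sketch, with 'space' standing in for any space object that exposes this method:

# non-recurrent case: data shaped (batch, flat_dim) -> a matrix variable (ndim == 2)
obs_var = space.new_tensor_variable('obs', extra_dims=1)

# recurrent case: data shaped (batch, time, flat_dim) -> a 3-d tensor variable (ndim == 3)
obs_seq_var = space.new_tensor_variable('obs', extra_dims=2)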
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]
    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.truncate_local_is_ratio is not None:
        lr = TT.minimum(self.truncate_local_is_ratio, lr)
    if is_recurrent:
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(
            lr * advantage_var * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(kl)
        surr_loss = -TT.mean(lr * advantage_var)
    # Auxiliary prediction network: fold a weighted squared-error term into
    # the surrogate loss so the auxiliary head is trained jointly.
    aux_input_var = self.policy._aux_pred_network.input_layer.input_var
    aux_target_var = TT.matrix('aux_targets')
    prediction = self.policy._aux_pred_network._output
    surr_loss += 0.01 * TT.mean(TT.square(aux_target_var - prediction))
    input_list = [
        obs_var,
        action_var,
        advantage_var,
    ] + state_info_vars_list + old_dist_info_vars_list + [
        aux_input_var, aux_target_var
    ]
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl")
    return dict()
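The auxiliary-prediction term above simply adds a weighted mean-squared error to the policy surrogate, so one optimizer step trains both objectives. A toy NumPy sketch of the combined loss; the coefficient matches the 0.01 above, all other values are illustrative:

import numpy as np

surr_loss = 0.42                                   # policy surrogate for a batch
aux_pred = np.array([[0.1, 0.9], [0.4, 0.6]])      # auxiliary network outputs
aux_target = np.array([[0.0, 1.0], [0.5, 0.5]])    # auxiliary regression targets

aux_coeff = 0.01
total_loss = surr_loss + aux_coeff * np.mean((aux_target - aux_pred) ** 2)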
def init_opt(self):
    self.start_time = time.time()
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    weights_var = ext.new_tensor(
        'weights', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]
    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    self.dist_info_vars_func = ext.compile_function(
        inputs=[obs_var] + state_info_vars_list,
        outputs=dist_info_vars,
        log_name="dist_info_vars")
    # When we want D_KL(pi' || pi) for data that was sampled from some
    # behavior policy pi_b, where pi' is the optimization variable and pi is
    # the policy of the previous iteration, the dist_info in memory
    # corresponds to pi_b and not pi, so we have to compute the dist_info for
    # that data under pi on the fly.
    ent = dist.entropy_sym(dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if is_recurrent:
        mean_ent = TT.sum(weights_var * ent * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
        mean_kl = TT.sum(weights_var * kl * valid_var) / TT.sum(valid_var)
        surr_loss = -TT.sum(lr * weights_var * advantage_var *
                            valid_var) / TT.sum(valid_var)
    else:
        mean_ent = TT.mean(weights_var * ent)
        max_kl = TT.max(kl)
        mean_kl = TT.mean(weights_var * kl)
        surr_loss = -TT.mean(lr * weights_var * advantage_var)
    if self.entropy_regularize:
        self.entropy_beta = theano.shared(self.entropy_coeff)
        surr_loss -= self.entropy_beta * mean_ent
    input_list = [
        obs_var,
        action_var,
        advantage_var,
        weights_var,
    ] + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl")
    f_kl = ext.compile_function(
        inputs=input_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl)
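Relative to the plain surrogate, this variant multiplies every per-sample term by weights_var (e.g. importance weights for data collected under a behavior policy) and subtracts an entropy bonus when entropy_regularize is on. A NumPy sketch with illustrative values:

import numpy as np

lr = np.array([1.1, 0.9, 1.0])        # likelihood ratios
adv = np.array([0.2, -0.1, 0.7])      # advantages
weights = np.array([1.0, 0.5, 2.0])   # per-sample weights (weights_var)
ent = np.array([1.3, 1.1, 1.2])       # per-sample policy entropy
entropy_beta = 0.01                   # entropy coefficient

surr_loss = -np.mean(lr * weights * adv)
surr_loss -= entropy_beta * np.mean(weights * ent)   # entropy regularization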
def init_opt(self):
    assert isinstance(self.policy, HierarchicalPolicy)
    manager_surr_loss = 0
    skill_surr_loss = 0

    if not self.freeze_manager:
        obs_var_sparse = ext.new_tensor(
            'sparse_obs', ndim=2, dtype=theano.config.floatX)
        latent_var_sparse = ext.new_tensor(
            'sparse_latent', ndim=2, dtype=theano.config.floatX)
        # advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor(
            'sparse_advantage', ndim=1, dtype=theano.config.floatX)
        manager_prob_var = ext.new_tensor(
            'manager_prob_var', ndim=2, dtype=theano.config.floatX)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']
        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(
            manager_prob_var * latent_var_sparse, axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * advantage_var_sparse,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            advantage_var_sparse)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

    if not self.freeze_skills:
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor(
            'advantage', ndim=1, dtype=theano.config.floatX)
        latent_var = ext.new_tensor(
            'latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor(
            'log_std', ndim=2, dtype=theano.config.floatX)

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################
        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(
            action_var, old_dist_info_var, dist_info_var)
        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

    # divide by the period so that the relative magnitudes are correct
    surr_loss = manager_surr_loss / self.period + skill_surr_loss

    if self.freeze_skills and not self.freeze_manager:
        input_list = [
            obs_var_sparse, advantage_var_sparse, latent_var_sparse,
            manager_prob_var
        ]
    elif self.freeze_manager and not self.freeze_skills:
        input_list = [
            obs_var_raw, action_var, advantage_var, latent_var, mean_var,
            log_std_var
        ]
    else:
        assert (not self.freeze_manager) or (not self.freeze_skills)
        input_list = [
            obs_var_raw, obs_var_sparse, action_var, advantage_var,
            advantage_var_sparse, latent_var, latent_var_sparse, mean_var,
            log_std_var, manager_prob_var
        ]

    self.optimizer.update_opt(
        loss=surr_loss, target=self.policy, inputs=input_list)
    return dict()
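For the manager (a categorical distribution over latents), the likelihood ratio is formed from the probability assigned to the selected latent under the new and old distributions, and the clipped manager term is scaled by 1/period so it is comparable to the per-timestep skill term. A small NumPy sketch with made-up probabilities:

import numpy as np

latent_onehot = np.array([[0, 1, 0], [1, 0, 0]])          # chosen latents
new_probs = np.array([[0.2, 0.5, 0.3], [0.6, 0.2, 0.2]])  # manager probs now
old_probs = np.array([[0.3, 0.4, 0.3], [0.5, 0.3, 0.2]])  # probs at sampling time
adv_sparse = np.array([0.8, -0.2])                        # one advantage per period
epsilon, period = 0.1, 5

lr = np.exp(np.log(np.sum(new_probs * latent_onehot, axis=1)) -
            np.log(np.sum(old_probs * latent_onehot, axis=1)))
clipped = np.clip(lr, 1 - epsilon, 1 + epsilon) * adv_sparse
manager_surr_loss = -np.mean(np.minimum(lr * adv_sparse, clipped))
total_loss = manager_surr_loss / period   # + skill_surr_loss in the full objective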