def new_tensor_variable(space, name, extra_dims):
    """Create a Theano tensor variable matching a gym space.

    :param space: a ``gym.spaces`` instance (``Box``, ``Discrete`` or
        ``Tuple``) describing the values the variable will hold
    :param name: name of the variable
    :param extra_dims: number of extra (e.g. batch/time) dimensions to
        prepend to the single data dimension
    :return: the created tensor variable
    :raises NotImplementedError: for unsupported space types
    """
    if isinstance(space, gym.spaces.Box):
        # Continuous values: use the configured Theano float type.
        return ext.new_tensor(
            name=name, ndim=extra_dims + 1, dtype=theano.config.floatX)
    elif isinstance(space, gym.spaces.Discrete):
        # Pick the smallest unsigned integer dtype able to hold all ids.
        if space.n <= 2 ** 8:
            dtype = 'uint8'
        elif space.n <= 2 ** 16:
            dtype = 'uint16'
        else:
            dtype = 'uint32'
        return ext.new_tensor(name=name, ndim=extra_dims + 1, dtype=dtype)
    elif isinstance(space, gym.spaces.Tuple):
        # Recurse into the components and promote to a common dtype so a
        # single tensor can represent the concatenated tuple.
        dtypes = [
            new_tensor_variable(c, "tmp", extra_dims=0).dtype
            for c in space.spaces
        ]
        # Some backends return wrapper dtype objects; unwrap to numpy dtypes.
        if dtypes and hasattr(dtypes[0], "as_numpy_dtype"):
            dtypes = [d.as_numpy_dtype for d in dtypes]
        # np.core.numerictypes.find_common_type is deprecated (NumPy 1.25)
        # and removed with np.core in NumPy 2.0; np.result_type performs
        # the same promotion when given dtype arguments.
        common_dtype = np.result_type(*dtypes)
        return ext.new_tensor(
            name=name,
            ndim=extra_dims + 1,
            dtype=common_dtype,
        )
    else:
        raise NotImplementedError
def new_tensor_variable(self, name, extra_dims):
    """
    Create a tensor variable in Theano.
    :param name: name of the variable
    :param extra_dims: extra dimensions to be prepended
    :return: the created tensor variable
    """
    # Walk the size thresholds and take the narrowest unsigned dtype
    # that can represent every discrete value.
    for limit, dtype in ((2 ** 8, 'uint8'), (2 ** 16, 'uint16')):
        if self.n <= limit:
            return ext.new_tensor(
                name=name, ndim=extra_dims + 1, dtype=dtype)
    # More than 2**16 distinct values: fall back to 32-bit.
    return ext.new_tensor(name=name, ndim=extra_dims + 1, dtype='uint32')
def new_tensor_variable(self, name, extra_dims):
    """
    Create a tensor variable in Theano.
    :param name: name of the variable
    :param extra_dims: extra dimensions to be prepended
    :return: the created tensor variable
    """
    # One data dimension plus any requested batch/time dimensions,
    # using the globally configured Theano float type.
    ndim = extra_dims + 1
    return ext.new_tensor(name=name, ndim=ndim, dtype=theano.config.floatX)
def init_opt(self):
    """Build the symbolic loss/dual objectives and compile evaluators.

    Initializes the dual parameters (``self.param_eta``,
    ``self.param_v``), constructs the policy-loss and dual-objective
    Theano graphs (including their gradients and a KL diagnostic), and
    stores the compiled functions in ``self.opt_info``.
    """
    # 1 when the policy is recurrent: all tensors then carry an extra
    # time dimension and a `valid` mask is threaded through the graphs.
    is_recurrent = int(self.policy.recurrent)

    # Init dual param values
    self.param_eta = 15.
    # Adjust for linear feature vector.
    # NOTE(review): the "* 2 + 4" sizing presumably matches the feature
    # map used to build `feat_diff` elsewhere — confirm against sampler.
    self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4)

    # Theano vars
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    rewards = ext.new_tensor(
        'rewards',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX,
    )

    # Feature difference variable representing the difference in feature
    # value of the next observation and the current observation \phi(s') -
    # \phi(s).
    feat_diff = ext.new_tensor(
        'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)

    # Symbolic dual variables (distinct from the numeric self.param_*).
    param_v = TT.vector('param_v')
    param_eta = TT.scalar('eta')
    valid_var = TT.matrix('valid')

    # Extra policy state inputs (e.g. recurrent hidden state), keyed and
    # ordered by the policy's own state_info_keys.
    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    # Policy-related symbolics
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    dist = self.policy.distribution
    # log of the policy dist
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)

    # Symbolic sample Bellman error
    delta_v = rewards + TT.dot(feat_diff, param_v)

    # Policy loss (negative because we minimize)
    # The max(delta_v / eta) subtraction inside exp() is the standard
    # log-sum-exp shift for numerical stability.
    if is_recurrent:
        loss = -TT.sum(
            logli * TT.exp(delta_v / param_eta -
                           TT.max(delta_v / param_eta)) * valid_var
        ) / TT.sum(valid_var)
    else:
        loss = -TT.mean(
            logli * TT.exp(delta_v / param_eta -
                           TT.max(delta_v / param_eta)))

    # Add regularization to loss.
    # NOTE(review): raises ZeroDivisionError if the policy has no
    # regularizable params — confirm callers guarantee at least one.
    reg_params = self.policy.get_params(regularizable=True)
    loss += self.L2_reg_loss * TT.sum(
        [TT.mean(TT.square(param)) for param in reg_params]) / len(reg_params)

    # Policy loss gradient.
    loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

    if is_recurrent:
        recurrent_vars = [valid_var]
    else:
        recurrent_vars = []

    # NOTE(review): `input` shadows the builtin of the same name; local
    # to this method, so harmless, but a rename would be cleaner.
    input = [
        rewards, obs_var, feat_diff, action_var
    ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
    # if is_recurrent:
    #     input +=
    f_loss = ext.compile_function(
        inputs=input,
        outputs=loss,
    )
    f_loss_grad = ext.compile_function(
        inputs=input,
        outputs=loss_grad,
    )

    # Debug prints
    # Old-distribution placeholders used only for the KL diagnostic.
    old_dist_info_vars = {
        k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    if is_recurrent:
        mean_kl = TT.sum(
            dist.kl_sym(old_dist_info_vars, dist_info_vars) *
            valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))
    f_kl = ext.compile_function(
        inputs=[obs_var, action_var] + state_info_vars_list +
        old_dist_info_vars_list + recurrent_vars,
        outputs=mean_kl,
    )

    # Dual-related symbolics
    # Symbolic dual
    # Same stabilizing shift as the loss: log(E[exp(x - max)]) + max.
    if is_recurrent:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.sum(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    ) * valid_var
                ) / TT.sum(valid_var)
            ) + param_eta * TT.max(delta_v / param_eta)
    else:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.mean(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    )
                )
            ) + param_eta * TT.max(delta_v / param_eta)
    # Add L2 regularization.
    # Penalizing both eta^2 and (1/eta)^2 keeps eta away from 0 and inf.
    dual += self.L2_reg_dual * \
        (TT.square(param_eta) + TT.square(1 / param_eta))

    # Symbolic dual gradient
    dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

    # Eval functions.
    f_dual = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list +
        recurrent_vars + [param_eta, param_v],
        outputs=dual)
    f_dual_grad = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list +
        recurrent_vars + [param_eta, param_v],
        outputs=dual_grad)

    self.opt_info = dict(
        f_loss_grad=f_loss_grad,
        f_loss=f_loss,
        f_dual=f_dual,
        f_dual_grad=f_dual_grad,
        f_kl=f_kl)
def init_opt(self):
    """Build the surrogate policy-gradient objective and KL diagnostics.

    Constructs the symbolic surrogate loss (whose gradient is the
    policy gradient), registers it with ``self.optimizer``, and stores
    a compiled mean/max-KL diagnostic function in ``self.opt_info``.
    """
    # 1 when the policy is recurrent: tensors gain a time dimension and
    # a `valid` mask weights the per-timestep terms.
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    dist = self.policy.distribution
    # Placeholders for the pre-update distribution, used only in the KL
    # diagnostic below.
    old_dist_info_vars = {
        k: ext.new_tensor('old_%s' % k, ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]
    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None
    # Extra policy state inputs, keyed and ordered by state_info_keys.
    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        # Mask out invalid (padded) timesteps before averaging.
        surr_obj = -TT.sum(
            logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = -TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var
                  ] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    # The optimizer owns compilation of the surrogate loss/gradient.
    self.optimizer.update_opt(surr_obj, target=self.policy,
                              inputs=input_list)

    # Diagnostic only: mean and max KL against the old distribution.
    f_kl = ext.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl, )