def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.003,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             epsilon=0.1,
             train_pi_iters=10,
             use_skill_dependent_baseline=False,
             mlp_skill_dependent_baseline=False,
             freeze_manager=False,
             freeze_skills=False,
             **kwargs):
    if optimizer is None:
        if optimizer_args is None:
            # optimizer_args = dict()
            optimizer_args = dict(batch_size=None)
        self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                             **optimizer_args)
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period
    self.freeze_manager = freeze_manager
    self.freeze_skills = freeze_skills
    assert (not freeze_manager) or (not freeze_skills)

    # todo: fix this sampler stuff
    # import pdb; pdb.set_trace()
    self.sampler = HierBatchSampler(self, self.period)
    # self.sampler = BatchSampler(self)
    # i hope this is right
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
        assert self.policy.period == self.period
    # self.old_policy = copy.deepcopy(self.policy)

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
        skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
        if self.mlp_skill_dependent_baseline:
            self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
        else:
            self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
def __init__(self,
             env,
             policy,
             baseline,
             difference_params=False,
             quantize=False,
             quantization_tuning=4,
             optimizer=None,
             optimizer_args=None,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(**optimizer_args)
    self.optimizer = optimizer
    self.quantize = quantize
    self.quantization_tuning = quantization_tuning
    self.opt_info = None
    self.policy_params_last_update = 0
    self.difference_params = difference_params
    super(Agent, self).__init__(env=env, policy=policy, baseline=baseline, quantize=quantize,
                                quantization_tuning=quantization_tuning, **kwargs)
def __init__(self,
             wrapped_constraint,
             env_spec,
             yield_zeros_until=1,
             optimizer=None,
             hidden_sizes=(32,),
             hidden_nonlinearity=NL.sigmoid,
             lag_time=10,
             coeff=1.,
             filter_bonuses=False,
             max_epochs=25,
             *args,
             **kwargs):
    Serializable.quick_init(self, locals())
    self._wrapped_constraint = wrapped_constraint
    self._env_spec = env_spec
    self._filter_bonuses = filter_bonuses
    self._yield_zeros_until = yield_zeros_until
    self._hidden_sizes = hidden_sizes
    self._lag_time = lag_time
    self._coeff = coeff
    self._max_epochs = max_epochs
    self.use_bonus = True

    if optimizer is None:
        # optimizer = LbfgsOptimizer()
        optimizer = FirstOrderOptimizer(max_epochs=max_epochs, batch_size=None)
    self._optimizer = optimizer

    obs_dim = env_spec.observation_space.flat_dim
    predictor_network = MLP(1, hidden_sizes, hidden_nonlinearity, NL.sigmoid, input_shape=(obs_dim,))
    LasagnePowered.__init__(self, [predictor_network.output_layer])

    x_var = predictor_network.input_layer.input_var
    y_var = TT.matrix("ys")
    out_var = L.get_output(predictor_network.output_layer, {predictor_network.input_layer: x_var})
    regression_loss = TT.mean(TT.square(y_var - out_var))
    optimizer_args = dict(loss=regression_loss, target=self, inputs=[x_var, y_var])
    self._optimizer.update_opt(**optimizer_args)
    self._f_predict = compile_function([x_var], out_var)
    self._fit_steps = 0

    self.has_baseline = self._wrapped_constraint.has_baseline
    if self.has_baseline:
        self.baseline = self._wrapped_constraint.baseline
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=1e-2,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             use_skill_dependent_baseline=False,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(learning_rate=step_size, **optimizer_args)
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio

    super(PG_concurrent_approx, self).__init__(**kwargs)  # not sure if this line is correct
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period

    # todo: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)
    # i hope this is right
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
        assert self.policy.period == self.period
    self.trainable_manager = self.policy.trainable_manager

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        skill_dependent_obs_space_dim = ((curr_env.observation_space.shape[0] + 1) * self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
        self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.0003,
             latents=None,  # some sort of iterable of the actual latent vectors
             average_period=10,  # average over all the periods
             truncate_local_is_ratio=None,
             epsilon=0.1,
             train_pi_iters=80,
             use_skill_dependent_baseline=False,
             **kwargs):
    if optimizer is None:
        if optimizer_args is None:
            # optimizer_args = dict()
            optimizer_args = dict(batch_size=None)
        self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                             **optimizer_args)
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Hippo, self).__init__(**kwargs)  # not sure if this line is correct
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.average_period = average_period

    # import pdb; pdb.set_trace()
    self.sampler = BatchSampler(self)
    # i hope this is right
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    assert isinstance(self.policy, HierarchicalPolicy)
    self.old_policy = copy.deepcopy(self.policy)
def __init__(self,
             agents_number,
             average_period,
             participation_rate,
             env,
             policy,
             baseline,
             difference_params=False,
             quantize=False,
             quantization_tuning=4,
             optimizer=None,
             optimizer_args=None,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        # one independent optimizer per agent
        optimizer = [FirstOrderOptimizer(**optimizer_args) for _ in range(agents_number)]
    self.agents = [
        Agent(env=env, policy=policy, optimizer=agent_optimizer, baseline=baseline,
              difference_params=difference_params, quantize=quantize,
              quantization_tuning=quantization_tuning, **kwargs)
        for agent_optimizer in optimizer
    ]
    self.baseline = baseline
    self.average_period = average_period
    self.participation_rate = participation_rate
    self.transferred_bits = 0
    super(Server, self).__init__(agents_number=agents_number, average_period=average_period,
                                 participation_rate=participation_rate, env=env, policy=policy,
                                 baseline=baseline, difference_params=difference_params,
                                 quantize=quantize, quantization_tuning=quantization_tuning,
                                 optimizer=optimizer, optimizer_args=optimizer_args, **kwargs)
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.01,
             truncate_local_is_ratio=None,
             **kwargs):
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        optimizer = FirstOrderOptimizer(**optimizer_args)
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    super(VPG, self).__init__(**kwargs)
def __init__(self,
             num_of_agents,
             env,
             policy,
             policy_list,
             baseline,
             baseline_list,
             optimizer=None,
             optimizer_args=None,
             with_critic=True,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(**optimizer_args)
        # optimizer = MyFirstOrderOptimizer(**optimizer_args)
    self.optimizer = optimizer
    self.opt_info = None
    self.num_of_agents = num_of_agents
    self.sampler_list = [BatchSampler_Multi(self, i, with_critic) for i in range(self.num_of_agents)]
    self.optimizer_list = [pickle.loads(pickle.dumps(self.optimizer)) for _ in range(self.num_of_agents)]
    super(VPG_multi, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs)
    self.policy_list = policy_list
    self.baseline_list = baseline_list
def __init__(self,
             env,
             policy,
             baseline,
             optimizer=None,
             optimizer_args=None,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(**optimizer_args)
    self.optimizer = optimizer
    self.opt_info = None
    super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs)
def __init__(self,
             agents_number,
             average_period,
             server_env,
             policy,
             baseline,
             optimizer=None,
             optimizer_args=None,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        # one independent optimizer per agent
        optimizer = [FirstOrderOptimizer(**optimizer_args) for _ in range(agents_number)]
    self.agents = [
        Agent(env=server_env.agents_envs[k], policy=policy, optimizer=agent_optimizer,
              baseline=baseline, **kwargs)
        for k, agent_optimizer in enumerate(optimizer)
    ]
    self.baseline = baseline
    self.average_period = average_period
    super(Server, self).__init__(agents_number=agents_number, average_period=average_period,
                                 env=server_env, policy=policy, baseline=baseline,
                                 optimizer=optimizer, optimizer_args=optimizer_args, **kwargs)
class Hippo(BatchPolopt):
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 average_period=10,  # average over all the periods
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        # import pdb; pdb.set_trace()
        # self.sampler = BatchSampler(self)
        self.sampler = HierBatchSampler(self, period=None)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicyRandomTime)
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)

    def init_opt(self):
        obs_var = ext.new_tensor('obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype
        manager_obs_var = ext.new_tensor('manager_obs', ndim=2, dtype=theano.config.floatX)
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)

        # this will have to be the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor('manager_advantage', ndim=1, dtype=theano.config.floatX)
        skill_advantage_var = ext.new_tensor('skill_advantage', ndim=1, dtype=theano.config.floatX)
        latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX)
        latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX)
        manager_prob_var = ext.new_tensor('manager_prob', ndim=2, dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(manager_obs_var)['prob']
        # old_latent_probs = self.old_policy.manager.dist_info_sym(manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse, axis=1)
        lr = TT.exp(TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_var = self.policy.low_policy.dist_info_sym(obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var, old_dist_info_var, dist_info_var)
        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss

        input_list = [obs_var, manager_obs_var, action_var, manager_advantage_var, skill_advantage_var,
                      latent_var, latent_var_sparse, mean_var, log_std_var, manager_prob_var]

        self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # print(len(samples_data['observations']), self.period)
        # assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos"))
        time_remaining = input_values[3]['time_remaining']
        resampled_period = input_values[3]['resampled_period']
        obs_var = np.insert(input_values[0], self.policy.obs_robot_dim, time_remaining, axis=1)
        manager_obs_var = obs_var[resampled_period]
        action_var = input_values[1]
        manager_adv_var = input_values[2][resampled_period]
        latent_var = input_values[3]['latents']
        latent_var_sparse = latent_var[resampled_period]
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        prob = input_values[3]['prob'][resampled_period]
        if self.use_skill_dependent_baseline:
            skill_adv_var = input_values[4]
        else:
            skill_adv_var = input_values[2]
        all_input_values = (obs_var, manager_obs_var, action_var, manager_adv_var, skill_adv_var,
                            latent_var, latent_var_sparse, mean, log_std, prob)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr, policy=self.policy, baseline=self.baseline, env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
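# A minimal NumPy sketch of how Hippo.init_opt combines its two clipped terms: the manager ratio
# comes from the stored latent probabilities and the (one-hot) latent actually sampled, the skill
# ratio is taken as given, and the manager term is down-weighted by 1/average_period. The inputs
# below are illustrative placeholders, not produced by the class above.
import numpy as np

def hippo_surrogate(manager_probs, old_manager_probs, latents_onehot, manager_adv,
                    skill_lr, skill_adv, average_period=10, epsilon=0.1):
    # probability the current / old manager assigns to the latent that was actually sampled
    p_new = np.sum(manager_probs * latents_onehot, axis=1)
    p_old = np.sum(old_manager_probs * latents_onehot, axis=1)
    lr = np.exp(np.log(p_new) - np.log(p_old))
    manager_loss = -np.mean(np.minimum(lr * manager_adv,
                                       np.clip(lr, 1 - epsilon, 1 + epsilon) * manager_adv))
    skill_loss = -np.mean(np.minimum(skill_lr * skill_adv,
                                     np.clip(skill_lr, 1 - epsilon, 1 + epsilon) * skill_adv))
    return manager_loss / average_period + skill_loss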
class PPO_flat(BatchPolopt):
    """
    Normal clipped PPO version of the HiPPO one that this paper investigates.
    """

    # double check this constructor later
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(PPO_flat, self).__init__(**kwargs)  # not sure if this line is correct
        # i hope this is right
        self.debug_fns = []
        # self.old_policy = copy.deepcopy(self.policy)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    def init_opt(self):
        obs_var = self.env.observation_space.new_tensor_variable('obs', extra_dims=1)
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)
        advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX)

        old_dist_info_vars = dict(mean=mean_var, log_std=log_std_var)
        dist_info_vars = self.policy.dist_info_sym(obs_var)

        lr = self.policy.distribution.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        surr_loss_vector = TT.minimum(lr * advantage_var,
                                      TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        surr_loss = -TT.mean(surr_loss_vector)

        input_list = [obs_var, action_var, advantage_var, mean_var, log_std_var]

        self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                         "agent_infos"))
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        all_input_values = (input_values[0], input_values[1], input_values[2], mean, log_std)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr, policy=self.policy, baseline=self.baseline, env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
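# A minimal NumPy sketch of the clipped surrogate that PPO_flat.init_opt builds symbolically,
# assuming a diagonal Gaussian policy. The arrays (actions, advantages, old/new means and
# log-stds) are illustrative placeholders, not part of the class above.
import numpy as np

def gaussian_log_likelihood(actions, mean, log_std):
    # sum of per-dimension log densities of a diagonal Gaussian
    zs = (actions - mean) / np.exp(log_std)
    return (-0.5 * np.sum(zs ** 2, axis=-1)
            - np.sum(log_std, axis=-1)
            - 0.5 * actions.shape[-1] * np.log(2 * np.pi))

def clipped_surrogate(actions, advantages, new_mean, new_log_std, old_mean, old_log_std, epsilon=0.1):
    lr = np.exp(gaussian_log_likelihood(actions, new_mean, new_log_std)
                - gaussian_log_likelihood(actions, old_mean, old_log_std))
    clipped = np.clip(lr, 1 - epsilon, 1 + epsilon)
    return -np.mean(np.minimum(lr * advantages, clipped * advantages))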
class PG_concurrent(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time
    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(self,
                 manager_optimizer=None,
                 optimizer=None,
                 snn_optimizer=None,
                 optimizer_args=None,
                 step_size=1e-6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, **optimizer_args)
        # I hope this is right
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio

        super(PG_concurrent, self).__init__(**kwargs)  # not sure if this line is correct
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )
        obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)

        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs', ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        assert isinstance(self.policy, HierarchicalPolicy)
        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw,
                             [obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]])
        # obs_var = obs_var_raw

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(obs_var_sparse)['prob']

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input
        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(obs_var)

        probs = [TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info))
                 for dist_info in dist_info_vars]

        # need to reshape at the end
        reshaped_probs = [TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
                          for prob in probs]

        # now, multiply out each row and concatenate
        subtrajectory_probs = TT.stack([TT.prod(reshaped_prob, axis=1)
                                        for reshaped_prob in reshaped_probs], axis=1)
        # shape error might come out of here

        # elementwise multiplication, then sum up each individual row and take log
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))

        surr_loss = -TT.mean(likelihood * advantage_var)

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages"))
        # print(input_values[0].shape)
        obs_raw = input_values[0].reshape(input_values[0].shape[0] // self.period, self.period,
                                          input_values[0].shape[1])
        # obs_raw = input_values[0]
        obs_sparse = input_values[0].take([i for i in range(0, input_values[0].shape[0], self.period)],
                                          axis=0)
        advantage_sparse = np.sum(
            input_values[2].reshape([input_values[2].shape[0] // self.period, self.period]), axis=1)
        all_input_values = (obs_raw, obs_sparse, input_values[1], advantage_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def optimize_manager(self, itr, samples_data):
        pass

    def optimize_snn(self, itr, samples_data):
        pass

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr, policy=self.policy, baseline=self.baseline, env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
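# A minimal NumPy sketch of the marginalized objective that PG_concurrent.init_opt expresses in
# Theano: per-timestep action probabilities under every latent are multiplied within each
# period-long sub-trajectory, mixed with the manager's latent probabilities, and the log of that
# mixture is weighted by the sparse advantage. action_probs and latent_probs are placeholders.
import numpy as np

def marginal_surrogate(action_probs, latent_probs, advantage_sparse, period=10):
    # action_probs: (T, K) prob of each executed action under each of K latents
    # latent_probs: (T // period, K) manager probabilities at each decision point
    T, K = action_probs.shape
    reshaped = action_probs.reshape(T // period, period, K)
    subtraj_probs = np.prod(reshaped, axis=1)                # (T // period, K)
    likelihood = np.log(np.sum(subtraj_probs * latent_probs, axis=1))
    return -np.mean(likelihood * advantage_sparse)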
def __init__(self,
             env_spec,
             policy,
             recurrent=False,
             predict_all=True,
             obs_regressed='all',
             act_regressed='all',
             use_only_sign=False,
             noisify_traj_coef=0,
             optimizer=None,  # this defaults to LBFGS
             regressor_args=None,  # here goes all args straight to the regressor: hidden_sizes, TR, step_size....
             ):
    """
    :param predict_all: this is only for the recurrent case, to use all hidden states as predictions
    :param obs_regressed: list of index of the obs variables used to fit the regressor. default string 'all'
    :param act_regressed: list of index of the act variables used to fit the regressor. default string 'all'
    :param regressor_args:
    """
    self.env_spec = env_spec
    self.policy = policy
    self.latent_dim = policy.latent_dim
    self.recurrent = recurrent
    self.predict_all = predict_all
    self.use_only_sign = use_only_sign
    self.noisify_traj_coef = noisify_traj_coef
    self.regressor_args = regressor_args

    # decide what obs variables will be regressed upon
    if obs_regressed == 'all':
        self.obs_regressed = list(range(env_spec.observation_space.flat_dim))
    else:
        self.obs_regressed = obs_regressed
    # decide what action variables will be regressed upon
    if act_regressed == 'all':
        self.act_regressed = list(range(env_spec.action_space.flat_dim))
    else:
        self.act_regressed = act_regressed
    # shape the input dimension of the NN for the above decisions.
    self.obs_act_dim = len(self.obs_regressed) + len(self.act_regressed)

    Serializable.quick_init(self, locals())  # ??

    if regressor_args is None:
        regressor_args = dict()

    if optimizer == 'first_order':
        self.optimizer = FirstOrderOptimizer(
            max_epochs=10,  # both of these are to match Rocky's 10
            batch_size=128,
        )
    elif optimizer is None:
        self.optimizer = None
    else:
        raise NotImplementedError

    if policy.latent_name == 'bernoulli':
        if self.recurrent:
            self._regressor = BernoulliRecurrentRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                predict_all=self.predict_all,
                **regressor_args)
        else:
            self._regressor = BernoulliMLPRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
    elif policy.latent_name == 'categorical':
        if self.recurrent:
            self._regressor = CategoricalRecurrentRegressor(  # not implemented
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                # predict_all=self.predict_all,
                **regressor_args)
        else:
            self._regressor = CategoricalMLPRegressor(
                input_shape=(self.obs_act_dim,),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
    elif policy.latent_name == 'normal':
        self._regressor = GaussianMLPRegressor(
            input_shape=(self.obs_act_dim,),
            output_dim=policy.latent_dim,
            optimizer=self.optimizer,
            **regressor_args)
    else:
        raise NotImplementedError
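# A minimal sketch of how the obs_regressed / act_regressed index lists shape the regressor input:
# the selected observation and action columns are concatenated into a matrix whose width matches
# self.obs_act_dim above. The arrays and the helper name are illustrative, not part of this class.
import numpy as np

def build_regressor_input(observations, actions, obs_regressed, act_regressed):
    # observations: (T, obs_dim), actions: (T, act_dim)
    selected = np.concatenate([observations[:, obs_regressed], actions[:, act_regressed]], axis=1)
    return selected  # shape (T, len(obs_regressed) + len(act_regressed))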
class ConcurrentContinuousPPO(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time
    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=10,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 freeze_manager=False,
                 freeze_skills=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(ConcurrentContinuousPPO, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        self.period = self.policy.period
        assert self.policy.period == self.period
        self.continuous_latent = self.policy.continuous_latent
        assert self.continuous_latent
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    # assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid
    def init_opt(self):
        assert isinstance(self.policy, HierarchicalPolicy)
        assert not self.freeze_manager and not self.freeze_skills
        manager_surr_loss = 0
        # skill_surr_loss = 0

        obs_var_sparse = ext.new_tensor('sparse_obs', ndim=2, dtype=theano.config.floatX)
        obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)
        advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX)
        # latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX)

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw,
                             [obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]])

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        latent_var_sparse = self.policy.manager.dist_info_sym(obs_var_sparse)['mean']
        latent_var = TT.extra_ops.repeat(latent_var_sparse, self.period, axis=0)  # .dimshuffle(0, 'x')
        dist_info_var = self.policy.low_policy.dist_info_sym(obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var, old_dist_info_var, dist_info_var)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = skill_surr_loss  # so that the relative magnitudes are correct

        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var, mean_var, log_std_var]

        self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos"))

        obs_raw = input_values[0].reshape(input_values[0].shape[0] // self.period, self.period,
                                          input_values[0].shape[1])
        obs_sparse = input_values[0].take([i for i in range(0, input_values[0].shape[0], self.period)],
                                          axis=0)
        if not self.continuous_latent:
            advantage_sparse = input_values[2].reshape(
                [input_values[2].shape[0] // self.period, self.period])[:, 0]
            latents = input_values[3]['latents']
            latents_sparse = latents.take([i for i in range(0, latents.shape[0], self.period)], axis=0)
            prob = np.array(list(input_values[3]['prob'].take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)), dtype=np.float32)

        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        if self.use_skill_dependent_baseline:
            advantage_var = input_values[4]
        else:
            advantage_var = input_values[2]
        # import ipdb; ipdb.set_trace()

        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            all_input_values = (obs_raw, obs_sparse, input_values[1], advantage_var, mean, log_std)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values(trainable=True)
        # self.old_policy.set_param_values(old_param_values, trainable=True)
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr, policy=self.policy, baseline=self.baseline, env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
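# A minimal NumPy sketch of the continuous-latent bookkeeping above: the manager only sees every
# period-th observation, and its output latent is repeated over the following period timesteps,
# mirroring the TT.extra_ops.repeat call in init_opt. Inputs are illustrative placeholders.
import numpy as np

def expand_manager_latents(observations, manager_fn, period=10):
    # observations: (T, obs_dim) with T a multiple of period
    # manager_fn maps (N, obs_dim) -> (N, latent_dim)
    obs_sparse = observations[::period]                  # one observation per manager decision
    latents_sparse = manager_fn(obs_sparse)              # (T // period, latent_dim)
    latents = np.repeat(latents_sparse, period, axis=0)  # (T, latent_dim), one latent per timestep
    return obs_sparse, latents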
class CriticEval:
    def __init__(self,
                 qf,
                 policy,
                 min_pool_size=10000,
                 replay_pool_size=1000000,
                 replacement_prob=1.0,
                 qf_batch_size=32,
                 qf_weight_decay=0.,
                 qf_update_method='adam',
                 qf_learning_rate=1e-3,
                 qf_use_target=True,
                 soft_target_tau=0.001,
                 ):
        self.soft_target_tau = soft_target_tau
        self.min_pool_size = min_pool_size
        self.replay_pool_size = replay_pool_size
        self.replacement_prob = replacement_prob
        self.qf_batch_size = qf_batch_size
        self.qf_weight_decay = qf_weight_decay
        self.qf_update_method = FirstOrderOptimizer(update_method=qf_update_method,
                                                    learning_rate=qf_learning_rate)
        self.qf_use_target = qf_use_target
        self.discount = 0.99
        self.qf = qf
        self.policy = policy
        self.qf_loss_averages = []
        self.q_averages = []
        self.y_averages = []

    def init_opt_critic(self, obs_dim, act_dim):
        target_qf = self.qf
        extra_dims = 1
        obs = tf.placeholder(tf.float32, shape=[None] * extra_dims + list([obs_dim]), name='qf_obs')
        action = tf.placeholder(tf.float32, shape=[None] * extra_dims + list([act_dim]), name='qf_action')
        # obs = tf.placeholder(tf.float32, shape=list([obs_dim] + [None] * extra_dims), name='qf_obs')
        # action = tf.placeholder(tf.float32, shape=list([act_dim] + [None] * extra_dims), name='qf_action')
        yvar = tf.placeholder(dtype=tf.float32, shape=[None], name='ys')

        # qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        #     sum([tf.reduce_sum(tf.square(param)) for param in
        #          self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)
        qf_loss = tf.reduce_mean(tf.square(yvar - qval))

        qf_input_list = [yvar, obs, action]
        qf_output_list = [qf_loss, qval]

        # qf_reg_loss = qf_loss + qf_weight_decay_term
        qf_reg_loss = qf_loss
        self.qf_update_method.update_opt(loss=qf_reg_loss, target=self.qf, inputs=qf_input_list)
        # qf_output_list += [self.qf_update_method._train_op]

        f_train_qf = compile_function(inputs=qf_input_list, outputs=qf_output_list,
                                      sess=tf.get_default_session())

        self.opt_info_critic = dict(f_train_qf=f_train_qf, target_qf=target_qf)

    def do_critic_training(self, batch):
        obs = batch['states']
        actions = batch['actions']
        rewards = batch['rewards']
        next_obs = batch['states_']
        terminals = batch['terminals']

        target_qf = self.opt_info_critic["target_qf"]
        target_policy = self.policy
        next_qvals = target_qf.get_e_qval(next_obs, target_policy)

        ys = rewards + (1. - terminals) * self.discount * next_qvals
        inputs = (ys, obs, actions)
        qf_outputs = self.opt_info_critic['f_train_qf'](*inputs)
        qf_loss = qf_outputs.pop(0)
        qval = qf_outputs.pop(0)

        if self.qf_use_target:
            target_qf.set_param_values(
                target_qf.get_param_values() * (1.0 - self.soft_target_tau) +
                self.qf.get_param_values() * self.soft_target_tau)

        self.qf_loss_averages.append(qf_loss)
        self.q_averages.append(qval)
        self.y_averages.append(ys)

    def optimize_critic(self, batch, batch_size):
        """
        Train the critic for batch sampling-based policy optimization methods
        :param samples:
        :param batch_size:
        :param policy:
        :return:
        """
        qf_updates_ratio = 1
        qf_itrs = float(batch_size) * qf_updates_ratio
        qf_itrs = int(np.ceil(qf_itrs))
        for i in range(qf_itrs):
            # Train critic
            self.do_critic_training(batch)
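# A minimal NumPy sketch of what do_critic_training computes: a TD(0) target from the replayed
# transitions and a Polyak (soft) update of the target Q-function parameters. The reward,
# terminal, Q-value, and parameter arrays are illustrative placeholders.
import numpy as np

def td_targets(rewards, terminals, q_next, discount=0.99):
    # ys = r + (1 - done) * gamma * Q'(s', pi(s'))
    return rewards + (1. - terminals) * discount * q_next

def soft_target_update(target_params, live_params, tau=0.001):
    # theta_target <- (1 - tau) * theta_target + tau * theta
    return target_params * (1.0 - tau) + live_params * tau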
class Concurrent_PPO(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time
    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 use_skill_dependent_baseline=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
            assert self.policy.period == self.period
        self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = ((curr_env.observation_space.shape[0] + 1) * self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )
        obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)

        # this will have to be the advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor('sparse_advantage', ndim=1, dtype=theano.config.floatX)
        advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX)
        obs_var_sparse = ext.new_tensor(
            'sparse_obs', ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )
        latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX)
        latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)
        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw,
                             [obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]])
        # obs_var = obs_var_raw

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(obs_var_sparse)['prob']
        old_latent_probs = self.old_policy.manager.dist_info_sym(obs_var_sparse)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(old_latent_probs * latent_var_sparse, axis=1)
        lr = actual_latent_probs / old_actual_latent_probs
        manager_surr_loss_vector = TT.minimum(
            lr * advantage_var_sparse,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var_sparse)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)
        # manager_surr_loss = - TT.mean(TT.log(actual_latent_probs) * advantage_var_sparse)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input
        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(obs_var)
        probs = TT.stack([self.diagonal.log_likelihood_sym(action_var, dist_info)
                          for dist_info in dist_info_vars], axis=1)
        actual_action_log_probs = TT.sum(probs * latent_var, axis=1)
        # todo: verify that dist_info_vars is in order

        # old policy stuff
        old_dist_info_vars = self.old_policy.low_policy.dist_info_sym_all_latents(obs_var)
        old_probs = TT.stack([self.diagonal.log_likelihood_sym(action_var, dist_info)
                              for dist_info in old_dist_info_vars], axis=1)
        old_actual_action_log_probs = TT.sum(old_probs * latent_var, axis=1)
        skill_lr = TT.exp(actual_action_log_probs - old_actual_action_log_probs)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)
        # skill_surr_loss = - TT.mean(actual_action_log_probs*advantage_var)

        surr_loss = manager_surr_loss / self.period + skill_surr_loss  # so that the relative magnitudes are correct

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var, advantage_var_sparse,
                      latent_var, latent_var_sparse]
        # input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(ext.extract(samples_data, "observations", "actions", "advantages",
                                             "agent_infos"))

        obs_raw = input_values[0].reshape(input_values[0].shape[0] // self.period, self.period,
                                          input_values[0].shape[1])
        obs_sparse = input_values[0].take([i for i in range(0, input_values[0].shape[0], self.period)],
                                          axis=0)
        advantage_sparse = input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take([i for i in range(0, latents.shape[0], self.period)], axis=0)

        if self.use_skill_dependent_baseline:
            all_input_values = (obs_raw, obs_sparse, input_values[1], input_values[4], advantage_sparse,
                                latents, latents_sparse)
        else:
            all_input_values = (obs_raw, obs_sparse, input_values[1], input_values[2], advantage_sparse,
                                latents, latents_sparse)

        # todo: assign current parameters to old policy; does this work?
        old_param_values = self.policy.get_param_values(trainable=True)
        self.old_policy.set_param_values(old_param_values, trainable=True)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr, policy=self.policy, baseline=self.baseline, env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
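# A minimal NumPy sketch of the preprocessing done in optimize_policy above: the flat batch is cut
# into period-long sub-trajectories for the skills, while the manager gets the first observation,
# the first advantage, and the latent of each sub-trajectory. Inputs are illustrative placeholders.
import numpy as np

def split_manager_and_skill_inputs(observations, advantages, latents, period=10):
    T = observations.shape[0]
    assert T % period == 0
    obs_raw = observations.reshape(T // period, period, observations.shape[1])
    obs_sparse = observations[::period]
    advantage_sparse = advantages.reshape(T // period, period)[:, 0]
    latents_sparse = latents[::period]
    return obs_raw, obs_sparse, advantage_sparse, latents_sparse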