class PPO(Algo, Serializable):
    """
    Proximal Policy Optimization with optional reward-predictor and supervised
    (distillation) models, for recurrent (RL^2-style) policies.

    Args:
        policy (Policy): policy object
        supervised_model: optional model trained with a supervised (distillation) loss
        supervised_ground_truth (str): source of distillation targets - 'teacher' or 'agent'
        name (str): tf variable scope
        learning_rate (float): learning rate for the objective
        clip_eps (float): PPO clipping range for the likelihood ratio
        max_epochs (int): number of optimization epochs per policy update
        max_epochs_r (int): number of optimization epochs for the reward predictor and supervised model
        entropy_bonus (float): coefficient on the policy entropy term
        reward_predictor: optional model trained to predict rewards
        reward_predictor_type (str): 'gaussian' or categorical reward-predictor head
        grad_clip_threshold (float): gradient-clipping threshold passed to the optimizers
    """

    def __init__(self,
                 policy,
                 supervised_model=None,
                 supervised_ground_truth='teacher',
                 name="ppo",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 max_epochs_r=20,
                 entropy_bonus=0.,
                 reward_predictor=None,
                 reward_predictor_type='gaussian',
                 grad_clip_threshold=None,
                 **kwargs):
        # TODO: Check to avoid duplicates of variables and scopes
        self.reward_predictor = reward_predictor
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)
        self.recurrent = getattr(self.policy, 'recurrent', False)
        self.supervised_model = supervised_model
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
            if self.reward_predictor is not None:
                self.optimizer_r = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
            if self.supervised_model is not None:
                self.optimizer_s = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        # Keys extracted from the sampled paths and fed to the optimizers
        # (see _extract_input_dict in optimize_policy / optimize_reward).
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self._optimization_r_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus
        self.supervised_ground_truth = supervised_ground_truth
        self.reward_predictor_type = reward_predictor_type
        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph: placeholders for the sampled data, the
        clipped PPO surrogate for the policy, and (optionally) the
        reward-predictor and supervised losses, each wired into its own optimizer.
        """
        """ Create Variables """

        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        discrete = isinstance(self.policy, DiscreteRNNPolicy)
        obs_ph, action_ph, adv_ph, r_ph, obs_r_ph, dist_info_old_ph, all_phs_dict, ground_truth_action_ph = \
            self._make_input_placeholders('train', recurrent=self.recurrent, discrete=discrete)
        self.op_phs_dict.update(all_phs_dict)

        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(obs_ph)
            # TODO: Check if anything is problematic here, when obs is concatenating previous reward
            if self.reward_predictor is not None:
                distribution_info_vars_r, hidden_ph_r, next_hidden_var_r = \
                    self.reward_predictor.distribution_info_sym(obs_r_ph)
                if self.reward_predictor_type == 'gaussian':
                    distribution_info_vars_r["mean"] = distribution_info_vars_r["mean"][:, :, 0]
                    distribution_info_vars_r["log_std"] = distribution_info_vars_r["log_std"][:, 0]  # TODO: uncomment
            if self.supervised_model is not None:
                distribution_info_vars_s, hidden_ph_s, next_hidden_var_s = \
                    self.supervised_model.distribution_info_sym(obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None

        """ Outer objective """
        # TODO: Check if anything changes for discrete
        likelihood_ratio = self.policy.distribution.likelihood_ratio_sym(
            action_ph, dist_info_old_ph, distribution_info_vars)

        # TODO: Check if anything changes for discrete
        clipped_obj = tf.minimum(
            likelihood_ratio * adv_ph,
            tf.clip_by_value(likelihood_ratio, 1 - self._clip_eps,
                             1 + self._clip_eps) * adv_ph)

        # TODO: Check that the discrete entropy looks fine
        mask = tf.reduce_sum(all_phs_dict['train_agent_infos/probs'], axis=2)
        ent = self.policy.distribution.entropy_sym(distribution_info_vars) * mask
        self.log_values = [
            likelihood_ratio, adv_ph, clipped_obj, dist_info_old_ph,
            distribution_info_vars, ent
        ]
        self.reward_loss = tf.reduce_mean(clipped_obj)
        self.entropy_loss = self.entropy_bonus * tf.reduce_mean(
            self.policy.distribution.entropy_sym(distribution_info_vars) * mask)
        surr_obj = - tf.reduce_mean(clipped_obj) - self.entropy_bonus * \
            tf.reduce_mean(self.policy.distribution.entropy_sym(distribution_info_vars))

        if self.reward_predictor is not None:
            if self.reward_predictor_type == 'gaussian':
                r_obj = -tf.reduce_mean(
                    self.reward_predictor.distribution.log_likelihood_sym(
                        r_ph, distribution_info_vars_r))
            else:
                r_obj = -tf.reduce_mean(
                    tf.exp(5 * r_ph) *
                    self.reward_predictor.distribution.log_likelihood_sym(
                        tf.cast(r_ph, tf.int32), distribution_info_vars_r))

            # TODO: what's this?
            self.optimizer_r.build_graph(loss=r_obj,
                                         target=self.reward_predictor,
                                         input_ph_dict=self.op_phs_dict,
                                         hidden_ph=hidden_ph_r,
                                         next_hidden_var=next_hidden_var_r)

        if self.supervised_model is not None:
            if self.supervised_ground_truth == 'teacher':
                action_logits = tf.log(distribution_info_vars_s['probs'])
                ground_truth = tf.squeeze(
                    tf.one_hot(ground_truth_action_ph, action_logits.shape[-1]), axis=2)
                sup_learning_loss = tf.compat.v1.losses.softmax_cross_entropy(
                    ground_truth,
                    action_logits,
                    weights=mask,
                )
                self.log_values_sup = [sup_learning_loss, action_logits, ground_truth]
            elif self.supervised_ground_truth == 'agent':
                old_prob_var = all_phs_dict['train_agent_infos/probs']
                new_prob_var = distribution_info_vars_s['probs']
                TINY = tf.constant(1e-6)
                # TODO: we could switch to this loss function instead, but for whatever reason it gives errors.
                # diff = new_prob_var - old_prob_var
                mask = tf.expand_dims(tf.reduce_sum(old_prob_var, axis=2), axis=2)
                sup_learning_loss = tf.reduce_sum(
                    mask * old_prob_var *
                    (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)))
                # diff = diff * mask
                # sup_learning_loss = tf.reduce_mean(diff**2)
                self.log_values_sup = [old_prob_var, new_prob_var, sup_learning_loss, mask]
            else:
                raise NotImplementedError
            # self.log_values_sup = [action_logits, distribution_info_vars_s['probs'], ground_truth]
            self.optimizer_s.build_graph(loss=sup_learning_loss,
                                         target=self.supervised_model,
                                         input_ph_dict=self.op_phs_dict,
                                         hidden_ph=hidden_ph_s,
                                         next_hidden_var=next_hidden_var_s)

        self.optimizer.build_graph(loss=surr_obj,
                                   target=self.policy,
                                   input_ph_dict=self.op_phs_dict,
                                   hidden_ph=hidden_ph,
                                   next_hidden_var=next_hidden_var)

    def optimize_policy(self, samples_data, log=True, prefix='', verbose=False):
        """
        Performs the outer-loop policy update.

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        entropy_loss, reward_loss = self.optimizer.compute_loss_variations(
            input_dict, self.entropy_loss, self.reward_loss, self.log_values)

        if verbose:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if verbose:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'Loss/LossBefore', loss_before)
            logger.logkv(prefix + 'Loss/LossAfter', loss_after)
            logger.logkv(prefix + 'Loss/PartialLossEntropy', entropy_loss)
            logger.logkv(prefix + 'Loss/PartialLossReward', reward_loss)

    def optimize_reward(self, samples_data, log=True, prefix='', verbose=False):
        """
        Performs the outer-loop update of the reward predictor.

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_r_keys,
                                              prefix='train')

        if verbose:
            logger.log("Optimizing")
        loss_before = self.optimizer_r.optimize(input_val_dict=input_dict)

        if verbose:
            logger.log("Computing statistics")
        loss_after = self.optimizer_r.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'RewardLossBefore', loss_before)
            logger.logkv(prefix + 'RewardLossAfter', loss_after)

    def optimize_supervised(self, samples_data, log=True, prefix='', verbose=False):
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        self.optimizer_s.compute_loss_variations(input_dict, None, None,
                                                 self.log_values_sup)

        if verbose:
            logger.log("Optimizing Supervised Model")
        loss_before = self.optimizer_s.optimize(input_val_dict=input_dict)

        if verbose:
            logger.log("Computing statistics")
        loss_after = self.optimizer_s.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'SupervisedLossBefore', loss_before)
            logger.logkv(prefix + 'SupervisedLossAfter', loss_after)

    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        print('getstate\n')
        print(state['init_args'])
        state['policy'] = self.policy.__getstate__()
        state['optimizer'] = self.optimizer.__getstate__()
        return state

    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.policy.__setstate__(state['policy'])
        self.optimizer.__setstate__(state['optimizer'])
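# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): when
# supervised_ground_truth == 'agent', build_graph() trains the supervised model
# with a masked KL divergence between the behaviour policy's action
# probabilities and the supervised model's. The standalone NumPy function below
# mirrors that loss; every name in it is local to the sketch.
# ---------------------------------------------------------------------------
import numpy as np


def distill_kl_sketch(old_probs, new_probs, tiny=1e-6):
    """Masked sum of KL(old || new) over (batch, time, actions); zero-padded steps contribute nothing."""
    mask = old_probs.sum(axis=2, keepdims=True)  # 1 for real steps, 0 for zero-padded steps
    return np.sum(mask * old_probs * (np.log(old_probs + tiny) - np.log(new_probs + tiny)))


# toy check: identical distributions give (essentially) zero loss
_p = np.full((2, 3, 4), 0.25)  # (batch, time, n_actions)
assert abs(distill_kl_sketch(_p, _p)) < 1e-6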
class BPTTSampler(BaseSampler):
    """
    Sampler that unrolls the policy through a learned dynamics model
    (backpropagation through time) instead of stepping the real environment.

    Args:
        env (meta_mb.meta_envs.base.MetaEnv) : environment object (provides tf_reward)
        policy (meta_mb.policies.base.Policy) : policy object
        dynamics_model : learned dynamics model used for the symbolic rollouts
        num_rollouts (int) : number of rollouts collected per call
        max_path_length (int) : max number of steps per trajectory
        parallel (bool) : must be False; parallel sampling is not supported
    """

    def __init__(
            self,
            env,
            policy,
            dynamics_model,
            num_rollouts,
            max_path_length,
            parallel=False,
            deterministic_policy=False,
            optimize_actions=False,
            max_epochs=2,
            learning_rate=1e-4,
            **kwargs,
    ):
        super(BPTTSampler, self).__init__(env, policy, num_rollouts, max_path_length)
        assert not parallel

        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.max_path_length = max_path_length
        self.total_samples = num_rollouts * max_path_length
        self.num_rollouts = num_rollouts
        self.total_timesteps_sampled = 0
        self.deterministic_policy = deterministic_policy
        self.optimize_actions = optimize_actions
        self.num_models = getattr(dynamics_model, 'num_models', 1)
        self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                             max_epochs=max_epochs)

        self.build_graph()

    def build_graph(self):
        self._initial_obs_ph = tf.placeholder(dtype=tf.float32,
                                              shape=(self.num_rollouts, self.policy.obs_dim),
                                              name='init_obs')
        obses = []
        acts = []
        rewards = []
        means = []
        log_stds = []
        obs = self._initial_obs_ph
        # Unroll the policy through the learned dynamics model for max_path_length steps.
        for t in range(self.max_path_length):
            dist_policy = self.policy.distribution_info_sym(obs)
            act, dist_policy = self.policy.distribution.sample_sym(dist_policy)
            next_obs = self.dynamics_model.predict_sym(obs, act)
            reward = self.env.tf_reward(obs, act, next_obs)

            obses.append(obs)
            acts.append(act)
            rewards.append(reward)
            means.append(dist_policy['mean'])
            log_stds.append(dist_policy['log_std'])

            obs = next_obs

        # rewards = tf.stack(tf.split(tf.transpose(tf.stack(rewards, axis=0)), self.num_models))
        # random_weights = tf.random.uniform(shape=(self.num_models, self.num_rollouts, self.max_path_length))
        # rewards = rewards * random_weights / tf.reduce_sum(random_weights, axis=0)
        self._returns_var = tf.reduce_sum(rewards, axis=0)
        self._rewards_var = rewards
        self._actions_var = acts
        self._observations_var = obses
        self._means_var = means
        self._log_stds_var = log_stds

    def obtain_samples(self, log=False, log_prefix='', buffer=None):
        """
        Collect num_rollouts trajectories by running the unrolled model graph.

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger

        Returns:
            (list) : a list of num_rollouts path dicts, each of length max_path_length
        """
        # initial setup / preparation
        policy = self.policy
        policy.reset(dones=[True] * self.num_rollouts)

        # initial reset of meta_envs
        init_obses = np.array([self.env.reset() for _ in range(self.num_rollouts)])

        sess = tf.get_default_session()
        observations, actions, means, log_stds, rewards = sess.run(
            [self._observations_var, self._actions_var, self._means_var,
             self._log_stds_var, self._rewards_var],
            feed_dict={self._initial_obs_ph: init_obses})

        # Outputs are time-major; transpose to (num_rollouts, max_path_length, dim).
        means = np.array(means).transpose((1, 0, 2))
        log_stds = np.array(log_stds).transpose((1, 0, 2))
        if log_stds.shape[0] == 1:
            log_stds = np.repeat(log_stds, self.num_rollouts, axis=0)
        agent_infos = [dict(mean=mean, log_std=log_std)
                       for mean, log_std in zip(means, log_stds)]
        observations = np.array(observations).transpose((1, 0, 2))
        actions = np.array(actions).transpose((1, 0, 2))
        rewards = np.array(rewards).T
        dones = [[False for _ in range(self.max_path_length)]
                 for _ in range(self.num_rollouts)]
        env_infos = [dict() for _ in range(self.num_rollouts)]
        paths = [
            dict(observations=obs,
                 actions=act,
                 rewards=rew,
                 dones=done,
                 env_infos=env_info,
                 agent_infos=agent_info)
            for obs, act, rew, done, env_info, agent_info in zip(
                observations, actions, rewards, dones, env_infos, agent_infos)
        ]
        self.total_timesteps_sampled += self.total_samples
        logger.logkv('ModelSampler-n_timesteps', self.total_timesteps_sampled)

        return paths

    def optimize_policy(self, log=True):
        init_obses = np.array(
            [self.env.reset() for _ in range(self.num_rollouts)] * self.num_models)
        input_dict = dict(initial_obs=init_obses)
        self.optimizer.optimize(input_dict)
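# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): obtain_samples() above receives time-major
# outputs from the unrolled graph (length-T lists of (num_rollouts, dim) arrays)
# and repackages them into one path dict per rollout. The snippet below mirrors
# that reshaping with random data; none of these names belong to the sampler.
# ---------------------------------------------------------------------------
import numpy as np

T, num_rollouts, obs_dim, act_dim = 5, 3, 4, 2
observations = np.random.randn(T, num_rollouts, obs_dim).transpose((1, 0, 2))  # (rollouts, T, obs_dim)
actions = np.random.randn(T, num_rollouts, act_dim).transpose((1, 0, 2))       # (rollouts, T, act_dim)
rewards = np.random.randn(T, num_rollouts).T                                   # (rollouts, T)
paths = [dict(observations=o, actions=a, rewards=r)
         for o, a, r in zip(observations, actions, rewards)]
assert len(paths) == num_rollouts and paths[0]['observations'].shape == (T, obs_dim)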
class PPO(Algo, Serializable):
    """
    Algorithm for PPO

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the objective
        clip_eps (float): PPO clipping range for the likelihood ratio
        max_epochs (int): number of optimization epochs per update
        entropy_bonus (float): coefficient on the policy entropy term
    """

    def __init__(
            self,
            policy,
            name="ppo",
            learning_rate=1e-3,
            clip_eps=0.2,
            max_epochs=5,
            entropy_bonus=0.,
            **kwargs
    ):
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate,
                                                    max_epochs=max_epochs,
                                                    backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph: input placeholders for the sampled data
        and the clipped PPO surrogate objective, wired into the optimizer.
        """
        """ Create Variables """

        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        obs_ph, action_ph, adv_ph, dist_info_old_ph, all_phs_dict = \
            self._make_input_placeholders('train', recurrent=self.recurrent)
        self.op_phs_dict.update(all_phs_dict)

        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None

        """ Outer objective """
        likelihood_ratio = self.policy.distribution.likelihood_ratio_sym(
            action_ph, dist_info_old_ph, distribution_info_vars)

        clipped_obj = tf.minimum(likelihood_ratio * adv_ph,
                                 tf.clip_by_value(likelihood_ratio,
                                                  1 - self._clip_eps,
                                                  1 + self._clip_eps) * adv_ph)
        surr_obj = - tf.reduce_mean(clipped_obj) - self.entropy_bonus * \
            tf.reduce_mean(self.policy.distribution.entropy_sym(distribution_info_vars))

        self.optimizer.build_graph(
            loss=surr_obj,
            target=self.policy,
            input_ph_dict=self.op_phs_dict,
            hidden_ph=hidden_ph,
            next_hidden_var=next_hidden_var
        )

    def optimize_policy(self, samples_data, log=True, prefix='', verbose=False):
        """
        Performs the outer-loop policy update.

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

        if verbose:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if verbose:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'LossBefore', loss_before)
            logger.logkv(prefix + 'LossAfter', loss_after)

    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        print('getstate\n')
        print(state['init_args'])
        state['policy'] = self.policy.__getstate__()
        state['optimizer'] = self.optimizer.__getstate__()
        return state

    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.policy.__setstate__(state['policy'])
        self.optimizer.__setstate__(state['optimizer'])
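# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): the surrogate that build_graph() assembles
# symbolically, written out in NumPy. All names below are local to the sketch
# and are not part of the PPO class API.
# ---------------------------------------------------------------------------
import numpy as np


def ppo_loss_sketch(log_prob_new, log_prob_old, adv, entropy, clip_eps=0.2, entropy_bonus=0.0):
    """Negative clipped PPO surrogate minus the entropy bonus (a loss to minimize)."""
    ratio = np.exp(log_prob_new - log_prob_old)  # likelihood ratio pi_new / pi_old
    clipped = np.minimum(ratio * adv,
                         np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv)
    return -np.mean(clipped) - entropy_bonus * np.mean(entropy)


# toy check: with identical policies the ratio is 1, so the loss reduces to -mean(adv)
_adv = np.array([1.0, -0.5, 2.0])
_lp = np.log(np.array([0.3, 0.2, 0.5]))
print(ppo_loss_sketch(_lp, _lp, _adv, entropy=np.ones_like(_adv)))  # -> -0.8333...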
class SVGInf(Algo):
    """
    Stochastic Value Gradients, SVG(inf) -- work in progress.

    Args:
        policy (Policy): policy object
        dynamics_model : learned dynamics model used to backpropagate through trajectories
        tf_reward : differentiable (symbolic) reward function r(s, a, s')
        name (str): tf variable scope
        learning_rate (float): learning rate for the objective
        max_epochs (int): number of optimization epochs per update
    """

    def __init__(self,
                 policy,
                 dynamics_model,
                 tf_reward,
                 name="svg_inf",
                 learning_rate=1e-3,
                 max_epochs=5,
                 **kwargs):
        super(SVGInf, self).__init__(policy)
        self.dynamics_model = dynamics_model
        self.tf_reward = tf_reward
        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name
        self.horizon = kwargs.get('horizon')  # TODO: rollout horizon is not wired in yet
        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph. NOTE: this is still a stub -- the full
        SVG(inf) value-gradient recursion has not been implemented yet.
        """
        """ Create Variables """

        """ ----- Build graph for the meta-update ----- """
        self.op_phs_dict = OrderedDict()
        obs_ph, action_ph, next_obs_ph, adv_ph, dist_info_old_ph, all_phs_dict = \
            self._make_input_placeholders('train', recurrent=False, next_obs=True)
        # TODO: I need the full trajectory here! So I need to reshape or concat the data
        # or something so the distribution_info_vars make sense
        self.op_phs_dict.update(all_phs_dict)

        distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
        hidden_ph, next_hidden_var = None, None

        # Placeholder objective: walk the trajectory backwards and accumulate the
        # rewards r(s_{t-1}, a_t, s_t). The proper SVG(inf) backward recursion
        # (value gradients through the dynamics model) still needs to replace this.
        returns = 0.
        for t in range(self.horizon, 0, -1):
            returns += self.tf_reward(obs_ph[t - 1], action_ph[t], obs_ph[t])

        surr_obj = -tf.reduce_mean(returns)

        self.optimizer.build_graph(loss=surr_obj,
                                   target=self.policy,
                                   input_ph_dict=self.op_phs_dict,
                                   hidden_ph=hidden_ph,
                                   next_hidden_var=next_hidden_var)

    def optimize_policy(self, samples_data, log=True):
        """
        Performs the policy update.

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if log:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
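# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): the placeholder objective in
# SVGInf.build_graph() walks the trajectory from t = horizon down to t = 1 and
# accumulates r(s_{t-1}, a_t, s_t). The plain-Python function below replays
# that loop with a dummy reward; horizon, obs, acts and reward_fn are local to
# the sketch.
# ---------------------------------------------------------------------------
def accumulate_return(obs, acts, reward_fn, horizon):
    total = 0.0
    for t in range(horizon, 0, -1):  # same backward iteration order as build_graph()
        total += reward_fn(obs[t - 1], acts[t], obs[t])
    return total


# toy check: with reward = s' - s the sum telescopes to obs[horizon] - obs[0]
_obs = [0.0, 1.0, 3.0, 6.0]
_acts = [None, 0, 0, 0]
assert accumulate_return(_obs, _acts, lambda s, a, s_next: s_next - s, horizon=3) == 6.0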