Example No. 1
    def make_vars(self, stepnum='0'):
        # lists over the meta_batch_size
        obs_vars, action_vars, adv_vars, imp_vars = [], [], [], []
        for i in range(self.meta_batch_size):
            obs_vars.append(
                self.env.observation_space.new_tensor_variable(
                    'obs' + stepnum + '_' + str(i),
                    extra_dims=1,
                ))
            action_vars.append(
                self.env.action_space.new_tensor_variable(
                    'action' + stepnum + '_' + str(i),
                    extra_dims=1,
                ))
            adv_vars.append(
                tensor_utils.new_tensor(
                    name='advantage' + stepnum + '_' + str(i),
                    ndim=1,
                    dtype=tf.float32,
                ))

            imp_vars.append(
                tensor_utils.new_tensor(
                    name='imp_ratios' + stepnum + '_' + str(i),
                    ndim=1,
                    dtype=tf.float32,
                ))

        return obs_vars, action_vars, adv_vars, imp_vars
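Every make_vars/init_opt variant in this listing builds its inputs with tensor_utils.new_tensor. In rllab/garage-style code that helper is essentially a thin wrapper over tf.placeholder; the following is a minimal sketch consistent with how it is called in these examples, not the upstream implementation itself (which may accept extra arguments):

    import tensorflow as tf

    def new_tensor(name, ndim, dtype):
        # The rank is fixed by `ndim`; every dimension is left as None so the
        # same placeholder can accept batches of any size.
        return tf.placeholder(dtype=dtype, shape=[None] * ndim, name=name)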
Example No. 2
    def make_vars(self):

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        adv_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1,
            dtype=tf.float32,
        )
        noise_var = tf.placeholder(dtype=tf.float32,
                                   shape=[None, self.latent_dim],
                                   name='noise')

        task_idx_var = tensor_utils.new_tensor(
            name='task_idx',
            ndim=1,
            dtype=tf.int32,
        )

        return obs_var, action_var, adv_var, noise_var, task_idx_var
Example No. 3
 def make_vars(self, stepnum='0'):
     # lists over the meta_batch_size
     obs_vars, action_vars, adv_vars, noise_vars, task_idx_vars = [], [], [], [], []
     for i in range(self.meta_batch_size):
         obs_vars.append(
             self.env.observation_space.new_tensor_variable(
                 'obs' + stepnum + '_' + str(i),
                 extra_dims=1,
             ))
         action_vars.append(
             self.env.action_space.new_tensor_variable(
                 'action' + stepnum + '_' + str(i),
                 extra_dims=1,
             ))
         adv_vars.append(
             tensor_utils.new_tensor(
                 name='advantage' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.float32,
             ))
         noise_vars.append(
             tf.placeholder(dtype=tf.float32,
                            shape=[None, self.latent_dim],
                            name='noise' + stepnum + '_' + str(i)))
         task_idx_vars.append(
             tensor_utils.new_tensor(
                 name='task_idx' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.int32,
             ))
     return obs_vars, action_vars, adv_vars, noise_vars, task_idx_vars
Example No. 4
    def _init_graph(self, chunk_size):
        with self._graph.as_default():
            with tf.variable_scope('SimilarityCalculator'):
                X = tensor_utils.new_tensor(
                    'X',
                    ndim=2,
                    dtype=tf.float32,
                )
                pool = tensor_utils.new_tensor(
                    'pool',
                    ndim=2,
                    dtype=tf.float32,
                )
                division_factor = tensor_utils.new_tensor(
                    'division_factor',
                    ndim=0,
                    dtype=tf.float32,
                )

                inputs = [X, pool, division_factor]

                size = tf.shape(X)[0]

                if chunk_size is None:
                    chunk_size = size
                    chunk_size_float = tf.cast(chunk_size, tf.float32)
                else:
                    chunk_size_float = float(chunk_size)
                array_size = tf.cast(
                    tf.ceil(tf.cast(size, tf.float32) / chunk_size_float),
                    tf.int32)
                ta_initial = tf.TensorArray(dtype=tf.float32,
                                            size=array_size,
                                            infer_shape=False)

                def _cond(idx, i, ta):
                    return i < size

                def _body(idx, i, ta):
                    until = tf.minimum(i + chunk_size, size)
                    new_pdiffs = (X[i:until, tf.newaxis, :] - pool)
                    squared_l2 = tf.reduce_sum(tf.square(new_pdiffs), axis=-1)
                    part_similarities = tf.reduce_mean(tf.exp(-squared_l2 /
                                                              division_factor),
                                                       axis=1)
                    return idx + 1, until, ta.write(idx, part_similarities)

                final_idx, final_i, ta = tf.while_loop(
                    _cond,
                    _body,
                    loop_vars=[0, 0, ta_initial],
                    parallel_iterations=1)
                result = ta.concat()

                self._get_result = tensor_utils.compile_function(
                    inputs=inputs,
                    outputs=result,
                )
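A hedged usage sketch for the compiled chunked-similarity function above. It assumes compile_function runs against the default session (as in rllab-style tensor_utils) and that `calc` is a hypothetical instance whose _init_graph has already been called; the shapes and bandwidth value are illustrative only:

    import numpy as np
    import tensorflow as tf

    X = np.random.randn(1000, 32).astype(np.float32)    # query points
    pool = np.random.randn(500, 32).astype(np.float32)  # reference pool
    division_factor = 64.0                               # RBF bandwidth term (arbitrary)

    with calc._graph.as_default():
        with tf.Session():
            # Each entry is the mean RBF similarity of one row of X to the whole
            # pool, accumulated chunk by chunk inside the tf.while_loop.
            similarities = calc._get_result(X, pool, division_factor)
    # similarities.shape == (1000,)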
Example No. 5
 def make_vars(self, stepnum='0'):
     # lists over the meta_batch_size
     # We should only need the last stepnum for meta-optimization.
     obs_vars, action_vars, adv_vars, rewards_vars, returns_vars, path_lengths_vars, expert_action_vars = [], [], [], [], [], [], []
     for i in range(self.meta_batch_size):
         obs_vars.append(
             self.env.observation_space.new_tensor_variable(
                 'obs' + stepnum + '_' + str(i),
                 extra_dims=1,
                 add_to_flat_dim=(0 if self.extra_input is None else
                                  self.extra_input_dim),
             ))
         action_vars.append(
             self.env.action_space.new_tensor_variable(
                 'action' + stepnum + '_' + str(i),
                 extra_dims=1,
             ))
         adv_vars.append(
             tensor_utils.new_tensor(
                 'advantage' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.float32,
             ))
         if self.metalearn_baseline:
             rewards_vars.append(
                 tensor_utils.new_tensor(
                     'rewards' + stepnum + '_' + str(i),
                     ndim=1,
                     dtype=tf.float32,
                 ))
             returns_vars.append(
                 tensor_utils.new_tensor(
                     'returns' + stepnum + '_' + str(i),
                     ndim=1,
                     dtype=tf.float32,
                 ))
             # path_lengths_vars.append(tensor_utils.new_tensor(
             #     'path_lengths' + stepnum + '_' + str(i),
             #     ndim=1, dtype=tf.float32,
             # ))
         expert_action_vars.append(
             tensor_utils.new_tensor(
                 'expert_actions' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.float32,
             ))
     if not self.metalearn_baseline:
         return obs_vars, action_vars, adv_vars, expert_action_vars
     else:
         return obs_vars, action_vars, adv_vars, rewards_vars, returns_vars, expert_action_vars  # path_lengths_vars before expert action
Example No. 6
 def make_vars(self, stepnum='0'):
     # lists over the meta_batch_size
     obs_vars, action_vars, adv_vars = [], [], []
     for i in range(self.meta_batch_size):
         obs_vars.append(
             self.env.observation_space.new_tensor_variable(
                 'obs' + stepnum + '_' + str(i),
                 extra_dims=1,
             ))
         action_vars.append(
             tf.placeholder(tf.float32,
                            shape=[None] +
                            [self.env.action_space.flat_dim * 20],
                            name='action' + stepnum + '_' + str(i)))
         #action_vars.append(self.env.action_space.new_tensor_variable(
         #    'action' + stepnum + '_' + str(i),
         #    extra_dims=1,
         #))
         adv_vars.append(
             tensor_utils.new_tensor(
                 name='advantage' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.float32,
             ))
     return obs_vars, action_vars, adv_vars
Example No. 7
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = - tf.reduce_mean(lr * advantage_var)

        input_list = [
                         obs_var,
                         action_var,
                         advantage_var,
                     ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
Example No. 8
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = - tf.reduce_mean(lr * advantage_var)

        input_list = [
                         obs_var,
                         action_var,
                         advantage_var,
                     ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
Example No. 9
    def opt_helper(self, policy, optimizer):
        is_recurrent = int(policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,)
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,)
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,)
        dist = policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in policy.state_info_specs
        }
        state_info_vars_list = [state_info_vars[k] for k in policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            max_kl = tf.reduce_max(kl * valid_var)
        else:
            surr_obj = -tf.reduce_mean(logli * advantage_var)
            mean_kl = tf.reduce_mean(kl)
            max_kl = tf.reduce_max(kl)

        input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        optimizer.update_opt(loss=surr_obj, target=policy, inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],)
        opt_info = dict(f_kl=f_kl,)
        return opt_info
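f_kl above comes from tensor_utils.compile_function, which in rllab-style code is little more than a closure over Session.run. A minimal sketch under that assumption (the real helper may take additional arguments such as a log name):

    import tensorflow as tf

    def compile_function(inputs, outputs):
        # Bind placeholder inputs and output tensors to a plain Python callable
        # that feeds numpy values positionally into the default session.
        def run(*input_vals):
            sess = tf.get_default_session()
            return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))
        return run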
Example No. 10
    def init_opt(self):
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        # todo, delete this var
        loglik = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        surr_obj = -tf.reduce_mean(loglik * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list + old_dist_info_vars_list

        self.optimizer.update_opt(loss=surr_obj,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.delta),
                                  inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
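Note that input_list above already ends with old_dist_info_vars_list, so the compiled f_kl receives those placeholders twice and a call repeats the old distribution arrays. A hedged sketch of the diagnostic call, assuming a diagonal-Gaussian policy with no extra state info (all array names are placeholders):

    # Hypothetical call after a sampling iteration:
    mean_kl, max_kl = algo.opt_info['f_kl'](
        observations, actions, advantages,  # input_list prefix
        old_means, old_log_stds,            # old_dist_info_vars_list inside input_list
        old_means, old_log_stds,            # ...and appended again when f_kl was compiled
    )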
Example No. 11
    def make_vars_latent(self):
        # lists over the meta_batch_size

        adv_var = tensor_utils.new_tensor(
            name='advantage_latent',
            ndim=1,
            dtype=tf.float32,
        )

        z_var = tf.placeholder(dtype=tf.float32,
                               shape=[None, self.latent_dim],
                               name='zs_latent')
        task_idx_var = tensor_utils.new_tensor(
            name='task_idx_latent',
            ndim=1,
            dtype=tf.int32,
        )
        return adv_var, z_var, task_idx_var
Example No. 12
 def __init__(self, env_spec, reg_coeff=1e-5):
     self._coeffs = None
     self._reg_coeff = reg_coeff
     self.feature_mat = tensor_utils.new_tensor(
         'feature_mat',
         ndim=2,
         dtype=tf.float32,
     )
     self.returns = tensor_utils.new_tensor(
         'returns',
         ndim=2,
         dtype=tf.float32,
     )
     # import pdb; pdb.set_trace()
     ident = tf.identity(self.feature_mat)
     self.train_ops = tf.matrix_solve_ls(tf.square(self.feature_mat) +
                                         self._reg_coeff * ident,
                                         self.returns,
                                         fast=False)
     self.sess = tf.Session()
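A hedged sketch of how the regularized least-squares solve above might be driven; `baseline` is a hypothetical instance of the class whose __init__ is shown, and the shapes are illustrative. The graph is used exactly as defined above (rank-2 returns placeholder, so returns must be a column vector):

    import numpy as np

    features = np.random.randn(256, 10).astype(np.float32)  # (N, K) feature matrix
    returns = np.random.randn(256, 1).astype(np.float32)    # (N, 1) regression targets

    coeffs = baseline.sess.run(
        baseline.train_ops,
        feed_dict={baseline.feature_mat: features, baseline.returns: returns},
    )
    baseline._coeffs = coeffs  # (K, 1) solution of the least-squares solve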
Example No. 13
 def make_vars_latent(self, stepnum='0'):
     # lists over the meta_batch_size
     adv_vars, z_vars, task_idx_vars = [], [], []
     for i in range(self.meta_batch_size):
         adv_vars.append(
             tensor_utils.new_tensor(
                 name='advantage_latent' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.float32,
             ))
         z_vars.append(
             tf.placeholder(dtype=tf.float32,
                            shape=[None, self.latent_dim],
                            name='zs_latent' + stepnum + '_' + str(i)))
         task_idx_vars.append(
             tensor_utils.new_tensor(
                 name='task_idx_latents' + stepnum + '_' + str(i),
                 ndim=1,
                 dtype=tf.int32,
             ))
     return adv_vars, z_vars, task_idx_vars
Example No. 14
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]
        self.input_list_for_grad = [obs_var, action_var, advantage_var] + state_info_vars_list

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            max_kl = tf.reduce_max(kl * valid_var)
        else:
            surr_obj = - tf.reduce_mean(logli * advantage_var)
            mean_kl = tf.reduce_mean(kl)
            max_kl = tf.reduce_max(kl)
        
        self.surr_obj = surr_obj
Example No. 15
 def make_vars(self, stepnum='0'):
     # lists over the meta_batch_size
     obs_vars, action_vars, adv_vars = [], [], []
     for i in range(self.meta_batch_size):
         obs_vars.append(self.env.observation_space.new_tensor_variable(
             'obs' + stepnum + '_' + str(i),
             extra_dims=1,
         ))
         action_vars.append(self.env.action_space.new_tensor_variable(
             'action' + stepnum + '_' + str(i),
             extra_dims=1,
         ))
         adv_vars.append(tensor_utils.new_tensor(
             name='advantage' + stepnum + '_' + str(i),
             ndim=1, dtype=tf.float32,
         ))
     return obs_vars, action_vars, adv_vars
Example No. 16
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )

        nobs_var = self.env.observation_space.new_tensor_variable(
            'nobs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )

        empw_var = tensor_utils.new_tensor(
            'empowerment',
            ndim=2 + is_recurrent,
            dtype=tf.float32,
        )

        input_list = [
            obs_var,
            nobs_var,
            action_var,
            advantage_var,
            empw_var,
        ]

        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        # dist_info_vars["mean"]=dist_info_vars["mean"]+empw_var
        q_input = tf.concat([obs_var, nobs_var], axis=1)
        q_dist_info_vars = self.qvar_model.dist_info_sym(
            q_input, state_info_vars)

        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)

        if self.pol_ent_wt > 0:
            if 'log_std' in dist_info_vars:
                log_std = dist_info_vars['log_std']
                ent = tf.reduce_sum(log_std +
                                    tf.log(tf.sqrt(2 * np.pi * np.e)),
                                    reduction_indices=-1)
            elif 'prob' in dist_info_vars:
                prob = dist_info_vars['prob']
                ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
            else:
                raise NotImplementedError()
            ent = tf.stop_gradient(ent)
            adv = advantage_var + self.pol_ent_wt * ent
        else:
            adv = advantage_var

        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(
                lr * adv * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * adv)

        if self.train_empw:
            print(
                "training empowerment========================================")
            pred = dist.log_likelihood(dist.sample(dist_info_vars),
                                       dist_info_vars) + empw_var
            target = dist.log_likelihood(dist.sample(q_dist_info_vars),
                                         q_dist_info_vars)
            # print("pred = {}, target={}".format(pred.shape, target.shape))
            surr_loss = surr_loss + self.lambda_i * tf.losses.mean_squared_error(
                predictions=pred, labels=target)

        input_list += state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
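The entropy bonus above uses the closed form for a diagonal Gaussian, H = sum_d (log sigma_d + 0.5 * log(2*pi*e)). A quick standalone numerical check of that identity (no rllab dependencies):

    import numpy as np

    log_std = np.array([0.1, -0.3, 0.7])
    analytic = np.sum(log_std + np.log(np.sqrt(2.0 * np.pi * np.e)))

    # Reference: differential entropy via the covariance determinant.
    cov = np.diag(np.exp(2.0 * log_std))
    reference = 0.5 * np.log(np.linalg.det(2.0 * np.pi * np.e * cov))

    assert np.isclose(analytic, reference)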
Example No. 17
File: ppo.py Project: Neo-X/GMPS
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
            add_to_flat_dim=(0 if self.extra_input is None else
                             self.extra_input_dim),
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        # logli_old = dist.log_likelihood_sym(action_var, old_dist_info_vars)
        r__ = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                        dist_info_vars)
        clip_frac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(r__ - 1.0), 0.2)))
        r_ = tf.clip_by_value(r__, 0.8, 1.2)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -tf.reduce_sum(
                r_ * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            max_kl = tf.reduce_max(kl * valid_var)
        else:
            surr_obj = -tf.reduce_mean(r_ * advantage_var)
            mean_kl = tf.reduce_mean(kl)
            max_kl = tf.reduce_max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        #self.policy.set_init_surr_obj(input_list, [surr_obj])  # debugging
        self.optimizer.update_opt(loss=surr_obj,
                                  target=self.policy,
                                  inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl, clip_frac, dist_info_vars['log_std']],
        )
        self.opt_info = dict(f_kl=f_kl, )
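Example No. 17 clips the likelihood ratio to [0.8, 1.2] and reports the fraction of samples that hit the clip. A small standalone check mirroring those TF expressions in numpy (values are arbitrary):

    import numpy as np

    r = np.array([0.5, 0.9, 1.0, 1.15, 1.6])   # likelihood ratios pi_new / pi_old
    adv = np.array([1.0, -1.0, 2.0, 0.5, -2.0])

    clip_frac = np.mean(np.abs(r - 1.0) > 0.2)  # fraction of clipped samples (0.4 here)
    r_clipped = np.clip(r, 0.8, 1.2)
    surr_obj = -np.mean(r_clipped * adv)        # minimized, as in the graph above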
Example No. 18
    def init_opt(self, name=''):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            name + 'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            name + 'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name + 'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + 'old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name=name + "valid")
        else:
            valid_var = None

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        if self.kl_sample_backups > 0:
            kl_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'kl_obs',
                extra_dims=1 + is_recurrent,
            )
            kl_old_dist_info_vars = {
                k:
                tf.placeholder(tf.float32,
                               shape=[None] * (1 + is_recurrent) + list(shape),
                               name=name + 'kl_old_%s' % k)
                for k, shape in dist.dist_info_specs
            }
            kl_old_dist_info_vars_list = [
                kl_old_dist_info_vars[k] for k in dist.dist_info_keys
            ]

            kl_state_info_vars = {
                k:
                tf.placeholder(tf.float32,
                               shape=[None] * (1 + is_recurrent) + list(shape),
                               name=name + 'kl_%s' % k)
                for k, shape in self.policy.state_info_specs
            }
            kl_state_info_vars_list = [
                kl_state_info_vars[k] for k in self.policy.state_info_keys
            ]
            kl_dist_info_vars = self.policy.dist_info_sym(
                kl_obs_var, kl_state_info_vars)
            kl = dist.kl_sym(kl_old_dist_info_vars, kl_dist_info_vars)

            input_list += [
                kl_obs_var
            ] + kl_state_info_vars_list + kl_old_dist_info_vars_list

            dist_info_vars = self.policy.dist_info_sym(obs_var,
                                                       state_info_vars)
        else:
            dist_info_vars = self.policy.dist_info_sym(obs_var,
                                                       state_info_vars)
            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)

        if not self.qprop:
            if is_recurrent:
                mean_kl = tf.reduce_sum(
                    kl * valid_var) / tf.reduce_sum(valid_var)
                surr_loss = -tf.reduce_sum(
                    lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            else:
                mean_kl = tf.reduce_mean(kl)
                surr_loss = -tf.reduce_mean(lr * advantage_var)
        else:
            if is_recurrent: raise NotImplementedError
            eta_var = tensor_utils.new_tensor(
                'eta',
                ndim=1 + is_recurrent,
                dtype=tf.float32,
            )
            surr_loss = -tf.reduce_mean(lr * advantage_var)
            if self.qprop_nu > 0: surr_loss *= 1 - self.qprop_nu
            if self.sample_backups > 0 or not self.policy_sample_last:
                off_obs_var = self.env.observation_space.new_tensor_variable(
                    name + 'off_obs',
                    extra_dims=1 + is_recurrent,
                )
                off_e_qval = self.qf.get_e_qval_sym(off_obs_var,
                                                    self.policy,
                                                    deterministic=True)
                input_list += [off_obs_var]
                surr_loss -= tf.reduce_mean(off_e_qval)  # * eta_var)
            else:
                if not self.mqprop:
                    # Originally, we subtract this value for the bias correction, but we don't do that if we want mqprop (no action-conditional baseline).
                    e_qval = self.qf.get_e_qval_sym(obs_var,
                                                    self.policy,
                                                    deterministic=True)
                    surr_loss -= tf.reduce_mean(e_qval * eta_var)

            mean_kl = tf.reduce_mean(kl)
            input_list += [eta_var]
            control_variate = self.qf.get_cv_sym(obs_var, action_var,
                                                 self.policy)
            f_control_variate = tensor_utils.compile_function(
                inputs=[obs_var, action_var],
                outputs=control_variate,
            )
            self.opt_info_qprop = dict(f_control_variate=f_control_variate, )
        if self.ac_delta > 0:
            ac_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'ac_obs',
                extra_dims=1 + is_recurrent,
            )
            e_qval = self.qf.get_e_qval_sym(ac_obs_var,
                                            self.policy,
                                            deterministic=True)
            input_list += [ac_obs_var]
            surr_loss *= (1.0 - self.ac_delta)
            surr_loss -= self.ac_delta * tf.reduce_mean(e_qval)
        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        self.opt_info = dict(target_policy=self.policy, )
        self.init_opt_critic()
        return dict()
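Example No. 18 wires in the Q-Prop terms (eta_var, the expected Q value, and the compiled control variate). As a hedged, non-authoritative sketch of how f_control_variate is typically applied between sampling and optimization (array names and the per-sample eta coefficient are hypothetical):

    # Hypothetical post-processing of a sampled batch before the policy update:
    cv = algo.opt_info_qprop['f_control_variate'](observations, actions)
    residual_adv = advantages - eta * cv  # the analytic E[Q] term is restored inside surr_loss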
Example No. 19
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)

        if self.pol_ent_wt > 0:
            if 'log_std' in dist_info_vars:
                log_std = dist_info_vars['log_std']
                ent = tf.reduce_sum(log_std +
                                    tf.log(tf.sqrt(2 * np.pi * np.e)),
                                    reduction_indices=-1)
            elif 'prob' in dist_info_vars:
                prob = dist_info_vars['prob']
                ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
            else:
                raise NotImplementedError()
            ent = tf.stop_gradient(ent)
            adv = advantage_var + self.pol_ent_wt * ent
        else:
            adv = advantage_var

        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(
                lr * adv * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * adv)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
Example No. 20
    def init_opt(self):

        with tf.variable_scope("target_policy"):
            target_policy = Serializable.clone(self.policy)

        oracle_policy = self.oracle_policy

        with tf.variable_scope("target_qf"):
            target_qf = Serializable.clone(self.qf)

        with tf.variable_scope("target_gate_qf"):
            target_gate_qf = Serializable.clone(self.gate_qf)

        obs = self.obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        discrete_action = tensor_utils.new_tensor(
            'discrete_action',
            ndim=2,
            dtype=tf.float32,
        )

        yvar = tensor_utils.new_tensor(
            'ys',
            ndim=1,
            dtype=tf.float32,
        )

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([tf.reduce_sum(tf.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])

        policy_qval_novice = self.qf.get_qval_sym(
            obs, self.policy.get_novice_policy_sym(obs), deterministic=True)

        policy_qval_gate = self.discrete_qf.get_qval_sym(
            obs,
            self.policy.get_action_binary_gate_sym(obs),
            deterministic=True)

        qval = self.qf.get_qval_sym(obs, action)
        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        discrete_qval = self.gate_qf.get_qval_sym(obs, discrete_action)
        discrete_qf_loss = tf.reduce_mean(tf.square(yvar - discrete_qval))
        discrete_qf_reg_loss = discrete_qf_loss + qf_weight_decay_term

        qf_input_list = [yvar, obs, action]
        discrete_qf_input_list = [yvar, obs, discrete_action]

        policy_input_list = [obs]
        policy_gate_input_list = [obs]

        gating_network = self.policy.get_action_binary_gate_sym(obs)

        policy_surr = -tf.reduce_mean(policy_qval_novice)
        policy_reg_surr = policy_surr + policy_weight_decay_term

        policy_gate_surr = -tf.reduce_mean(
            policy_qval_gate) + policy_weight_decay_term
        policy_reg_gate_surr = policy_gate_surr + policy_weight_decay_term

        self.qf_update_method.update_opt(loss=qf_reg_loss,
                                         target=self.qf,
                                         inputs=qf_input_list)

        self.gate_qf_update_method.update_opt(loss=discrete_qf_reg_loss,
                                              target=self.gate_qf,
                                              inputs=discrete_qf_input_list)

        self.policy_update_method.update_opt(loss=policy_reg_surr,
                                             target=self.policy,
                                             inputs=policy_input_list)

        self.policy_gate_update_method.update_opt(
            loss=policy_reg_gate_surr,
            target=self.policy,
            inputs=policy_gate_input_list)

        f_train_qf = tensor_utils.compile_function(
            inputs=qf_input_list,
            outputs=[qf_loss, qval, self.qf_update_method._train_op],
        )

        f_train_discrete_qf = tensor_utils.compile_function(
            inputs=discrete_qf_input_list,
            outputs=[
                discrete_qf_loss, discrete_qval,
                self.gate_qf_update_method._train_op
            ],
        )

        f_train_policy = tensor_utils.compile_function(
            inputs=policy_input_list,
            outputs=[policy_surr, self.policy_update_method._train_op],
        )

        f_train_policy_gate = tensor_utils.compile_function(
            inputs=policy_gate_input_list,
            outputs=[
                policy_gate_surr, self.policy_gate_update_method._train_op,
                gating_network
            ],
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_discrete_qf=f_train_discrete_qf,
            f_train_policy=f_train_policy,
            f_train_policy_gate=f_train_policy_gate,
            target_qf=target_qf,
            target_gate_qf=target_gate_qf,
            target_policy=target_policy,
            oracle_policy=oracle_policy,
        )
Example No. 21
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        # lists over the meta_batch_size
        context = tf.reshape(self.irl_model.reparam_latent_tile, [
            self.meta_batch_size, -1, self.irl_model.T,
            self.irl_model.latent_dim
        ])

        # if not self.train_irl:
        # context = tf.stop_gradient(context)

        obs_vars = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_vars = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_vars = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        clean_obs_vars = tf.placeholder(
            tf.float32,
            shape=[None] * (1 + is_recurrent) +
            [self.env.observation_space.flat_dim - self.irl_model.latent_dim],
            name='clean_obs')
        policy_input = tf.reshape(
            tf.concat([
                tf.reshape(clean_obs_vars, [
                    self.meta_batch_size, -1, self.irl_model.T,
                    self.env.observation_space.flat_dim -
                    self.irl_model.latent_dim
                ]), context
            ],
                      axis=-1), [-1, self.env.observation_space.flat_dim])

        # input_list = obs_vars + action_vars + advantage_vars
        input_list = [clean_obs_vars] + [action_vars] + [advantage_vars] + [
            self.irl_model.expert_traj_var
        ]

        dist = self.policy.distribution

        old_dist_info_vars_list, state_info_vars_list = [], []
        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list += [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='%s' % k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list += [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_vars = tf.placeholder(tf.float32,
                                        shape=[None, None],
                                        name="valid")
        else:
            valid_vars = None

        surr_losses, mean_kls = [], []
        # dist_info_vars = self.policy.dist_info_sym(obs_vars[i], state_info_vars[i])
        dist_info_vars = self.policy.dist_info_sym(policy_input,
                                                   state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_vars, old_dist_info_vars,
                                       dist_info_vars)

        if self.pol_ent_wt > 0:
            if 'log_std' in dist_info_vars:
                log_std = dist_info_vars['log_std']
                ent = tf.reduce_sum(log_std +
                                    tf.log(tf.sqrt(2 * np.pi * np.e)),
                                    reduction_indices=-1)
            elif 'prob' in dist_info_vars:
                prob = dist_info_vars['prob']
                ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
            else:
                raise NotImplementedError()
            ent = tf.stop_gradient(ent)
            adv = advantage_vars + self.pol_ent_wt * ent
        else:
            adv = advantage_vars

        if is_recurrent:
            mean_kl = tf.reduce_sum(
                kl * valid_vars) / tf.reduce_sum(valid_vars)
            surr_loss = -tf.reduce_sum(
                lr * adv * valid_vars) / tf.reduce_sum(valid_vars)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * adv)
        surr_losses.append(surr_loss)
        mean_kls.append(mean_kl)

        surr_loss = tf.reduce_mean(tf.stack(
            surr_losses, 0))  # mean over meta_batch_size (the diff tasks)
        mean_kl = tf.reduce_mean(tf.stack(mean_kls))
        input_list += state_info_vars_list + old_dist_info_vars_list

        if is_recurrent:
            input_list.append(valid_vars)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
Example No. 22
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ]

        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)

        if self.pol_ent_wt > 0:
            if 'log_std' in dist_info_vars:
                log_std = dist_info_vars['log_std']
                ent = tf.reduce_sum(log_std + tf.log(tf.sqrt(2 * np.pi * np.e)), reduction_indices=-1)
            elif 'prob' in dist_info_vars:
                prob = dist_info_vars['prob']
                ent = -tf.reduce_sum(prob*tf.log(prob), reduction_indices=-1)
            else:
                raise NotImplementedError()
            ent = tf.stop_gradient(ent)
            adv = advantage_var + self.pol_ent_wt*ent
        else:
            adv = advantage_var


        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = - tf.reduce_sum(lr * adv * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = - tf.reduce_mean(lr * adv)

        input_list += state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
Example No. 23
    def init_opt(self):

        # First, create "target" policy and Q functions
        with tf.variable_scope("target_policy"):
            target_policy = Serializable.clone(self.policy)
        with tf.variable_scope("target_qf"):
            target_qf = Serializable.clone(self.qf)

        # y need to be computed first
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        yvar = tensor_utils.new_tensor(
            'ys',
            ndim=1,
            dtype=tf.float32,
        )

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([tf.reduce_sum(tf.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])
        policy_qval = self.qf.get_qval_sym(obs,
                                           self.policy.get_action_sym(obs),
                                           deterministic=True)
        policy_surr = -tf.reduce_mean(policy_qval)

        policy_reg_surr = policy_surr + policy_weight_decay_term

        qf_input_list = [yvar, obs, action]
        policy_input_list = [obs]

        self.qf_update_method.update_opt(loss=qf_reg_loss,
                                         target=self.qf,
                                         inputs=qf_input_list)
        self.policy_update_method.update_opt(loss=policy_reg_surr,
                                             target=self.policy,
                                             inputs=policy_input_list)

        f_train_qf = tensor_utils.compile_function(
            inputs=qf_input_list,
            outputs=[qf_loss, qval, self.qf_update_method._train_op],
        )

        f_train_policy = tensor_utils.compile_function(
            inputs=policy_input_list,
            outputs=[policy_surr, self.policy_update_method._train_op],
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
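A hedged sketch of one critic update through the compiled f_train_qf above; obs and actions would come from a replay buffer and ys from the target networks, as the comments in the example indicate (all array names are placeholders):

    # Hypothetical: ys = rewards + discount * target_qf(next_obs, target_policy(next_obs))
    qf_loss_val, qval_val, _ = algo.opt_info['f_train_qf'](ys, obs, actions)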
Example No. 24
    def init_opt(self):

        # First, create "target" policy and Q functions
        with tf.variable_scope("target_policy"):
            target_policy = Serializable.clone(self.policy)
        with tf.variable_scope("target_qf"):
            target_qf = Serializable.clone(self.qf)

        # y need to be computed first
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        yvar = tensor_utils.new_tensor(
            'ys',
            ndim=1,
            dtype=tf.float32,
        )

        obs_offpolicy = self.env.observation_space.new_tensor_variable(
            'obs_offpolicy',
            extra_dims=1,
        )

        action_offpolicy = self.env.action_space.new_tensor_variable(
            'action_offpolicy',
            extra_dims=1,
        )

        yvar_offpolicy = tensor_utils.new_tensor(
            'ys_offpolicy',
            ndim=1,
            dtype=tf.float32,
        )

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([tf.reduce_sum(tf.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)
        qval_off = self.qf.get_qval_sym(obs_offpolicy, action_offpolicy)

        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_loss_off = tf.reduce_mean(tf.square(yvar_offpolicy - qval_off))

        # TODO: penalize dramatic changes in gating_func
        # if PENALIZE_GATING_DISTRIBUTION_DIVERGENCE:


        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])
        policy_qval = self.qf.get_qval_sym(obs,
                                           self.policy.get_action_sym(obs),
                                           deterministic=True)

        policy_qval_off = self.qf.get_qval_sym(
            obs_offpolicy,
            self.policy.get_action_sym(obs_offpolicy),
            deterministic=True)

        policy_surr = -tf.reduce_mean(policy_qval)
        policy_surr_off = -tf.reduce_mean(policy_qval_off)

        if self.sigma_type == 'unified-gated' or self.sigma_type == 'unified-gated-decaying':
            print("Using Gated Sigma!")

            input_to_gates = tf.concat([obs, obs_offpolicy], axis=1)

            assert input_to_gates.get_shape().as_list()[-1] == \
                obs.get_shape().as_list()[-1] + obs_offpolicy.get_shape().as_list()[-1]

            # TODO: right now this is a soft-gate, should make a hard-gate (options vs mixtures)
            gating_func = MLP(
                name="sigma_gate",
                output_dim=1,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.sigmoid,
                input_var=input_to_gates,
                input_shape=tuple(
                    input_to_gates.get_shape().as_list()[1:])).output
        elif self.sigma_type == 'unified':
            # sample a bernoulli random variable
            print("Using Bernoulli sigma!")
            gating_func = tf.cast(self.random_dist.sample(qf_loss.get_shape()),
                                  tf.float32)
        elif self.sigma_type == 'unified-decaying':
            print("Using decaying sigma!")
            gating_func = tf.train.exponential_decay(1.0,
                                                     self.train_step,
                                                     20,
                                                     0.96,
                                                     staircase=True)
        else:
            raise Exception("sigma type not supported")

        qf_inputs_list = [
            yvar, obs, action, yvar_offpolicy, obs_offpolicy, action_offpolicy,
            self.train_step
        ]
        qf_reg_loss = qf_loss * (1.0 - gating_func) + \
                      qf_loss_off * gating_func + qf_weight_decay_term

        policy_input_list = [obs, obs_offpolicy, self.train_step]
        policy_reg_surr = policy_surr * (1.0 - gating_func) + \
                          policy_surr_off * gating_func + policy_weight_decay_term

        if self.sigma_type == 'unified-gated-decaying':
            print("Adding a decaying factor to gated sigma!")
            decaying_factor = tf.train.exponential_decay(.5,
                                                         self.train_step,
                                                         20,
                                                         0.96,
                                                         staircase=True)
            penalty = decaying_factor * tf.nn.l2_loss(gating_func)
            qf_reg_loss += penalty
            policy_reg_surr += penalty

        self.qf_update_method.update_opt(qf_reg_loss,
                                         target=self.qf,
                                         inputs=qf_inputs_list)

        self.policy_update_method.update_opt(policy_reg_surr,
                                             target=self.policy,
                                             inputs=policy_input_list)

        f_train_qf = tensor_utils.compile_function(
            inputs=qf_inputs_list,
            outputs=[qf_loss, qval, self.qf_update_method._train_op],
        )

        f_train_policy = tensor_utils.compile_function(
            inputs=policy_input_list,
            outputs=[policy_surr, self.policy_update_method._train_op],
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
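
All three sigma variants above reduce to a convex combination of the on-policy and off-policy losses; only the source of the gate differs. A standalone sketch of that blending with a scalar gate (the gated variant instead emits a per-example gate from an MLP over the concatenated observations); the decay constants mirror the snippet but are otherwise arbitrary:

import numpy as np

def blended_loss(loss_on, loss_off, gate):
    # gate == 0 -> purely on-policy loss, gate == 1 -> purely off-policy loss
    return loss_on * (1.0 - gate) + loss_off * gate

def decaying_gate(train_step, initial=1.0, decay_steps=20, decay_rate=0.96):
    # staircase schedule, as in tf.train.exponential_decay(..., staircase=True)
    return initial * decay_rate ** (train_step // decay_steps)

print(blended_loss(1.2, 0.8, decaying_gate(train_step=100)))
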
Exemplo n.º 25
0
    def init_opt(self, lambda_s=100, lambda_v=10, tau=.5):

        with tf.variable_scope("target_policy"):
            target_policy = Serializable.clone(self.policy)

        oracle_policy = self.oracle_policy

        with tf.variable_scope("target_qf"):
            target_qf = Serializable.clone(self.qf)

        obs = self.obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        yvar = tensor_utils.new_tensor(
            'ys',
            ndim=1,
            dtype=tf.float32,
        )

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([tf.reduce_sum(tf.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])

        qf_input_list = [yvar, obs, action]
        policy_input_list = [obs]

        obs_oracle = self.env.observation_space.new_tensor_variable(
            'obs_oracle',
            extra_dims=1,
        )

        action_oracle = self.env.action_space.new_tensor_variable(
            'action_oracle',
            extra_dims=1,
        )

        yvar_oracle = tensor_utils.new_tensor(
            'ys_oracle',
            ndim=1,
            dtype=tf.float32,
        )

        qval_oracle = self.qf.get_qval_sym(obs_oracle, action_oracle)
        qf_loss_oracle = tf.reduce_mean(tf.square(yvar_oracle - qval_oracle))
        qf_reg_loss_oracle = qf_loss_oracle + qf_weight_decay_term

        policy_qval_novice = self.qf.get_qval_sym(
            obs, self.policy.get_novice_policy_sym(obs), deterministic=True)

        gating_network = self.policy.get_action_binary_gate_sym(obs)

        policy_qval_oracle = self.qf.get_qval_sym(
            obs, self.policy.get_action_oracle_sym(obs), deterministic=True)

        combined_losses = tf.concat(
            [tf.reshape(policy_qval_novice, [-1, 1]),
             tf.reshape(policy_qval_oracle, [-1, 1])],
            axis=1)

        combined_loss = -tf.reduce_mean(
            tf.reshape(tf.reduce_mean(combined_losses * gating_network, axis=1),
                       [-1, 1]),
            axis=0)

        lambda_s_loss = tf.constant(0.0)

        if lambda_s > 0.0:
            lambda_s_loss = lambda_s * (tf.reduce_mean(
                (tf.reduce_mean(gating_network, axis=0) - tau)**
                2) + tf.reduce_mean(
                    (tf.reduce_mean(gating_network, axis=1) - tau)**2))

        lambda_v_loss = tf.constant(0.0)

        if lambda_v > 0.0:
            mean0, var0 = tf.nn.moments(gating_network, axes=[0])
            mean, var1 = tf.nn.moments(gating_network, axes=[1])
            lambda_v_loss = -lambda_v * (tf.reduce_mean(var0) +
                                         tf.reduce_mean(var1))

        policy_surr = combined_loss
        policy_reg_surr = combined_loss + policy_weight_decay_term + lambda_s_loss + lambda_v_loss
        # NOTE: this oracle-augmented input list is assembled but never passed to
        # the optimizers below, which only consume qf_input_list and policy_input_list.
        gf_input_list = [obs_oracle, action_oracle, yvar_oracle] + qf_input_list

        self.qf_update_method.update_opt(loss=qf_reg_loss,
                                         target=self.qf,
                                         inputs=qf_input_list)

        self.policy_update_method.update_opt(loss=policy_reg_surr,
                                             target=self.policy,
                                             inputs=policy_input_list)

        f_train_qf = tensor_utils.compile_function(
            inputs=qf_input_list,
            outputs=[qf_loss, qval, self.qf_update_method._train_op],
        )

        f_train_policy = tensor_utils.compile_function(
            inputs=policy_input_list,
            outputs=[
                policy_surr, self.policy_update_method._train_op,
                gating_network
            ],
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
            oracle_policy=oracle_policy,
        )
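
The lambda_s and lambda_v terms keep the gating network close to a target activation rate tau on average while still rewarding variation across the batch. A NumPy sketch of those two penalties for a gate matrix of shape (batch, heads), using the same reductions as the snippet:

import numpy as np

def gating_penalties(gate, tau=0.5, lambda_s=100.0, lambda_v=10.0):
    # lambda_s: batch-wise and head-wise means of the gate should stay near tau
    s_loss = lambda_s * (np.mean((gate.mean(axis=0) - tau) ** 2) +
                         np.mean((gate.mean(axis=1) - tau) ** 2))
    # lambda_v: negative penalty on variance, so the gate does not collapse to a constant
    v_loss = -lambda_v * (np.mean(gate.var(axis=0)) + np.mean(gate.var(axis=1)))
    return s_loss, v_loss

gate = np.random.uniform(size=(32, 2))
print(gating_penalties(gate))
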
Exemplo n.º 26
0
    def init_opt(self):

        ###############################
        #
        # Variable Definitions
        #
        ###############################

        all_task_dist_info_vars = []
        all_obs_vars = []

        for i, policy in enumerate(self.local_policies):

            task_obs_var = self.env_partitions[i].observation_space.new_tensor_variable(
                'obs%d' % i, extra_dims=1)
            task_dist_info_vars = []

            for j, other_policy in enumerate(self.local_policies):

                state_info_vars = dict()  # Not handling recurrent policies
                dist_info_vars = other_policy.dist_info_sym(
                    task_obs_var, state_info_vars)
                task_dist_info_vars.append(dist_info_vars)

            all_obs_vars.append(task_obs_var)
            all_task_dist_info_vars.append(task_dist_info_vars)

        obs_var = self.env.observation_space.new_tensor_variable('obs',
                                                                 extra_dims=1)
        action_var = self.env.action_space.new_tensor_variable('action',
                                                               extra_dims=1)
        advantage_var = tensor_utils.new_tensor('advantage',
                                                ndim=1,
                                                dtype=tf.float32)

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] + list(shape),
                              name='old_%s' % k)
            for k, shape in self.policy.distribution.dist_info_specs
        }

        old_dist_info_vars_list = [
            old_dist_info_vars[k]
            for k in self.policy.distribution.dist_info_keys
        ]

        input_list = [obs_var, action_var, advantage_var
                      ] + old_dist_info_vars_list + all_obs_vars

        ###############################
        #
        # Local Policy Optimization
        #
        ###############################

        self.optimizers = []
        self.metrics = []

        for n, policy in enumerate(self.local_policies):

            state_info_vars = dict()
            dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
            dist = policy.distribution

            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
            lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                           dist_info_vars)
            surr_loss = -tf.reduce_mean(lr * advantage_var)

            if self.constrain_together:
                additional_loss = Metrics.kl_on_others(
                    n, dist, all_task_dist_info_vars)
            else:
                additional_loss = tf.constant(0.0)

            local_loss = surr_loss + self.penalty * additional_loss

            kl_metric = tensor_utils.compile_function(inputs=input_list,
                                                      outputs=additional_loss,
                                                      log_name="KLPenalty%d" %
                                                      n)
            self.metrics.append(kl_metric)

            mean_kl_constraint = tf.reduce_mean(kl)

            optimizer = self.optimizer_class(**self.optimizer_args)
            optimizer.update_opt(
                loss=local_loss,
                target=policy,
                leq_constraint=(mean_kl_constraint, self.step_size),
                inputs=input_list,
                constraint_name="mean_kl_%d" % n,
            )
            self.optimizers.append(optimizer)

        ###############################
        #
        # Global Policy Optimization
        #
        ###############################

        # Behaviour Cloning Loss

        state_info_vars = dict()
        center_dist_info_vars = self.policy.dist_info_sym(
            obs_var, state_info_vars)
        behaviour_cloning_loss = tf.losses.mean_squared_error(
            action_var, center_dist_info_vars['mean'])
        self.center_optimizer = FirstOrderOptimizer(max_epochs=1,
                                                    verbose=True,
                                                    batch_size=1000)
        self.center_optimizer.update_opt(behaviour_cloning_loss, self.policy,
                                         [obs_var, action_var])

        # TRPO Loss

        kl = dist.kl_sym(old_dist_info_vars, center_dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       center_dist_info_vars)
        center_trpo_loss = -tf.reduce_mean(lr * advantage_var)
        mean_kl_constraint = tf.reduce_mean(kl)

        optimizer = self.optimizer_class(**self.optimizer_args)
        optimizer.update_opt(
            loss=center_trpo_loss,
            target=self.policy,
            leq_constraint=(mean_kl_constraint, self.step_size),
            inputs=[obs_var, action_var, advantage_var] +
            old_dist_info_vars_list,
            constraint_name="mean_kl_center",
        )

        self.center_trpo_optimizer = optimizer

        # Reset Local Policies to Global Policy

        assignment_operations = []

        for policy in self.local_policies:
            for param_local, param_center in zip(
                    policy.get_params_internal(),
                    self.policy.get_params_internal()):
                if 'std' not in param_local.name:
                    assignment_operations.append(
                        tf.assign(param_local, param_center))

        self.reset_to_center = tf.group(*assignment_operations)

        return dict()
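
Metrics.kl_on_others is not shown in this snippet. One plausible reading, given that all_task_dist_info_vars[i][j] holds policy j's distribution on task i's observations, is the average KL between policy n and every other policy on its own task data. A NumPy sketch under that assumption, for diagonal Gaussian policies parameterised by mean and log_std:

import numpy as np

def diag_gaussian_kl(mean_p, log_std_p, mean_q, log_std_q):
    # KL(p || q) per sample, summed over action dimensions
    var_p, var_q = np.exp(2 * log_std_p), np.exp(2 * log_std_q)
    return np.sum(log_std_q - log_std_p +
                  (var_p + (mean_p - mean_q) ** 2) / (2 * var_q) - 0.5, axis=-1)

def kl_on_others(n, task_dist_infos):
    # task_dist_infos[j]: dist_info of policy j evaluated on task n's observations
    ref = task_dist_infos[n]
    others = [j for j in range(len(task_dist_infos)) if j != n]
    kls = [diag_gaussian_kl(ref['mean'], ref['log_std'],
                            task_dist_infos[j]['mean'], task_dist_infos[j]['log_std'])
           for j in others]
    return float(np.mean(kls))
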
Exemplo n.º 27
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -tf.reduce_sum(
                logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            max_kl = tf.reduce_max(kl * valid_var)
        else:
            surr_obj = -tf.reduce_mean(logli * advantage_var)
            mean_kl = tf.reduce_mean(kl)
            max_kl = tf.reduce_max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        vars_info = {
            "mean_kl": mean_kl,
            "input_list": input_list,
            "obs_var": obs_var,
            "action_var": action_var,
            "advantage_var": advantage_var,
            "surr_loss": surr_obj,
            "dist_info_vars": dist_info_vars,
            "lr": logli,
        }

        if self.qprop:
            eta_var = tensor_utils.new_tensor(
                'eta',
                ndim=1 + is_recurrent,
                dtype=tf.float32,
            )
            qvalue = self.qf.get_e_qval_sym(vars_info["obs_var"],
                                            self.policy,
                                            deterministic=True)
            qprop_surr_loss = -tf.reduce_mean(
                vars_info["lr"] * vars_info["advantage_var"]) - tf.reduce_mean(
                    qvalue * eta_var)
            input_list += [eta_var]
            self.optimizer.update_opt(
                loss=qprop_surr_loss,
                target=self.policy,
                inputs=input_list,
            )
            control_variate = self.qf.get_cv_sym(obs_var, action_var,
                                                 self.policy)
            f_control_variate = tensor_utils.compile_function(
                inputs=[obs_var, action_var],
                outputs=control_variate,
            )
            self.opt_info_qprop = dict(f_control_variate=f_control_variate, )
        else:
            self.optimizer.update_opt(loss=surr_obj,
                                      target=self.policy,
                                      inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(
            f_kl=f_kl,
            target_policy=self.policy,
        )
        self.init_opt_critic()
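
In the Q-Prop branch the 'advantage' input is assumed to arrive already reduced by the control variate, with eta_var restoring the analytic critic term. A NumPy sketch of that outer bookkeeping, using f_control_variate's batch output and a conservative per-batch eta rule (keep the correction only when the control variate correlates positively with the advantages); this is a common heuristic rather than the snippet's exact logic:

import numpy as np

def qprop_adjust(advantages, control_variate):
    # Centre the learning signal with the critic-based control variate and
    # decide per batch whether to keep the correction (conservative heuristic).
    cov = np.mean((advantages - advantages.mean()) *
                  (control_variate - control_variate.mean()))
    eta = 1.0 if cov > 0 else 0.0
    adjusted_adv = advantages - eta * control_variate
    return adjusted_adv, np.full_like(advantages, eta)

adv = np.random.randn(64)
cv = 0.5 * adv + 0.1 * np.random.randn(64)
adjusted_adv, eta = qprop_adjust(adv, cv)
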
Exemplo n.º 28
0
    def init_opt(self, name=''):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            name + 'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            name + 'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name + 'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=name+'old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=name+k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name=name+"valid")
        else:
            valid_var = None

        input_list = [
                         obs_var,
                         action_var,
                         advantage_var,
                     ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        if self.kl_sample_backups > 0:
            kl_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'kl_obs',
                extra_dims=1 + is_recurrent,
            )
            kl_old_dist_info_vars = {
                k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=name+'kl_old_%s' % k)
                for k, shape in dist.dist_info_specs
                }
            kl_old_dist_info_vars_list = [kl_old_dist_info_vars[k] for k in dist.dist_info_keys]

            kl_state_info_vars = {
                k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=name+'kl_%s'%k)
                for k, shape in self.policy.state_info_specs
                }
            kl_state_info_vars_list = [kl_state_info_vars[k] for k in self.policy.state_info_keys]
            kl_dist_info_vars = self.policy.dist_info_sym(kl_obs_var, kl_state_info_vars)
            kl = dist.kl_sym(kl_old_dist_info_vars, kl_dist_info_vars)

            input_list += [kl_obs_var] + kl_state_info_vars_list + kl_old_dist_info_vars_list

            dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        else:
            dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)


        if self.qprop:
            if is_recurrent: raise NotImplementedError
            eta_var = tensor_utils.new_tensor(
                'eta',
                ndim=1 + is_recurrent,
                dtype=tf.float32,
            )
            surr_loss = -tf.reduce_mean(lr * advantage_var)
            if self.qprop_nu > 0: surr_loss *= 1-self.qprop_nu
            if self.sample_backups > 0 or not self.policy_sample_last:
                off_obs_var = self.env.observation_space.new_tensor_variable(
                    name + 'off_obs',
                    extra_dims=1 + is_recurrent,
                )
                off_e_qval = self.qf.get_e_qval_sym(off_obs_var, self.policy, deterministic=True)
                input_list += [off_obs_var]
                surr_loss -= tf.reduce_mean(off_e_qval)# * eta_var)
            else:
                e_qval = self.qf.get_e_qval_sym(obs_var, self.policy, deterministic=True)
                surr_loss -= tf.reduce_mean(e_qval * eta_var)
            mean_kl = tf.reduce_mean(kl)
            input_list += [eta_var]
            control_variate = self.qf.get_cv_sym(obs_var,
                    action_var, self.policy)
            f_control_variate = tensor_utils.compile_function(
                inputs=[obs_var, action_var],
                outputs=control_variate,
            )
            self.opt_info_qprop = dict(
                f_control_variate=f_control_variate,
            )
        elif self.phi:
            # Using stein control functional variate reduction
            if is_recurrent: raise NotImplementedError
            eta_var = tensor_utils.new_tensor(
                'eta',
                ndim = 1 + is_recurrent,
                dtype=tf.float32,
            )

            if isinstance(self.pf, ContinuousLinearPhiFunction):

                phival = self.pf.get_e_phival_sym(obs_var, self.policy, 
                                gradwrtmu=True, deterministic=True)
            
                surr_loss = -tf.reduce_mean(lr * advantage_var) - \
                                tf.reduce_mean(phival * eta_var)
                stein_phi = self.pf.get_phi_bar_sym(obs_var, 
                        action_var, self.policy)
            
            elif isinstance(self.pf, ContinuousQuadraticPhiFunction):
                dist_info = self.policy.dist_info_sym(obs_var)
                mean = dist_info["mean"]
                log_std = dist_info["log_std"]
                
                phi_derives = self.pf.get_phi_derive_sym(obs_var, action_var)
                surr_loss = -tf.reduce_mean(lr * advantage_var)
                mu_loss = - tf.reduce_sum(tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) * \
                                phi_derives['phi_prime']) * mean, axis=1)
                var_loss = - tf.reduce_sum(tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) * \
                                phi_derives['phi_double_prime']) * tf.exp(2.*log_std),axis=1)
                
                surr_loss = surr_loss + tf.reduce_mean(mu_loss) + \
                                tf.reduce_mean(var_loss)
                stein_phi = self.pf.get_phival_sym(obs_var, action_var)
            
            elif isinstance(self.pf, ContinuousMLPPhiFunction):
                dist_info = self.policy.dist_info_sym(obs_var)
                mean = dist_info['mean']
                log_std = dist_info['log_std']

                grad_info, _ = self.policy.get_grad_info_sym(obs_var, action_var)

                phi_derives = self.pf.get_phi_derive_sym(obs_var, action_var)
                surr_loss = -tf.reduce_mean(lr * advantage_var) 
                mu_loss = - tf.reduce_sum(tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) * \
                                phi_derives['phi_prime']) * mean, axis=1)
                var_loss = -(- tf.reduce_sum(tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) * \
                                .5 * grad_info['logpi_dmu'] * \
                                phi_derives['phi_prime']) * tf.exp(2.*log_std), axis=1))
                
                surr_loss = surr_loss + tf.reduce_mean(mu_loss) + \
                                tf.reduce_mean(var_loss)
                
                stein_phi = self.pf.get_phival_sym(obs_var, action_var)

            else:
                raise NotImplementedError

            mean_kl = tf.reduce_mean(kl)
            input_list += [eta_var]

            f_stein_phi = tensor_utils.compile_function(
                inputs=[obs_var, action_var],
                outputs=stein_phi,
            )
            self.opt_info_phi = dict(
                f_stein_phi=f_stein_phi,
            )
        
        elif not self.qprop and not self.phi:
            if is_recurrent:
                mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
                surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            else:
                mean_kl = tf.reduce_mean(kl)
                surr_loss = - tf.reduce_mean(lr * advantage_var)


        if self.ac_delta > 0:
            ac_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'ac_obs',
                extra_dims=1 + is_recurrent,
            )
            e_qval = self.qf.get_e_qval_sym(ac_obs_var, self.policy, deterministic=True)
            input_list += [ac_obs_var]
            surr_loss *= (1.0 - self.ac_delta)
            surr_loss -= self.ac_delta * tf.reduce_mean(e_qval)
        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        self.opt_info = dict(
                target_policy=self.policy,
        )
        self.init_opt_critic()
        self.init_opt_phi()
        return dict()
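
When ac_delta is positive, the surrogate above is interpolated with an off-policy critic term evaluated on separately drawn states, in the spirit of interpolated policy gradients. A small sketch of that interpolation, treating the likelihood-ratio loss and the critic values as already computed (the delta value is a placeholder):

import numpy as np

def interpolated_loss(pg_loss, e_qvals, ac_delta=0.2):
    # (1 - delta) * likelihood-ratio surrogate  -  delta * mean_s Q(s, mu(s))
    return (1.0 - ac_delta) * pg_loss - ac_delta * float(np.mean(e_qvals))
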
Exemplo n.º 29
0
    def init_opt(self):

        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        kl_penalty_var = tf.Variable(
            initial_value=self.initial_kl_penalty,
            dtype=tf.float32,
            name="kl_penalty"
        )

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=(None, None), name="valid")

            if hasattr(self.policy, "prob_network"):
                rnn_network = self.policy.prob_network
                state_dim = rnn_network.state_dim
                recurrent_layer = rnn_network.recurrent_layer
                state_init_param = rnn_network.state_init_param
            elif hasattr(self.policy, "head_network"):
                rnn_network = self.policy.head_network
                state_dim = rnn_network.state_dim
                recurrent_layer = rnn_network.recurrent_layer
                state_init_param = rnn_network.state_init_param
            else:
                state_dim = self.policy.l_rnn.state_dim
                recurrent_layer = self.policy.l_rnn
                state_init_param = tf.reshape(self.policy.l_rnn.cell.zero_state(1, dtype=tf.float32), (-1,))

            state_var = tf.placeholder(tf.float32, (None, state_dim), "state")

            recurrent_state_output = dict()

            minibatch_dist_info_vars = self.policy.dist_info_sym(
                obs_var, state_info_vars,
                recurrent_state={recurrent_layer: state_var},
                recurrent_state_output=recurrent_state_output,
            )

            state_output = recurrent_state_output[recurrent_layer]

            if hasattr(self.policy, "prob_network") or hasattr(self.policy, "head_network"):
                final_state = tf.reverse(state_output, [1])[:, 0, :]
            else:
                final_state = state_output

            lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, minibatch_dist_info_vars)
            kl = dist.kl_sym(old_dist_info_vars, minibatch_dist_info_vars)
            ent = tf.reduce_sum(dist.entropy_sym(minibatch_dist_info_vars) * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)

            clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)

            surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            clipped_surr_loss = - tf.reduce_sum(
                tf.minimum(lr * advantage_var, clipped_lr * advantage_var) * valid_var
            ) / tf.reduce_sum(valid_var)

            clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
            if self.use_kl_penalty:
                clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)

            self.optimizer.update_opt(
                loss=clipped_surr_pen_loss,
                target=self.policy,
                inputs=[obs_var, action_var, advantage_var] + state_info_vars_list + old_dist_info_vars_list + [
                    valid_var],
                rnn_init_state=state_init_param,
                rnn_state_input=state_var,
                rnn_final_state=final_state,
                diagnostic_vars=OrderedDict([
                    ("UnclippedSurrLoss", surr_loss),
                    ("MeanKL", mean_kl),
                ])
            )
        else:
            dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)

            lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
            ent = tf.reduce_mean(dist.entropy_sym(dist_info_vars))
            mean_kl = tf.reduce_mean(kl)

            clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)

            surr_loss = - tf.reduce_mean(lr * advantage_var)
            clipped_surr_loss = - tf.reduce_mean(
                tf.minimum(lr * advantage_var, clipped_lr * advantage_var)
            )

            clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
            if self.use_kl_penalty:
                clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)

            self.optimizer.update_opt(
                loss=clipped_surr_pen_loss,
                target=self.policy,
                inputs=[obs_var, action_var, advantage_var] + state_info_vars_list + old_dist_info_vars_list,
                diagnostic_vars=OrderedDict([
                    ("UnclippedSurrLoss", surr_loss),
                    ("MeanKL", mean_kl),
                ])
            )

        self.kl_penalty_var = kl_penalty_var
        self.f_increase_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                tf.minimum(kl_penalty_var * self.increase_penalty_factor, self.max_penalty)
            )
        )
        self.f_decrease_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                tf.maximum(kl_penalty_var * self.decrease_penalty_factor, self.min_penalty)
            )
        )
        self.f_reset_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                self.initial_kl_penalty
            )
        )
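
The three compiled functions at the end implement a multiplicative adaptation of the KL penalty coefficient. A plain-Python sketch of the same schedule, clipped against minimum and maximum bounds (the concrete factor and bound values are placeholders):

class KLPenalty:
    def __init__(self, initial=1.0, increase_factor=2.0, decrease_factor=0.5,
                 min_penalty=1e-3, max_penalty=1e3):
        self.initial = initial
        self.value = initial
        self.increase_factor = increase_factor
        self.decrease_factor = decrease_factor
        self.min_penalty = min_penalty
        self.max_penalty = max_penalty

    def increase(self):
        # mirrors f_increase_penalty: scale up, capped at max_penalty
        self.value = min(self.value * self.increase_factor, self.max_penalty)
        return self.value

    def decrease(self):
        # mirrors f_decrease_penalty: scale down, floored at min_penalty
        self.value = max(self.value * self.decrease_factor, self.min_penalty)
        return self.value

    def reset(self):
        # mirrors f_reset_penalty
        self.value = self.initial
        return self.value
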
Exemplo n.º 30
0
    def init_experts_opt(self):

        ###############################
        #
        # Variable Definitions
        #
        ###############################

        all_task_dist_info_vars = []
        all_obs_vars = []

        for i, policy in enumerate(self.local_policies):

            task_obs_var = self.env_partitions[i].observation_space.new_tensor_variable(
                'obs%d' % i, extra_dims=1)
            task_dist_info_vars = []

            for j, other_policy in enumerate(self.local_policies):

                state_info_vars = dict()  # Not handling recurrent policies
                dist_info_vars = other_policy.dist_info_sym(
                    task_obs_var, state_info_vars)
                task_dist_info_vars.append(dist_info_vars)

            all_obs_vars.append(task_obs_var)
            all_task_dist_info_vars.append(task_dist_info_vars)

        obs_var = self.env.observation_space.new_tensor_variable('obs',
                                                                 extra_dims=1)
        action_var = self.env.action_space.new_tensor_variable('action',
                                                               extra_dims=1)
        advantage_var = tensor_utils.new_tensor('advantage',
                                                ndim=1,
                                                dtype=tf.float32)

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] + list(shape),
                              name='old_%s' % k)
            for k, shape in self.policy.distribution.dist_info_specs
        }

        old_dist_info_vars_list = [
            old_dist_info_vars[k]
            for k in self.policy.distribution.dist_info_keys
        ]

        central_obs_vars = [elem[1] for elem in self.central_policy_dist_infos]

        input_list = [
            obs_var, action_var, advantage_var
        ] + old_dist_info_vars_list + all_obs_vars + central_obs_vars

        ###############################
        #
        # Local Policy Optimization
        #
        ###############################

        self.optimizers = []
        self.metrics = []

        for n, policy in enumerate(self.local_policies):

            state_info_vars = dict()
            dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
            dist = policy.distribution

            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
            lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                           dist_info_vars)
            surr_loss = -tf.reduce_mean(lr * advantage_var)

            if self.constrain_together:
                additional_loss = Metrics.kl_on_others(
                    n, dist, all_task_dist_info_vars)

            elif self.constrain_against_central:
                additional_loss = Metrics.kl_on_central(
                    dist, dist_info_vars, self.central_policy_dist_infos[n][0])

            else:
                additional_loss = tf.constant(0.0)

            local_loss = surr_loss + self.penalty * additional_loss

            kl_metric = tensor_utils.compile_function(inputs=input_list,
                                                      outputs=additional_loss,
                                                      log_name="KLPenalty%d" %
                                                      n)
            self.metrics.append(kl_metric)

            mean_kl_constraint = tf.reduce_mean(kl)

            optimizer = PenaltyLbfgsOptimizer(name='expertOptimizer_' + str(n))
            optimizer.update_opt(
                loss=local_loss,
                target=policy,
                leq_constraint=(mean_kl_constraint, self.step_size),
                inputs=input_list,
                constraint_name="mean_kl_%d" % n,
            )
            self.optimizers.append(optimizer)

        return dict()
Exemplo n.º 31
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            name='advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            max_kl = tf.reduce_max(kl * valid_var)
        else:
            surr_obj = - tf.reduce_mean(logli * advantage_var)
            mean_kl = tf.reduce_mean(kl)
            max_kl = tf.reduce_max(kl)

        input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(
            f_kl=f_kl,
        )
Exemplo n.º 32
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name='old_%s' % k)
            for k, shape in dist.dist_info_specs
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name=k)
            for k, shape in self.policy.state_info_specs
            }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        kl_penalty_var = tf.Variable(
            initial_value=self.initial_kl_penalty,
            dtype=tf.float32,
            name="kl_penalty"
        )

        # TODO: The code below only works for FF policy.
        assert is_recurrent == 0

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)

        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        ent = tf.reduce_mean(dist.entropy_sym(dist_info_vars))
        mean_kl = tf.reduce_mean(kl)

        clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)

        surr_loss = - tf.reduce_mean(lr * advantage_var)
        clipped_surr_loss = - tf.reduce_mean(
            tf.minimum(lr * advantage_var, clipped_lr * advantage_var)
        )

        clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
        if self.use_kl_penalty:
            clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)

        self.optimizer.update_opt(
            loss=clipped_surr_pen_loss,
            target=self.policy,
            inputs=[obs_var, action_var, advantage_var] + state_info_vars_list + old_dist_info_vars_list,
            diagnostic_vars=OrderedDict([
                ("UnclippedSurrLoss", surr_loss),
                ("MeanKL", mean_kl),
            ])
        )
        self.kl_penalty_var = kl_penalty_var
        self.f_increase_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                tf.minimum(kl_penalty_var * self.increase_penalty_factor, self.max_penalty)
            )
        )
        self.f_decrease_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                tf.maximum(kl_penalty_var * self.decrease_penalty_factor, self.min_penalty)
            )
        )
        self.f_reset_penalty = tensor_utils.compile_function(
            inputs=[],
            outputs=tf.assign(
                kl_penalty_var,
                self.initial_kl_penalty
            )
        )
        return dict()
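
The clipped surrogate used here (and in the recurrent variant above) is easy to check in isolation. A NumPy sketch of the batch loss from likelihood ratios and advantages, mirroring the tf.clip_by_value / tf.minimum combination:

import numpy as np

def clipped_surrogate(lr, advantage, clip_lr=0.2):
    # pessimistic (minimum) of the clipped and unclipped objectives, negated for minimisation
    clipped = np.clip(lr, 1.0 - clip_lr, 1.0 + clip_lr)
    return -np.mean(np.minimum(lr * advantage, clipped * advantage))

lr = np.exp(np.random.randn(128) * 0.1)
adv = np.random.randn(128)
print(clipped_surrogate(lr, adv))
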
Exemplo n.º 33
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        # entropy_bonus = sum(list(entropy_list[j][i] for j in range(self.num_grad_updates)))
        entropy = dist.entropy_sym(dist_info_vars)
        clipped_obj = tf.minimum(
            lr * advantage_var,
            tf.clip_by_value(lr, 1 - self.clip_eps, 1 + self.clip_eps) *
            advantage_var)

        if is_recurrent:
            mean_entropy = tf.reduce_sum(entropy) / tf.reduce_sum(valid_var)
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = - tf.reduce_sum(clipped_obj * valid_var) / tf.reduce_sum(valid_var) \
                        + self.kl_coeff * mean_kl - self.entropy_coeff * mean_entropy
        else:
            mean_entropy = tf.reduce_mean(entropy)
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(
                clipped_obj
            ) + self.kl_coeff * mean_kl - self.entropy_coeff * mean_entropy

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        extra_inputs = [tf.placeholder(tf.float32, shape=[], name='kl_coeff')]
        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  kl=mean_kl,
                                  inputs=input_list,
                                  extra_inputs=extra_inputs)
        return dict()
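
This variant folds a fixed KL coefficient and an entropy bonus into the clipped objective instead of enforcing a trust-region constraint. A NumPy sketch of the non-recurrent loss, taking the per-sample likelihood ratio, advantage, KL, and entropy arrays as given (the coefficient values are placeholders):

import numpy as np

def ppo_kl_ent_loss(lr, advantage, kl, entropy, clip_eps=0.2,
                    kl_coeff=0.1, entropy_coeff=0.01):
    # clipped objective plus a fixed KL penalty, minus an entropy bonus
    clipped = np.clip(lr, 1.0 - clip_eps, 1.0 + clip_eps)
    clipped_obj = np.minimum(lr * advantage, clipped * advantage)
    return (-np.mean(clipped_obj) + kl_coeff * np.mean(kl)
            - entropy_coeff * np.mean(entropy))
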
Exemplo n.º 34
0
    def init_opt(self):

        observations = self.env.observation_space.new_tensor_variable(
            'observations',
            extra_dims=1,
        )
        actions = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage = tensor_utils.new_tensor(
            name='advantage',
            ndim=1,
            dtype=tf.float32,
        )
        dist = self.policy.distribution

        self.loss = tf.placeholder(tf.float32, name='actor_loss')
        self.entropy_loss = tf.placeholder(tf.float32, name='entropy_loss')
        self.avg_rewards = tf.placeholder(tf.float32, name='avg_rewards')
        self.total_rewards = tf.placeholder(tf.float32, name='total_rewards')

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * 1 + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * 1 + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        dist_info_vars = self.policy.dist_info_sym(observations,
                                                   state_info_vars)
        logli = dist.log_likelihood_sym(actions, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        loss = -tf.reduce_mean(logli * advantage)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)

        input_list = [observations, actions, advantage] + state_info_vars_list

        self.optimizer.update_opt(loss=loss,
                                  target=self.policy,
                                  inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )

        # tf.summary replaces the pre-1.0 SummaryWriter / merge_summary / scalar_summary API
        self.writer = tf.summary.FileWriter("summary/")
        self.write_op = tf.summary.merge([
            tf.summary.scalar("Loss", self.loss),
            tf.summary.scalar("Entropy_Loss", self.entropy_loss),
            tf.summary.scalar("Total_Rewards", self.total_rewards),
            tf.summary.scalar("Avg_Rewards", self.avg_rewards)
        ])
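
The merged summary op is only useful once the scalar placeholders are fed at logging time. A minimal usage sketch, assuming an active TF1 session and per-iteration Python floats; the names algo and sess are hypothetical and stand for an instance of the class above and the session running it:

def log_iteration(sess, algo, itr, loss, entropy_loss, total_rewards, avg_rewards):
    # Hypothetical logging step: feed the scalar placeholders and write the merged summary.
    summary = sess.run(algo.write_op, feed_dict={
        algo.loss: loss,
        algo.entropy_loss: entropy_loss,
        algo.total_rewards: total_rewards,
        algo.avg_rewards: avg_rewards,
    })
    algo.writer.add_summary(summary, itr)
    algo.writer.flush()
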