Example #1
    def __init__(self,
                 dimo,
                 dimu,
                 o_stats,
                 u_stats,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=400,
                 layers=4,
                 learning_rate=1e-3):
        self.sess = U.get_session()
        with tf.variable_scope('forward_dynamics'):
            self.obs0 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs0')
            self.obs1 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs1')
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, self.dimu),
                                          name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            obs0_norm = self.o_stats.normalize(self.obs0)
            obs1_norm = self.o_stats.normalize(self.obs1)
            actions_norm = self.u_stats.normalize(self.actions)
            inputs = tf.concat(values=[obs0_norm, actions_norm], axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_denorm = self.o_stats.denormalize(
                self.next_state_diff_tf + obs0_norm)

            # alternative: without input normalization
            # input = tf.concat(values=[self.obs0, self.actions], axis=-1)
            # self.next_state_diff_tf = nn(input,[hidden] * layers+ [self.dimo])
            # self.next_state_tf = self.next_state_diff_tf + self.obs0
            # self.next_state_denorm = self.next_state_tf

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - obs1_norm + obs0_norm), axis=1)
        # self.per_sample_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.test_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_denorm - self.obs1))
        # self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1))

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initialize variables
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()
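
The network above predicts the normalized one-step state change, so the per-sample loss is the L1 distance between that prediction and the observed normalized difference obs1_norm - obs0_norm. A minimal NumPy sketch of the same computation, assuming predicted_diff is the output of nn(...) and the inputs are already normalized (all names here are illustrative, not part of the original module):

import numpy as np

def per_sample_dynamics_loss(predicted_diff, obs0_norm, obs1_norm):
    # target is the normalized one-step state change
    target_diff = obs1_norm - obs0_norm
    # L1 error averaged over observation dimensions, one value per sample
    return np.mean(np.abs(predicted_diff - target_diff), axis=1)
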
Example #2
    def _create_network(self, reuse=False):
        logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(SAC_ActorCritic, reuse, batch_tf)

        # loss functions
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf)
        q_backup_tf = tf.stop_gradient(target_tf)
        v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf)

        q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2)
        q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2)
        v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2)
        self.abs_tf_error_tf = tf.reduce_mean(
            tf.abs(q_backup_tf - self.main.q1_tf) +
            tf.abs(q_backup_tf - self.main.q2_tf))

        self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf
        self.pi_loss_tf = tf.reduce_mean(self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf)
        
        # variables
        value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v'))
        pi_params = get_var(self._name_variable('pi'))
        # gradients
        V_grads_tf = tf.gradients(self.value_loss_tf, value_params)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params)
        self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params)

        # optimizers
        self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = (get_var(self._name_variable('pi')) +
                          get_var(self._name_variable('q1')) +
                          get_var(self._name_variable('q2')) +
                          get_var(self._name_variable('v')))
        self.target_vars = (get_var(self._name_variable('pi', main=False)) +
                            get_var(self._name_variable('q1', main=False)) +
                            get_var(self._name_variable('q2', main=False)) +
                            get_var(self._name_variable('v', main=False)))

        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()
Example #3
    def __init__(self,
                 dimo,
                 dimu,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=256,
                 layers=8,
                 learning_rate=1e-3):
        self.obs_normalizer = NormalizerNumpy(size=dimo, eps=norm_eps)
        self.action_normalizer = NormalizerNumpy(size=dimu, eps=norm_eps)
        self.sess = U.get_session()

        with tf.variable_scope('forward_dynamics_numpy'):
            self.obs0_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs0')
            self.obs1_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs1')
            self.actions_norm = tf.placeholder(tf.float32,
                                               shape=(None, self.dimu),
                                               name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            inputs = tf.concat(values=[self.obs0_norm, self.actions_norm],
                               axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_norm_tf = self.next_state_diff_tf + self.obs0_norm

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - self.obs1_norm + self.obs0_norm),
            axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initialize variables
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()
Example #4
    def __init__(self, obs0, action, obs1, clip_norm, hidden, layers):
        logger.info("Using Random Network Distillation")
        rep_size = hidden

        with tf.variable_scope('random_network_distillation'):
            self.rnd_scope = tf.get_variable_scope().name
            # Random Target Network

            with tf.variable_scope('target_network'):
                xr = nn(obs1, [hidden] * layers + [rep_size])

            with tf.variable_scope('predictor_network'):
                self.predictor_scope = tf.get_variable_scope().name
                xr_hat = nn(obs1, [hidden] * layers + [rep_size])

        # count trainable parameters in the predictor network
        # (the frozen random target network has the same architecture)
        total_parameters = 0
        for variable in _vars(self.predictor_scope):
            # shape is a list of tf.Dimension objects
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        logger.info(
            "params in RND predictor network: {}".format(total_parameters))

        self.feat_var = tf.reduce_mean(tf.nn.moments(xr, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(xr))
        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(xr) - xr_hat), axis=-1, keepdims=True)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.predictor_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.predictor_scope),
                                     scale_grad_by_procs=False)
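
Only the predictor network is trained; the randomly initialized target network stays frozen, and the per-sample prediction error serves as a novelty bonus. A hedged usage sketch, assuming the class is named RandomNetworkDistillation, that obs1_ph is the placeholder passed in as obs1, and that the variables in its scope have already been initialized (these names and hyperparameter values are illustrative):

rnd = RandomNetworkDistillation(obs0_ph, act_ph, obs1_ph,
                                clip_norm=5, hidden=256, layers=4)
rnd.dynamics_adam.sync()  # sync the MPI Adam state across workers

# novelty bonus: prediction error of the predictor against the frozen target
bonus = sess.run(rnd.per_sample_loss_tf, feed_dict={obs1_ph: obs1_batch})

# one training step on the predictor parameters only
grads = sess.run(rnd.dynamics_grads, feed_dict={obs1_ph: obs1_batch})
rnd.dynamics_adam.update(grads, stepsize=1e-3)
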
Example #5
class SAC(Algorithm):
    @store_args
    def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr,
                 norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, subtract_goals, 
                 relative_goals, clip_pos_returns, clip_return, gamma, vloss_type='normal',
                 priority=False, sac_alpha=0.03, reuse=False, **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
        Args:
            sac_alpha: hyperparameter in SAC
        """
        super(SAC, self).__init__(**self.__dict__)

    def _name_variable(self, name, main=True):
        if main:
            return self.scope + '/main/' + name
        else:
            return self.scope + '/target/' + name
    
    def _create_network(self, reuse=False):
        logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(SAC_ActorCritic, reuse, batch_tf)

        # loss functions
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf)
        q_backup_tf = tf.stop_gradient(target_tf)
        v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf)

        q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2)
        q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2)
        v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2)
        self.abs_tf_error_tf = tf.reduce_mean(
            tf.abs(q_backup_tf - self.main.q1_tf) +
            tf.abs(q_backup_tf - self.main.q2_tf))

        self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf
        self.pi_loss_tf = tf.reduce_mean(self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf)
        
        # variables
        value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v'))
        pi_params = get_var(self._name_variable('pi'))
        # gradients
        V_grads_tf = tf.gradients(self.value_loss_tf, value_params)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params)
        self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params)

        # optimizers
        self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = (get_var(self._name_variable('pi')) +
                          get_var(self._name_variable('q1')) +
                          get_var(self._name_variable('q2')) +
                          get_var(self._name_variable('v')))
        self.target_vars = (get_var(self._name_variable('pi', main=False)) +
                            get_var(self._name_variable('q1', main=False)) +
                            get_var(self._name_variable('q2', main=False)) +
                            get_var(self._name_variable('v', main=False)))

        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()


    def _sync_optimizers(self):
        self.V_adam.sync()
        self.pi_adam.sync()

    def _grads(self):
        critic_loss, actor_loss, V_grad, pi_grad, abs_td_error = self.sess.run([
            self.value_loss_tf,
            self.pi_loss_tf,
            self.V_grad_tf,
            self.pi_grad_tf,
            self.abs_tf_error_tf
        ])
        return critic_loss, actor_loss, V_grad, pi_grad, abs_td_error

    def _update(self, V_grad, pi_grad):
        self.V_adam.update(V_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)
    
    # SAC does not need additional exploration noise
    def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False):
        o, g = self._preprocess_og(o=o, g=g, ag=ag)
        if not noise_eps and not random_eps:
            u = self.simple_get_action(o, g, use_target_net, deterministic=True)
        else:
            u = self.simple_get_action(o, g, use_target_net, deterministic=False)

        if compute_Q:
            Q_pi = self.get_Q_fun(o, g)

        u = np.clip(u, -self.max_u, self.max_u)
        if u.shape[0] == 1:
            u = u[0] 

        if compute_Q:
            return [u, Q_pi]
        else:
            return u

    def simple_get_action(self, o, g, use_target_net=False, deterministic=False):
        o, g = self._preprocess_og(o=o, g=g)
        # in n-step training self.target performs better
        policy = self.target if use_target_net else self.main
        act_tf = policy.mu_tf if deterministic else policy.pi_tf
        action, logp_pi, min_q_pi, q1_pi, q2_pi, log_std = self.sess.run(
            [act_tf, policy.logp_pi_tf, policy.min_q_pi_tf,
             policy.q1_pi_tf, policy.q2_pi_tf, policy.log_std],
            feed_dict={
                policy.o_tf: o.reshape(-1, self.dimo),
                policy.g_tf: g.reshape(-1, self.dimg)
            })
        return action
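
For reference, the two stop-gradient backups built in _create_network correspond to the standard SAC targets: the Q target bootstraps from the target value network (clipped by _clip_target), and the V target uses the clipped double-Q estimate minus the entropy term. A minimal NumPy sketch of the same quantities, assuming _clip_target computes the clipped one-step bootstrap r + gamma * V_target(s') and that the per-sample arrays below come from the corresponding networks (all array names are illustrative):

import numpy as np

q_backup = np.clip(r + gamma * v_target_next, clip_range[0], clip_range[1])
v_backup = min_q_pi - sac_alpha * logp_pi

# scalar losses matching the graph above
q1_loss = 0.5 * np.mean((q_backup - q1) ** 2)
q2_loss = 0.5 * np.mean((q_backup - q2) ** 2)
v_loss = 0.5 * np.mean((v_backup - v) ** 2)
pi_loss = np.mean(sac_alpha * logp_pi - q1_pi)
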
Example #6
class ForwardDynamics:
    @store_args
    def __init__(self,
                 dimo,
                 dimu,
                 o_stats,
                 u_stats,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=400,
                 layers=4,
                 learning_rate=1e-3):
        self.sess = U.get_session()
        with tf.variable_scope('forward_dynamics'):
            self.obs0 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs0')
            self.obs1 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs1')
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, self.dimu),
                                          name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            obs0_norm = self.o_stats.normalize(self.obs0)
            obs1_norm = self.o_stats.normalize(self.obs1)
            actions_norm = self.u_stats.normalize(self.actions)
            inputs = tf.concat(values=[obs0_norm, actions_norm], axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_denorm = self.o_stats.denormalize(
                self.next_state_diff_tf + obs0_norm)

            # alternative: without input normalization
            # input = tf.concat(values=[self.obs0, self.actions], axis=-1)
            # self.next_state_diff_tf = nn(input,[hidden] * layers+ [self.dimo])
            # self.next_state_tf = self.next_state_diff_tf + self.obs0
            # self.next_state_denorm = self.next_state_tf

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - obs1_norm + obs0_norm), axis=1)
        # self.per_sample_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.test_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_denorm - self.obs1))
        # self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1))

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initialize variables
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()

    def predict_next_state(self, obs0, actions):
        obs1 = self.sess.run(self.next_state_denorm,
                             feed_dict={
                                 self.obs0: obs0,
                                 self.actions: actions
                             })
        return obs1

    def _get_intrinsic_rewards(self, obs0, actions, obs1):
        intrinsic_rewards = self.sess.run(self.per_sample_loss_tf,
                                          feed_dict={
                                              self.obs0: obs0,
                                              self.actions: actions,
                                              self.obs1: obs1
                                          })
        return intrinsic_rewards

    def update(self, obs0, actions, obs1):
        dynamics_grads, dynamics_loss, dynamics_per_sample_loss, test_loss = self.sess.run(
            [
                self.dynamics_grads, self.mean_loss_tf,
                self.per_sample_loss_tf, self.test_loss_tf
            ],
            feed_dict={
                self.obs0: obs0,
                self.actions: actions,
                self.obs1: obs1
            })
        self.dynamics_adam.update(dynamics_grads, stepsize=self.learning_rate)
        return dynamics_loss, test_loss

    def get_intrinsic_rewards(self, obs0, actions, obs1, update=True):
        if update:
            return self.update(obs0, actions, obs1)
        else:
            return self._get_intrinsic_rewards(obs0, actions, obs1)
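
A hedged usage sketch for ForwardDynamics: obs0_batch, act_batch and obs1_batch are arrays of shape (batch, dimo) / (batch, dimu), and o_stats / u_stats are running normalizer objects exposing normalize/denormalize, as the constructor assumes (the variable names below are illustrative):

fd = ForwardDynamics(dimo=obs_dim, dimu=act_dim, o_stats=o_stats, u_stats=u_stats)

# one gradient step on a batch of transitions; returns training and test loss
dyn_loss, test_loss = fd.update(obs0_batch, act_batch, obs1_batch)

# curiosity bonus without updating the model: per-sample prediction error
bonus = fd.get_intrinsic_rewards(obs0_batch, act_batch, obs1_batch, update=False)
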
Example #7
class ForwardDynamicsNumpy:
    @store_args
    def __init__(self,
                 dimo,
                 dimu,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=256,
                 layers=8,
                 learning_rate=1e-3):
        self.obs_normalizer = NormalizerNumpy(size=dimo, eps=norm_eps)
        self.action_normalizer = NormalizerNumpy(size=dimu, eps=norm_eps)
        self.sess = U.get_session()

        with tf.variable_scope('forward_dynamics_numpy'):
            self.obs0_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs0')
            self.obs1_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs1')
            self.actions_norm = tf.placeholder(tf.float32,
                                               shape=(None, self.dimu),
                                               name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            inputs = tf.concat(values=[self.obs0_norm, self.actions_norm],
                               axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_norm_tf = self.next_state_diff_tf + self.obs0_norm

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - self.obs1_norm + self.obs0_norm),
            axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initialize variables
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()

    def predict_next_state(self, obs0, actions):
        obs0_norm = self.obs_normalizer.normalize(obs0)
        action_norm = self.action_normalizer.normalize(actions)
        obs1_norm_pred = self.sess.run(
            self.next_state_norm_tf,
            feed_dict={
                self.obs0_norm: obs0_norm,
                self.actions_norm: action_norm
            })
        # the network predicts in normalized space; map back to raw observations
        obs1_pred = self.obs_normalizer.denormalize(obs1_norm_pred)
        return obs1_pred

    def clip_gauss_noise(self, size):
        clip_range = 0.002
        std = 0.001
        return np.clip(np.random.normal(0, std, size), -clip_range, clip_range)
        # return 0

    def update(self, obs0, actions, obs1, times=1):
        self.obs_normalizer.update(obs0)
        self.obs_normalizer.update(obs1)
        self.action_normalizer.update(actions)

        for _ in range(times):
            obs0_norm = self.obs_normalizer.normalize(
                obs0) + self.clip_gauss_noise(size=self.dimo)
            action_norm = self.action_normalizer.normalize(
                actions) + self.clip_gauss_noise(size=self.dimu)
            obs1_norm = self.obs_normalizer.normalize(
                obs1)  #+ self.clip_gauss_noise(size=self.dimo)

            dynamics_grads, dynamics_loss, dynamics_per_sample_loss = self.sess.run(
                [
                    self.dynamics_grads, self.mean_loss_tf,
                    self.per_sample_loss_tf
                ],
                feed_dict={
                    self.obs0_norm: obs0_norm,
                    self.actions_norm: action_norm,
                    self.obs1_norm: obs1_norm
                })
            self.dynamics_adam.update(dynamics_grads,
                                      stepsize=self.learning_rate)
        return dynamics_loss
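
Unlike ForwardDynamics above, this variant owns its NumPy normalizers, so raw batches can be passed in directly; update() refreshes the running statistics before each gradient step, and predict_next_state() maps the prediction back to raw observation space. A hedged usage sketch (array names are illustrative):

fd = ForwardDynamicsNumpy(dimo=obs_dim, dimu=act_dim)

# update normalizer statistics and take one Adam step on the batch
loss = fd.update(obs0_batch, act_batch, obs1_batch, times=1)

# predict the denormalized next observation for a batch of state-action pairs
next_obs = fd.predict_next_state(obs0_batch, act_batch)
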
Example #8
class DDPG(Algorithm):
    @store_args
    def __init__(self,
                 buffer,
                 input_dims,
                 hidden,
                 layers,
                 polyak,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 gamma,
                 vloss_type='normal',
                 priority=False,
                 reuse=False,
                 **kwargs):
        """
        see algorithm
        """
        super(DDPG, self).__init__(**self.__dict__)

    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        # normalizer for input
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(ActorCritic, reuse, batch_tf)

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf)

        self.abs_td_error_tf = tf.abs(
            tf.stop_gradient(target_tf) - self.main.Q_tf)
        self.Q_loss = tf.square(self.abs_td_error_tf)
        if self.priority:
            self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss)
        else:
            self.Q_loss_tf = tf.reduce_mean(self.Q_loss)
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # variables
        self.main_Q_var = get_var(self.scope + '/main/Q')
        self.main_pi_var = get_var(self.scope + '/main/pi')
        self.target_Q_var = get_var(self.scope + '/target/Q')
        self.target_pi_var = get_var(self.scope + '/target/pi')

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var)
        assert len(self.main_Q_var) == len(Q_grads_tf)
        assert len(self.main_pi_var) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var)
        self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var)
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self.main_Q_var)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self.main_pi_var)

        # optimizers
        self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False)
        self.main_vars = self.main_Q_var + self.main_pi_var
        self.target_vars = self.target_Q_var + self.target_pi_var
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()

    def _sync_optimizers(self):
        self.Q_adam.sync()
        self.pi_adam.sync()

    def _grads(self):  # Avoid feed_dict here for performance!
        critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error = self.sess.run(
            [
                self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf,
                self.pi_grad_tf, self.abs_td_error_tf
            ])
        return critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error

    def _update(self, Q_grad, pi_grad):
        self.Q_adam.update(Q_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)
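
For reference, the critic and actor objectives assembled in _create_network follow the standard DDPG-with-HER losses: a clipped one-step TD target for Q and a policy loss that maximizes Q with an L2 penalty on the scaled actions. A minimal NumPy sketch, assuming _clip_target computes the clipped bootstrap r + gamma * Q_target(s', pi_target(s')) (all array names are illustrative):

import numpy as np

target = np.clip(r + gamma * target_Q_pi,
                 -clip_return, 0.0 if clip_pos_returns else np.inf)
td_error = np.abs(target - main_Q)

Q_loss = np.mean(td_error ** 2)  # optionally weighted by the priority weights w
pi_loss = -np.mean(main_Q_pi) + action_l2 * np.mean((main_pi / max_u) ** 2)
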
Example #9
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        # normalizer for input
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(ActorCritic, reuse, batch_tf)

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf)

        self.abs_td_error_tf = tf.abs(
            tf.stop_gradient(target_tf) - self.main.Q_tf)
        self.Q_loss = tf.square(self.abs_td_error_tf)
        if self.priority:
            self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss)
        else:
            self.Q_loss_tf = tf.reduce_mean(self.Q_loss)
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # variables
        self.main_Q_var = get_var(self.scope + '/main/Q')
        self.main_pi_var = get_var(self.scope + '/main/pi')
        self.target_Q_var = get_var(self.scope + '/target/Q')
        self.target_pi_var = get_var(self.scope + '/target/pi')

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var)
        assert len(self.main_Q_var) == len(Q_grads_tf)
        assert len(self.main_pi_var) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var)
        self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var)
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self.main_Q_var)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self.main_pi_var)

        # optimizers
        self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False)
        self.main_vars = self.main_Q_var + self.main_pi_var
        self.target_vars = self.target_Q_var + self.target_pi_var
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()