Example #1
class C51(make_off_policy_class(mode='share')):
    '''
    Categorical 51 (C51), https://arxiv.org/abs/1707.06887
    No double, no dueling, no noisy net.
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 hidden_units=[128, 128],
                 **kwargs):
        assert not is_continuous, 'C51 only supports discrete action spaces'
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.v_min = v_min
        self.v_max = v_max
        self.atoms = atoms
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = tf.reshape(tf.constant([self.v_min + i * self.delta_z for i in range(self.atoms)], dtype=tf.float32), [-1, self.atoms])  # [1, N]
        self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net(): return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

        self.q_dist_net = _net()
        self.q_target_dist_net = _net()
        self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.q_dist_net,
            optimizer=self.optimizer
        ))

    def show_logo(self):
        self.logger.info('''
     xxxxxxx         xxxxx          xxx      
    xxxx xxx         xxxx          xxxx      
   xxxx    x        xxxx             xx      
   xxx     x        xxxxx            xx      
   xxx                xxx            xx      
   xxx                 xxx           xx      
   xxx                  xx           xx      
    xxx    x        xx xx            xx      
    xxxxxxxx        xxxxx           xxxx     
      xxxxx          x              xxxx    
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.get_q(feat)  # [B, A]
        return tf.argmax(q, axis=-1), cell_state  # [B, ]

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                indexs = tf.reshape(tf.range(batch_size), [-1, 1])  # [B, 1]
                q_dist = self.q_dist_net(feat)  # [B, A, N]
                q_dist = tf.transpose(tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a, axis=-1), [1, 0])  # [B, N]
                q_eval = tf.reduce_sum(q_dist * self.z, axis=-1)
                target_q_dist = self.q_target_dist_net(feat_)  # [B, A, N]
                target_q = tf.reduce_sum(self.zb * target_q_dist, axis=-1)  # [B, A, N] => [B, A]
                a_ = tf.reshape(tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32), [-1, 1])  # [B, 1]
                target_q_dist = tf.gather_nd(target_q_dist, tf.concat([indexs, a_], axis=-1))   # [B, N]
                target = tf.tile(r, tf.constant([1, self.atoms])) \
                    + self.gamma * tf.multiply(self.z,   # [1, N]
                                               (1.0 - tf.tile(done, tf.constant([1, self.atoms]))))  # [B, N], [1, N]* [B, N] = [B, N]
                target = tf.clip_by_value(target, self.v_min, self.v_max)  # [B, N]
                b = (target - self.v_min) / self.delta_z  # [B, N]
                u, l = tf.math.ceil(b), tf.math.floor(b)  # [B, N]
                u_id, l_id = tf.cast(u, tf.int32), tf.cast(l, tf.int32)  # [B, N]
                u_minus_b, b_minus_l = u - b, b - l  # [B, N]
                index_help = tf.tile(indexs, tf.constant([1, self.atoms]))  # [B, N]
                index_help = tf.expand_dims(index_help, -1)  # [B, N, 1]
                u_id = tf.concat([index_help, tf.expand_dims(u_id, -1)], axis=-1)    # [B, N, 2]
                l_id = tf.concat([index_help, tf.expand_dims(l_id, -1)], axis=-1)    # [B, N, 2]
                _cross_entropy = tf.stop_gradient(target_q_dist * u_minus_b) * tf.math.log(tf.gather_nd(q_dist, l_id)) \
                    + tf.stop_gradient(target_q_dist * b_minus_l) * tf.math.log(tf.gather_nd(q_dist, u_id))  # [B, N]
                # tf.debugging.check_numerics(_cross_entropy, '_cross_entropy')
                cross_entropy = -tf.reduce_sum(_cross_entropy, axis=-1)  # [B,]
                # tf.debugging.check_numerics(cross_entropy, 'cross_entropy')
                loss = tf.reduce_mean(cross_entropy * isw) + crsty_loss
                td_error = cross_entropy
            grads = tape.gradient(loss, self.critic_tv)
            self.optimizer.apply_gradients(
                zip(grads, self.critic_tv)
            )
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])

    @tf.function(experimental_relax_shapes=True)
    def get_q(self, feat):
        with tf.device(self.device):
            return tf.reduce_sum(self.zb * self.q_dist_net(feat), axis=-1)  # [B, A, N] => [B, A]
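
Note: the categorical projection that train implements with tensor ops above can be written out in isolation. Below is a minimal NumPy sketch, not taken from the RLs codebase (the helper name project_distribution is illustrative): it shifts the support by r + gamma * (1 - done) * z, clips it to [v_min, v_max], and splits each target probability between the two nearest atoms.

import numpy as np

def project_distribution(target_probs, r, done, z, v_min, v_max, gamma):
    """Project the shifted support r + gamma*(1-done)*z back onto the fixed atoms z."""
    atoms = z.shape[0]
    delta_z = (v_max - v_min) / (atoms - 1)
    tz = np.clip(r + gamma * (1.0 - done) * z, v_min, v_max)  # shifted, clipped atoms, [N]
    b = (tz - v_min) / delta_z                                # fractional atom indices, [N]
    l, u = np.floor(b).astype(int), np.ceil(b).astype(int)
    m = np.zeros(atoms)
    for j in range(atoms):
        if l[j] == u[j]:                      # landed exactly on an atom
            m[l[j]] += target_probs[j]
        else:                                 # split mass between the two neighbours
            m[l[j]] += target_probs[j] * (u[j] - b[j])
            m[u[j]] += target_probs[j] * (b[j] - l[j])
    return m                                  # still a valid distribution, m.sum() == 1

# toy usage: project a uniform 51-atom target for r = 1, gamma = 0.99
z = np.linspace(-10, 10, 51)
m = project_distribution(np.full(51, 1 / 51), r=1.0, done=0.0, z=z,
                         v_min=-10, v_max=10, gamma=0.99)
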
Example #2
class CURL(make_off_policy_class(mode='no_share')):
    """
    CURL: Contrastive Unsupervised Representations for Reinforcement Learning, http://arxiv.org/abs/2004.04136
    """
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            discrete_tau=1.0,
            log_std_bound=[-20, 2],
            hidden_units={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128],
                'encoder': 128
            },
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            curl_lr=5.0e-4,
            img_size=64,
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        assert self.visual_sources == 1
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        self.img_size = img_size
        self.img_dim = [img_size, img_size, self.visual_dim[-1]]
        self.vis_feat_size = hidden_units['encoder']

        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha,
                                                       1.0e6)

        if self.is_continuous:
            self.actor_net = ActorCts(self.s_dim + self.vis_feat_size,
                                      self.a_dim,
                                      hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.s_dim + self.vis_feat_size,
                                      self.a_dim,
                                      hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        def _q_net():
            return Critic(self.s_dim + self.vis_feat_size, self.a_dim,
                          hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)

        self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder'])
        self.encoder_target = VisualEncoder(self.img_dim,
                                            hidden_units['encoder'])

        self.curl_w = tf.Variable(
            initial_value=tf.random.normal(shape=(self.vis_feat_size,
                                                  self.vis_feat_size)),
            name='curl_w',
            dtype=tf.float32,
            trainable=True)

        self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables

        update_target_net_weights(
            self.critic_target_net.weights +
            self.encoder_target.trainable_variables,
            self.critic_net.weights + self.encoder.trainable_variables)
        self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                critic_net=self.critic_net,
                curl_w=self.curl_w,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
                optimizer_curl=self.optimizer_curl,
            ))

    def show_logo(self):
        self.logger.info('''
     xxxxxx      xxxxx xxxx      xxxxxxxx       xxxxx       
    xxx  xx        xx   xx         xx xxx         x         
    xx    xx       x     x         x   xxx        x         
    xx             x     x         x   xx         x         
   xxx             x     x         xxxxxx         x         
   xxx             x     x         xx xxx         x         
    xx    xx       xx   xx         x   xx         x    xx   
    xxx  xxx       xx   xx         x   xxx        x   xxx   
     xxxxxx        xxxxxxx       xxxxx xxx      xxxxxxxx
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        visual_s = center_crop_image(visual_s[:, 0], self.img_size)
        mu, pi = self._get_action(s, visual_s)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s):
        with tf.device(self.device):
            feat = tf.concat([self.encoder(visual_s), s], axis=-1)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min,
                                          self.log_std_max)
                pi, _ = squash_rsample(mu, log_std)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            td_error, summaries = self.train(memories, isw, crsty_loss,
                                             cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()),
                                tf.float32)))
            return td_error, summaries

        def _pre_process(data):
            data['visual_s'] = np.transpose(data['visual_s'][:, 0].numpy(),
                                            (0, 3, 1, 2))
            data['visual_s_'] = np.transpose(data['visual_s_'][:, 0].numpy(),
                                             (0, 3, 1, 2))
            data['pos'] = self.data_convert(
                np.transpose(random_crop(data['visual_s'], self.img_size),
                             (0, 2, 3, 1)))
            data['visual_s'] = self.data_convert(
                np.transpose(random_crop(data['visual_s'], self.img_size),
                             (0, 2, 3, 1)))
            data['visual_s_'] = self.data_convert(
                np.transpose(random_crop(data['visual_s_'], self.img_size),
                             (0, 2, 3, 1)))
            return (data, )

        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    _train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.critic_target_net.weights +
                        self.encoder_target.trainable_variables,
                        self.critic_net.weights +
                        self.encoder.trainable_variables, self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]]),
                    'train_data_list': [
                        's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done',
                        'pos'
                    ],
                    'pre_process_function':
                    _pre_process
                })

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        s, visual_s, a, r, s_, visual_s_, done, pos = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                vis_feat = self.encoder(visual_s)
                vis_feat_ = self.encoder(visual_s_)
                target_vis_feat_ = self.encoder_target(visual_s_)
                feat = tf.concat([vis_feat, s], axis=-1)
                feat_ = tf.concat([vis_feat_, s_], axis=-1)
                target_feat_ = tf.concat([target_vis_feat_, s_], axis=-1)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = squash_rsample(
                        target_mu, target_log_std)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss

                z_a = vis_feat  # [B, N]
                z_out = self.encoder_target(pos)
                logits = tf.matmul(
                    z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0])))
                logits -= tf.reduce_max(logits, axis=-1, keepdims=True)
                curl_loss = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        tf.range(self.batch_size), logits, from_logits=True))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            curl_grads = tape.gradient(curl_loss, [self.curl_w] +
                                       self.encoder.trainable_variables)
            self.optimizer_curl.apply_gradients(
                zip(curl_grads,
                    [self.curl_w] + self.encoder.trainable_variables))

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))

            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['LOSS/curl_loss', curl_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries
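
Note: the curl_loss term above is an InfoNCE objective with a learned bilinear similarity, where each anchor's positive sits on the diagonal of the [B, B] logits matrix. A minimal standalone TensorFlow sketch (the function name curl_info_nce and the toy shapes are illustrative, not part of the class):

import tensorflow as tf

def curl_info_nce(z_anchor, z_pos, W):
    """Contrastive loss: anchor i should be most similar to positive i."""
    logits = tf.matmul(z_anchor, tf.matmul(W, z_pos, transpose_b=True))  # [B, B]
    logits -= tf.reduce_max(logits, axis=-1, keepdims=True)              # numerical stability
    labels = tf.range(tf.shape(z_anchor)[0])                             # positives on the diagonal
    return tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True))

# toy usage: random 128-d encodings for a batch of 8 augmented pairs
B, N = 8, 128
loss = curl_info_nce(tf.random.normal([B, N]), tf.random.normal([B, N]),
                     tf.Variable(tf.random.normal([N, N])))
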
Example #3
File: maxsqn.py Project: ncepuwwy97/RLs
class MAXSQN(make_off_policy_class(mode='share')):
    '''
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'MAXSQN only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        self.log_alpha = tf.math.log(alpha) if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net():
            return Critic(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        update_target_net_weights(self.critic_target_net.weights,
                                  self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))

    def show_logo(self):
        self.logger.info('''
   xx     xx                                      xxxxxx         xxxxxx       xxxx   xx   
   xxx   xxx                                     xxx xxx        xxxx xxx      xxxx   xx   
   xxx   xxx        xxxxx          x   xx        xx             xx    xx      xxxxx  xx   
   xxxx  xxx       xxxxxx          xx xxx        xxxxxx         xx    xxx     xx xxx xx   
   xxxx xx x        x  xxx         xxxxx          xxxxxx       xx      xx     xx  xxxxx   
   xxxx xx x        xxxxxx          xxx               xxx      xxx  x xxx     xx   xxxx   
   xx xxx  x       xxx  xx          xxx          xx    xx       xx xxxxx      xx   xxxx   
   xx xxx  x       xx  xxx         xxxxx        xxxxxxxxx       xxx xxxx      xx    xxx   
   xx xxx  x       xxxxxxxx       xxx xxx        xxxxxxx         xxxxxxx      xx     xx   
                                                                  xxxxxxx                       
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        if self.use_epsilon and np.random.uniform(
        ) < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                       self.cell_state)
            a = pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q = self.critic_net.Q1(feat)
            cate_dist = tfp.distributions.Categorical(logits=q / self.alpha)
            pi = cate_dist.sample()
        return tf.argmax(q, axis=1), pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.critic_target_net.weights,
                        self.critic_net.weights, self.ployak),
                    'summary_dict':
                    dict([['LEARNING_RATE/q_lr',
                           self.q_lr(self.train_step)],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q1, q2 = self.critic_net(feat)
                q1_eval = tf.reduce_sum(tf.multiply(q1, a),
                                        axis=1,
                                        keepdims=True)
                q2_eval = tf.reduce_sum(tf.multiply(q2, a),
                                        axis=1,
                                        keepdims=True)

                q1_target, q2_target = self.critic_target_net(feat_)
                q1_target_max = tf.reduce_max(q1_target, axis=1, keepdims=True)
                q1_target_log_probs = tf.nn.log_softmax(q1_target / self.alpha,
                                                        axis=1) + 1e-8
                q1_target_entropy = -tf.reduce_mean(
                    tf.reduce_sum(
                        tf.exp(q1_target_log_probs) * q1_target_log_probs,
                        axis=1,
                        keepdims=True))

                q2_target_max = tf.reduce_max(q2_target, axis=1, keepdims=True)
                # q2_target_log_probs = tf.nn.log_softmax(q2_target, axis=1)
                # q2_target_log_max = tf.reduce_max(q2_target_log_probs, axis=1, keepdims=True)

                q_target = tf.minimum(
                    q1_target_max,
                    q2_target_max) + self.alpha * q1_target_entropy
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1_eval - dc_r
                td_error2 = q2_eval - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
            loss_grads = tape.gradient(loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(loss_grads, self.critic_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    q1 = self.critic_net.Q1(feat)
                    q1_log_probs = tf.nn.log_softmax(q1 / self.alpha,
                                                     axis=1) + 1e-8
                    q1_entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs,
                                      axis=1,
                                      keepdims=True))
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(self.target_entropy - q1_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/loss', loss], ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/q1_entropy', q1_entropy],
                 ['Statistics/q_min',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_mean', tf.reduce_mean(q1)],
                 ['Statistics/q_max',
                  tf.reduce_mean(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
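
Note: the soft target in train above is the clipped double-Q maximum plus an entropy bonus computed from softmax(Q1 / alpha). A minimal sketch of that target on toy numbers (the helper name soft_q_target is hypothetical; it keeps the entropy per sample, whereas the class averages it over the batch):

import tensorflow as tf

def soft_q_target(q1_target, q2_target, alpha):
    """min(max Q1, max Q2) plus alpha times the entropy of softmax(Q1 / alpha)."""
    q1_max = tf.reduce_max(q1_target, axis=1, keepdims=True)
    q2_max = tf.reduce_max(q2_target, axis=1, keepdims=True)
    log_probs = tf.nn.log_softmax(q1_target / alpha, axis=1)
    entropy = -tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True)
    return tf.minimum(q1_max, q2_max) + alpha * entropy

# toy usage: one state, two actions
q1 = tf.constant([[1.0, 2.0]])
q2 = tf.constant([[1.5, 1.8]])
target = soft_q_target(q1, q2, alpha=0.2)  # roughly 1.8 plus a small entropy bonus
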
Example #4
class TAC(make_off_policy_class(mode='share')):
    """Tsallis Actor Critic, TAC with V neural Network. https://arxiv.org/abs/1902.00137
    """
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            entropic_index=1.5,
            discrete_tau=1.0,
            log_std_bound=[-20, 2],
            hidden_units={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128]
            },
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)

        if self.is_continuous:
            self.actor_net = ActorCts(self.feat_dim, self.a_dim,
                                      hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim,
                                      hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        def _q_net():
            return CriticQ1(self.feat_dim, self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        update_target_net_weights(self.critic_target_net.weights,
                                  self.critic_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                critic_net=self.critic_net,
                log_alpha=self.log_alpha,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
            ))

    def show_logo(self):
        self.logger.info('''
   xxxxxxxxx          xx           xxxxxx    
   xx  x  xx         xxx          xxx  xx    
   xx  x  xx         xxx          xx    xx   
       x             x xx         xx         
       x            xx xx        xxx         
       x            xxxxxx       xxx         
       x           xx   xx        xx    xx   
       x           xx   xx        xxx  xxx   
     xxxxx        xxx  xxxxx       xxxxxx     
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min,
                                          self.log_std_max)
                pi, _ = tsallis_squash_rsample(mu, log_std,
                                               self.entropic_index)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            td_error, summaries = self.train(memories, isw, crsty_loss,
                                             cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()),
                                tf.float32)))
            return td_error, summaries

        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.critic_target_net.weights,
                        self.critic_net.weights, self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = tsallis_squash_rsample(
                        target_mu, target_log_std, self.entropic_index)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = tsallis_squash_rsample(
                        mu, log_std, self.entropic_index)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))

            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        pi, log_pi = tsallis_squash_rsample(
                            mu, log_std, self.entropic_index)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = tsallis_squash_rsample(
                        mu, log_std, self.entropic_index)
                    entropy = gaussian_entropy(log_std)
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = tsallis_squash_rsample(
                        target_mu, target_log_std, self.entropic_index)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))

                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                q_s_pi = self.critic_net.get_min(feat, pi)
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
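
Note: TAC replaces the Shannon logarithm in the entropy term with a Tsallis q-logarithm (the constructor stores 2 - entropic_index for tsallis_squash_rsample to consume). A minimal sketch of the q-logarithm itself as defined in the paper; the helper name tsallis_log is hypothetical and the exact form used inside tsallis_squash_rsample may differ:

import tensorflow as tf

def tsallis_log(x, q):
    """q-logarithm: (x**(1 - q) - 1) / (1 - q); recovers tf.math.log(x) as q -> 1."""
    if abs(q - 1.0) < 1e-6:
        return tf.math.log(x)
    return (tf.pow(x, 1.0 - q) - 1.0) / (1.0 - q)

# as q -> 1 the q-log approaches the natural log
x = tf.constant([0.5, 1.0, 2.0])
print(tsallis_log(x, q=1.0))   # identical to tf.math.log(x)
print(tsallis_log(x, q=1.5))   # Tsallis variant with entropic index q = 1.5
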
Example #5
class SQL(make_off_policy_class(mode='share')):
    '''
        Soft Q-Learning.
        Reinforcement Learning with Deep Energy-Based Policies: https://arxiv.org/abs/1702.08165
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 alpha=2,
                 ployak=0.995,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'SQL only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.alpha = alpha
        self.ployak = ployak

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
       xxxxxx           xxxxxx          xxxxx         
      xxxxxxx          xxxxxxxxx         xxx          
     xxx  xxx         xxxx   xxx          xx          
     xxx   xx         xxx    xxxx         xx          
     xxxxx           xxx      xxx         xx          
      xxxxx          xxx      xxx         xx          
       xxxxxx        xxx      xxx         xx          
         xxxx        xxx      xxx         xx          
     xx   xxxx        xxx     xxx         xx     x    
     xx    xx         xxxx   xxx          xx    xx    
     xxxxxxxx          xxxxxxxxx        xxxxxxxxx     
     xxxxxxx            xxxxxx                        
                          xxxx                        
                            xxxx  
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q_values = self.q_net(feat)
            probs = tf.math.exp(
                (q_values - self.get_v(q_values)) / self.alpha)  # pi(a|s) proportional to exp((Q - V) / alpha)
            cate_dist = tfp.distributions.Categorical(probs=probs)
            pi = cate_dist.sample()
        return pi, cell_state

    @tf.function
    def get_v(self, q):
        with tf.device(self.device):
            v = self.alpha * tf.math.log(
                tf.reduce_mean(
                    tf.math.exp(q / self.alpha), axis=1, keepdims=True))
        return v

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.q_target_net.weights, self.q_net.weights,
                        self.ployak),
                    'summary_dict':
                    dict([['LEARNING_RATE/lr',
                           self.lr(self.train_step)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q = self.q_net(feat)
                q_next = self.q_target_net(feat_)
                v_next = self.get_v(q_next)
                q_eval = tf.reduce_sum(tf.multiply(q, a),
                                       axis=1,
                                       keepdims=True)
                q_target = tf.stop_gradient(r + self.gamma *
                                            (1 - done) * v_next)
                td_error = q_eval - q_target
                q_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict(
                [['LOSS/loss', q_loss],
                 ['Statistics/q_max',
                  tf.reduce_max(q_eval)],
                 ['Statistics/q_min',
                  tf.reduce_min(q_eval)],
                 ['Statistics/q_mean',
                  tf.reduce_mean(q_eval)]])
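
Note: the soft value in get_v above is alpha * log(mean(exp(Q / alpha))), a temperature-smoothed maximum over actions; using the mean instead of the usual sum only shifts V by the constant alpha * log|A|, which does not change the induced policy. A standalone check on toy numbers (the helper name soft_value is illustrative):

import tensorflow as tf

def soft_value(q, alpha):
    """alpha * log(mean(exp(q / alpha))): approaches max(q, axis=1) as alpha -> 0."""
    return alpha * tf.math.log(
        tf.reduce_mean(tf.math.exp(q / alpha), axis=1, keepdims=True))

q = tf.constant([[1.0, 2.0, 3.0]])
print(soft_value(q, alpha=2.0))  # well below 3.0: heavy smoothing
print(soft_value(q, alpha=0.1))  # close to 3.0: nearly the greedy value
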
Example #6
class DDPG(make_off_policy_class(mode='share')):
    '''
    Deep Deterministic Policy Gradient, https://arxiv.org/abs/1509.02971
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau

        if self.is_continuous:

            def _actor_net():
                return ActorCts(self.feat_dim, self.a_dim,
                                hidden_units['actor_continuous'])

            # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:

            def _actor_net():
                return ActorDcs(self.feat_dim, self.a_dim,
                                hidden_units['actor_discrete'])

            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net():
            return Critic(self.feat_dim, self.a_dim, hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(
            self.actor_target_net.weights + self.q_target_net.weights,
            self.actor_net.weights + self.q_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.q_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))

    def show_logo(self):
        self.logger.info('''
   xxxxxxx        xxxxxxx        xxxxxxxx        xxxxxx     
     x  xxx         x  xxx         xx  xx       xxx  xx     
     x   xx         x   xx         x   xxx      xx    x     
     x   xx         x   xx         x   xxx      xx          
     x   xxx        x   xxx        xxxxxx       x   xxxxx   
     x   xx         x   xx         x            xx   xxx    
     x   xx         x   xx         x            xx    x     
     x  xxx         x  xxx         x            xxx  xx     
   xxxxxxx        xxxxxxx        xxxxx           xxxxxx     
                                                   xx     
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.actor_target_net.weights + self.q_target_net.
                        weights, self.actor_net.weights + self.q_net.weights,
                        self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(
                        target_mu + self.action_noise(), -1, 1)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q = self.q_net(feat, a)
                q_target = self.q_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error = q - dc_r
                q_loss = 0.5 * tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss
            q_grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(q_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                else:
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.q_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([['LOSS/actor_loss', actor_loss],
                                   ['LOSS/critic_loss', q_loss],
                                   ['Statistics/q_min',
                                    tf.reduce_min(q)],
                                   ['Statistics/q_mean',
                                    tf.reduce_mean(q)],
                                   ['Statistics/q_max',
                                    tf.reduce_max(q)]])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(
                        target_mu + self.action_noise(), -1, 1)
                    mu = self.actor_net(feat)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q = self.q_net(feat, a)
                q_target = self.q_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error = q - dc_r
                q_loss = 0.5 * tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss

                q_actor = self.q_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            q_grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(q_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([['LOSS/actor_loss', actor_loss],
                                   ['LOSS/critic_loss', q_loss],
                                   ['Statistics/q_min',
                                    tf.reduce_min(q)],
                                   ['Statistics/q_mean',
                                    tf.reduce_mean(q)],
                                   ['Statistics/q_max',
                                    tf.reduce_max(q)]])
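
For discrete actions, DDPG cannot differentiate through argmax, so both train functions above use a Gumbel-Softmax relaxation plus the straight-through correction (_pi_diff + _pi): the forward pass uses the one-hot action while gradients flow through the softmax. Below is a self-contained TensorFlow sketch of that estimator; the function name is illustrative.

import tensorflow as tf
import tensorflow_probability as tfp

def gumbel_softmax_straight_through(logits, tau=1.0):
    # Relax the argmax with Gumbel(0, 1) noise and temperature tau, then apply a
    # straight-through estimator: forward pass is one-hot, gradients follow the
    # soft sample.
    gumbel = tfp.distributions.Gumbel(0., 1.).sample(tf.shape(logits))
    soft = tf.nn.softmax((tf.nn.log_softmax(logits) + gumbel) / tau)
    hard = tf.one_hot(tf.argmax(soft, axis=-1), tf.shape(logits)[-1])
    return soft + tf.stop_gradient(hard - soft)

logits = tf.constant([[2.0, 0.5, -1.0]])
print(gumbel_softmax_straight_through(logits, tau=1.0))
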
Example #7
class BootstrappedDQN(make_off_policy_class(mode='share')):
    '''
    Deep Exploration via Bootstrapped DQN, http://arxiv.org/abs/1602.04621
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'Bootstrapped DQN only supports discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = [1. / head_num for _ in range(head_num)]
        self.now_head = 0

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, self.head_num,
                           hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
    xxxxxxx                      xxxxxxxx         xxxxxx      xxxx   xxxx  
     xx xxxx                      xxxxxxxx       xxx xxxx       xxx    x   
     xx  xxx                      xx    xxx     xxx   xxxx      xxxx   x   
     xx  xxx                      xx    xxx     xxx    xxx      xxxxx  x   
     xxxxxx      xxx xxxx xxx     xx     xx     xx     xxx      x xxxx x   
     xx xxxx     xxx xxxx xxx     xx     xx     xxx    xxx      x  xxxxx   
     xx  xxx     xxx  xx  xxx     xx    xxx     xxx    xxx      x   xxxx   
     xx   xx                      xx   xxxx     xxx   xxx       x    xxx   
     xx xxxx                      xxxxxxxx       xxxxxxxx      xxx    xx   
    xxxxxxxx                     xxxxxxx          xxxxx                    
                                                    xxxx                   
                                                      xxx 
        ''')

    def reset(self):
        super().reset()
        self.now_head = np.random.randint(self.head_num)

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(
                self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            q, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            q = q.numpy()
            a = np.argmax(q[self.now_head],
                          axis=1)  # [H, B, A] => [B, A] => [B, ]
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q_values = self.q_net(feat)  # [H, B, A]
        return q_values, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights,
                                          self.q_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    _update,
                    'summary_dict':
                    dict([['LEARNING_RATE/lr',
                           self.lr(self.train_step)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q = self.q_net(feat)  # [H, B, A]
                q_next = self.q_target_net(feat_)  # [H, B, A]
                q_eval = tf.reduce_sum(
                    tf.multiply(q, a), axis=-1,
                    keepdims=True)  # [H, B, A] * [B, A] => [H, B, 1]
                q_target = tf.stop_gradient(
                    r + self.gamma *
                    (1 - done) * tf.reduce_max(q_next, axis=-1, keepdims=True))
                td_error = q_eval - q_target  # [H, B, 1]
                td_error = tf.reduce_sum(td_error, axis=-1)  # [H, B]

                mask_dist = tfp.distributions.Bernoulli(probs=self._probs)
                mask = tf.cast(
                    tf.transpose(mask_dist.sample(batch_size), [1, 0]),
                    tf.float32)  # [H, B], selects which heads see each sample
                q_loss = tf.reduce_mean(
                    tf.square(td_error) * mask * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return tf.reduce_mean(td_error, axis=0), dict([  # [H, B] => [B, ]
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean',
                 tf.reduce_mean(q_eval)]
            ])
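
Bootstrapped DQN keeps head_num independent Q heads and trains each on a random subset of transitions: the train function samples a per-(head, sample) Bernoulli mask with probabilities self._probs and weights the squared per-head TD errors with it. Below is a small NumPy sketch of that masking; masked_td_loss is an illustrative name.

import numpy as np

def masked_td_loss(td_error, head_probs, rng=None):
    # td_error: [H, B] per-head TD errors; head_probs: per-head Bernoulli
    # inclusion probabilities. Each head only trains on the transitions its
    # bootstrap mask selects.
    rng = rng or np.random.default_rng()
    h, b = td_error.shape
    mask = rng.binomial(1, np.asarray(head_probs)[:, None], size=(h, b))
    return np.mean(np.square(td_error) * mask)

td = np.random.randn(4, 8)            # 4 heads, batch of 8
print(masked_td_loss(td, [0.25] * 4))
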
Example #8
class HIRO(make_off_policy_class(mode='no_share')):
    '''
    Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                  sigma=1.0 *
                                                  np.ones(self.a_dim),
                                                  bound=0.5)

        def _high_actor_net():
            return ActorCts(self.s_dim, self.sub_goal_dim,
                            hidden_units['high_actor'])

        if self.is_continuous:

            def _low_actor_net():
                return ActorCts(self.s_dim + self.sub_goal_dim, self.a_dim,
                                hidden_units['low_actor'])
        else:

            def _low_actor_net():
                return ActorDcs(self.s_dim + self.sub_goal_dim, self.a_dim,
                                hidden_units['low_actor'])

            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        def _high_critic_net():
            return Critic(self.s_dim, self.sub_goal_dim,
                          hidden_units['high_critic'])

        def _low_critic_net():
            return Critic(self.s_dim + self.sub_goal_dim, self.a_dim,
                          hidden_units['low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)

    def generate_ir_func(self, mode='os'):
        if mode == 'os':
            return lambda last_feat, subgoal, feat: -tf.norm(
                last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True)
        elif mode == 'cos':
            return lambda last_feat, subgoal, feat: tf.expand_dims(
                -tf.keras.losses.cosine_similarity(
                    tf.cast(feat - last_feat, tf.float32),
                    tf.cast(subgoal, tf.float32),
                    axis=-1),
                axis=-1)

    def show_logo(self):
        self.logger.info('''
  xxxxx xxxxx        xxxx        xxxxxxx          xxxxxx    
    xx   xx           xx          xxxxxxx        xxx xxxx   
    xx   xx           xx          xx  xxx       xxx   xxx   
    xx   xx           xx          xx  xxx       xx     xxx  
    xxxxxxx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xx xxxx       xx     xxx  
    xx   xx           xx          xx  xxx       xxx   xxx   
  xxxxx xxxxx        xxxx        xxxxx xxxx      xxxxxxx   
        ''')

    def store_high_buffer(self, i):
        eps_len = len(self._high_s[i])
        intervals = list(range(0, eps_len, self.sub_goal_steps))
        if len(intervals) < 1:
            return
        left = intervals[:-1]
        right = intervals[1:]
        s, r, a, g, d, s_ = [], [], [], [], [], []
        for _l, _r in zip(left, right):
            s.append(self._high_s[i][_l:_r])
            r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale)
            a.append(self._high_a[i][_l:_r])
            g.append(self._subgoals[i][_l])
            d.append(self._done[i][_r - 1])
            s_.append(self._high_s_[i][_r - 1])

        right = intervals[-1]
        s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        r.append(sum(self._high_r[i][right:eps_len]))
        a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        g.append(self._subgoals[i][right])
        d.append(self._done[i][-1])
        s_.append(self._high_s_[i][-1])
        self.data_high.add(np.array(s),
                           np.array(r)[:, np.newaxis], np.array(a),
                           np.array(g),
                           np.array(d)[:, np.newaxis], np.array(s_))

    def reset(self):
        self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32)

        for i in range(self.n_agents):
            self.store_high_buffer(i)
        self._high_r = [[] for _ in range(self.n_agents)]
        self._high_a = [[] for _ in range(self.n_agents)]
        self._high_s = [[] for _ in range(self.n_agents)]
        self._subgoals = [[] for _ in range(self.n_agents)]
        self._done = [[] for _ in range(self.n_agents)]
        self._high_s_ = [[] for _ in range(self.n_agents)]

        self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim),
                                     dtype=np.float32)

    def partial_reset(self, done):
        self._c = np.where(
            done[:, np.newaxis],
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c)
        idx = np.where(done)[0]
        for i in idx:
            self.store_high_buffer(i)
            self._high_s[i] = []
            self._high_a[i] = []
            self._high_s_[i] = []
            self._high_r[i] = []
            self._done[i] = []
            self._subgoals[i] = []

    @tf.function
    def _get_action(self, s, visual_s, subgoal):
        with tf.device(self.device):
            feat = tf.concat([s, subgoal], axis=-1)
            if self.is_continuous:
                mu = self.low_actor(feat)
                pi = tf.clip_by_value(mu + self.low_noise(), -1, 1)
            else:
                logits = self.low_actor(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfd.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def choose_action(self, s, visual_s, evaluation=False):
        self._subgoal = np.where(self._c == self.sub_goal_steps,
                                 self.get_subgoal(s).numpy(),
                                 self._new_subgoal)
        mu, pi = self._get_action(s, visual_s, self._subgoal)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def get_subgoal(self, s):
        '''
        Generate a new sub-goal for the current state s: run the high-level
        actor, add exploration noise, and clip to the allowed goal range.
        '''
        new_subgoal = self.high_scale * self.high_actor(s)
        new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(),
                                       -self.high_scale, self.high_scale)
        return new_subgoal

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(
                    self.data_low,
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
                high_data = self.get_transitions(
                    self.data_high,
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

                # -------------------------------------- gather the arguments passed to the train functions
                _low_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                    data_dict=low_data)
                _high_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                    data_dict=high_data)
                summaries = self.train_low(_low_training_data)

                self.summaries.update(summaries)
                update_target_net_weights(
                    self.low_actor_target.weights +
                    self.low_critic_target.weights,
                    self.low_actor.weights + self.low_critic.weights,
                    self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(_high_training_data)
                    self.summaries.update(high_summaries)
                    update_target_net_weights(
                        self.high_actor_target.weights +
                        self.high_critic_target.weights,
                        self.high_actor.weights + self.high_critic.weights,
                        self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([[
                        'LEARNING_RATE/low_actor_lr',
                        self.low_actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/low_critic_lr',
                              self.low_critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_actor_lr',
                              self.high_actor_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_critic_lr',
                              self.high_critic_lr(self.train_step)
                          ]]))
                self.write_training_summaries(self.global_step, self.summaries)

    @tf.function(experimental_relax_shapes=True)
    def train_low(self, memories):
        s, a, r, s_, done, g, g_ = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = tf.concat([s, g], axis=-1)
                feat_ = tf.concat([s_, g_], axis=-1)

                if self.is_continuous:
                    target_mu = self.low_actor_target(feat_)
                    action_target = tf.clip_by_value(
                        target_mu + self.low_noise(), -1, 1)
                else:
                    target_logits = self.low_actor_target(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [tf.shape(feat_)[0], self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q1, q2 = self.low_critic(feat, a)
                q = tf.minimum(q1, q2)
                q_target = self.low_critic_target.get_min(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                low_critic_loss = q1_loss + q2_loss
            low_critic_grads = tape.gradient(low_critic_loss,
                                             self.low_critic.weights)
            self.low_critic_optimizer.apply_gradients(
                zip(low_critic_grads, self.low_critic.weights))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.low_actor(feat)
                else:
                    logits = self.low_actor(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.low_critic.Q1(feat, mu)
                low_actor_loss = -tf.reduce_mean(q_actor)
            low_actor_grads = tape.gradient(low_actor_loss,
                                            self.low_actor.trainable_variables)
            self.low_actor_optimizer.apply_gradients(
                zip(low_actor_grads, self.low_actor.trainable_variables))

            self.global_step.assign_add(1)
            return dict([['LOSS/low_actor_loss', low_actor_loss],
                         ['LOSS/low_critic_loss', low_critic_loss],
                         ['Statistics/low_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/low_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/low_q_max',
                          tf.reduce_max(q)]])

    @tf.function(experimental_relax_shapes=True)
    def train_high(self, memories):
        # s_ : [B, N]
        ss, r, aa, g, done, s_ = memories

        batchs = tf.shape(ss)[0]
        # ss, aa [B, T, *]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                s = ss[:, 0]  # [B, N]
                true_end = (s_ - s)[:, self.fn_goal_dim:]
                g_dist = tfd.Normal(loc=true_end,
                                    scale=0.5 * self.high_scale[None, :])
                ss = tf.expand_dims(ss, 0)  # [1, B, T, *]
                ss = tf.tile(ss,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]])  # [10*B*T, *]
                aa = tf.expand_dims(aa, 0)  # [1, B, T, *]
                aa = tf.tile(aa,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]])  # [10*B*T, *]
                gs = tf.concat([
                    tf.expand_dims(g, 0),
                    tf.expand_dims(true_end, 0),
                    tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2),
                                     -self.high_scale, self.high_scale)
                ],
                               axis=0)  # [10, B, N]

                all_g = gs + s[:, self.fn_goal_dim:]
                all_g = tf.expand_dims(all_g, 2)  # [10, B, 1, N]
                all_g = tf.tile(
                    all_g, [1, 1, self.sub_goal_steps, 1])  # [10, B, T, N]
                all_g = tf.reshape(all_g,
                                   [-1, tf.shape(all_g)[-1]])  # [10*B*T, N]
                all_g = all_g - ss[:, self.fn_goal_dim:]  # [10*B*T, N]
                feat = tf.concat([ss, all_g], axis=-1)  # [10*B*T, *]
                _aa = self.low_actor(feat)  # [10*B*T, A]
                if not self.is_continuous:
                    _aa = tf.one_hot(tf.argmax(_aa, axis=-1),
                                     self.a_dim,
                                     dtype=tf.float32)
                diff = _aa - aa
                diff = tf.reshape(
                    diff,
                    [self.sample_g_nums, batchs, self.sub_goal_steps, -1
                     ])  # [10, B, T, A]
                diff = tf.transpose(diff, [1, 0, 2, 3])  # [B, 10, T, A]
                logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2,
                                             axis=-1)  # [B, 10]
                idx = tf.argmax(logps, axis=-1, output_type=tf.int32)
                idx = tf.stack([tf.range(batchs), idx], axis=1)  # [B, 2]
                g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx)  # [B, N]

                q1, q2 = self.high_critic(s, g)
                q = tf.minimum(q1, q2)

                target_sub_goal = self.high_actor_target(s_) * self.high_scale
                target_sub_goal = tf.clip_by_value(
                    target_sub_goal + self.high_noise(), -self.high_scale,
                    self.high_scale)
                q_target = self.high_critic_target.get_min(s_, target_sub_goal)

                dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target)
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                high_critic_loss = q1_loss + q2_loss

            high_critic_grads = tape.gradient(high_critic_loss,
                                              self.high_critic.weights)
            self.high_critic_optimizer.apply_gradients(
                zip(high_critic_grads, self.high_critic.weights))
            with tf.GradientTape() as tape:
                mu = self.high_actor(s) * self.high_scale
                q_actor = self.high_critic.Q1(s, mu)
                high_actor_loss = -tf.reduce_mean(q_actor)
            high_actor_grads = tape.gradient(
                high_actor_loss, self.high_actor.trainable_variables)
            self.high_actor_optimizer.apply_gradients(
                zip(high_actor_grads, self.high_actor.trainable_variables))
            return dict([['LOSS/high_actor_loss', high_actor_loss],
                         ['LOSS/high_critic_loss', high_critic_loss],
                         ['Statistics/high_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/high_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/high_q_max',
                          tf.reduce_max(q)]])

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store requires action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be an np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._noop_subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal,
                         s_[:, self.fn_goal_dim:])
        # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:]
        subgoal = np.random.uniform(-self.high_scale,
                                    self.high_scale,
                                    size=(self.n_agents, self.sub_goal_dim))
        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # expand to [B, 1]
            self._noop_subgoal,
            subgoal)
        self._noop_subgoal = subgoal

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store requires action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be an np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal,
                         s_[:, self.fn_goal_dim:])
        self._new_subgoal = np.where(
            self._c == 1,
            self.get_subgoal(s_).numpy(),
            s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:])

        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # expand to [B, 1]
            self._subgoal,
            self._new_subgoal)
        self._c = np.where(
            self._c == 1,
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c - 1)

    def get_transitions(self,
                        databuffer,
                        data_name_list=['s', 'a', 'r', 's_', 'done']):
        '''
        Sample a batch from `databuffer`; for discrete action spaces, convert
        the stored action indices to one-hot vectors before returning a dict.
        '''
        data = databuffer.sample()  # draw a batch from the replay buffer
        if not self.is_continuous and 'a' in data_name_list:
            a_idx = data_name_list.index('a')
            a = data[a_idx].astype(np.int32)
            pre_shape = a.shape
            a = a.reshape(-1)
            a = int2one_hot(a, self.a_dim)
            a = a.reshape(pre_shape + (-1, ))
            data[a_idx] = a
        return dict([[
            n, d
        ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
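
train_high above implements HIRO's off-policy goal relabeling: it builds sample_g_nums candidate sub-goals (the stored goal, the observed state change s_ - s, and Gaussian samples around it), scores each by how well the current low-level actor reproduces the logged actions, and trains the high-level critic on the best-scoring goal. Below is a simplified NumPy sketch of that selection; relabel_goal and the toy low_actor are illustrative, not part of the class above.

import numpy as np

def relabel_goal(candidate_goals, states, actions, low_actor):
    # candidate_goals: [G, N]; states: [T, S]; actions: [T, A].
    # Score each candidate sub-goal by how well the current low-level policy,
    # conditioned on it, reproduces the actions actually taken, and keep the
    # candidate with the highest (unnormalized) log-likelihood.
    logps = []
    for g in candidate_goals:
        goal = np.tile(g, (len(states), 1))
        predicted = low_actor(np.concatenate([states, goal], axis=-1))
        logps.append(-0.5 * np.sum(np.linalg.norm(predicted - actions, axis=-1) ** 2))
    return candidate_goals[int(np.argmax(logps))]

rng = np.random.default_rng(0)
low_actor = lambda x: np.tanh(x[:, :2])     # hypothetical 2-D action head
goals = rng.normal(size=(10, 3))
states, actions = rng.normal(size=(5, 3)), rng.normal(size=(5, 2))
print(relabel_goal(goals, states, actions, low_actor))
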
Example #9
class PD_DDPG(make_off_policy_class(mode='share')):
    '''
    Accelerated Primal-Dual Policy Optimization for Safe Reinforcement Learning, http://arxiv.org/abs/1802.06480
    Refer to https://github.com/anita-hu/TF2-RL/blob/master/Primal-Dual_DDPG/TF2_PD_DDPG_Basic.py
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 ployak=0.995,
                 actor_lr=5.0e-4,
                 reward_critic_lr=1.0e-3,
                 cost_critic_lr=1.0e-3,
                 lambda_lr=5.0e-4,
                 discrete_tau=1.0,
                 cost_constraint=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'reward': [32, 32],
                     'cost': [32, 32]
                 },
                 **kwargs):
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self._lambda = tf.Variable(0.0, dtype=tf.float32)
        self.cost_constraint = cost_constraint  # constrain long-term cost: E[cost] <= d

        if self.is_continuous:
            def _actor_net(): return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:
            def _actor_net(): return ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _critic_net(hiddens): return Critic(self.feat_dim, self.a_dim, hiddens)
        self.reward_critic_net = _critic_net(hidden_units['reward'])
        self.reward_critic_target_net = _critic_net(hidden_units['reward'])
        self.cost_critic_net = _critic_net(hidden_units['cost'])
        self.cost_critic_target_net = _critic_net(hidden_units['cost'])

        self.reward_critic_tv = self.reward_critic_net.trainable_variables + self.other_tv
        update_target_net_weights(
            self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights,
            self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights
        )
        self.lambda_lr = lambda_lr
        self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr])
        self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(self.init_optimizer, [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr])

        self.model_recorder(dict(
            actor=self.actor_net,
            reward_critic=self.reward_critic_net,
            cost_critic=self.cost_critic_net,
            optimizer_actor=self.optimizer_actor,
            optimizer_reward_critic=self.optimizer_reward_critic,
            optimizer_cost_critic=self.optimizer_cost_critic
        ))

    def show_logo(self):
        self.logger.info('''
   xxxxxxxx       xxxxxxx        xxxxxxx        xxxxxxx        xxxxxxxx        xxxxxx     
     xx  xx         x  xxx         x  xxx         x  xxx         xx  xx       xxx  xx     
     x   xxx        x   xx         x   xx         x   xx         x   xxx      xx    x     
     x   xxx        x   xx         x   xx         x   xx         x   xxx      xx          
     xxxxxx         x   xxx        x   xxx        x   xxx        xxxxxx       x   xxxxx   
     x              x   xx         x   xx         x   xx         x            xx   xxx    
     x              x   xx         x   xx         x   xx         x            xx    x     
     x              x  xxx         x  xxx         x  xxx         x            xxx  xx     
   xxxxx          xxxxxxx        xxxxxxx        xxxxxxx        xxxxx           xxxxxx     
                                                                                 xx     
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights,
                    self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/reward_critic_lr', self.reward_critic_lr(self.train_step)],
                    ['LEARNING_RATE/cost_critic_lr', self.cost_critic_lr(self.train_step)]
                ]),
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'cost'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'cost'],
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, cost = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q_reward = self.reward_critic_net(feat, a)
                q_target = self.reward_critic_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error_reward = q_reward - dc_r
                reward_loss = 0.5 * tf.reduce_mean(tf.square(td_error_reward) * isw) + crsty_loss
            q_grads = tape.gradient(reward_loss, self.reward_critic_tv)
            self.optimizer_reward_critic.apply_gradients(
                zip(q_grads, self.reward_critic_tv)
            )

            with tf.GradientTape() as tape:
                q_cost = self.cost_critic_net(feat, a)
                q_target = self.cost_critic_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(cost + self.gamma * q_target * (1 - done))
                td_error_cost = q_cost - dc_r
                cost_loss = 0.5 * tf.reduce_mean(tf.square(td_error_cost) * isw) + crsty_loss
            q_grads = tape.gradient(cost_loss, self.cost_critic_net.trainable_variables)
            self.optimizer_cost_critic.apply_gradients(
                zip(q_grads, self.cost_critic_net.trainable_variables)
            )

            q_loss = reward_loss + cost_loss

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                else:
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                reward_actor = self.reward_critic_net(feat, mu)
                cost_actor = self.cost_critic_net(feat, mu)
                actor_loss = -tf.reduce_mean(reward_actor - self._lambda * cost_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv)
            )

            # update dual variable
            lambda_update = tf.reduce_mean(cost_actor - self.cost_constraint)
            self._lambda.assign_add(self.lambda_lr * lambda_update)
            self._lambda.assign(tf.maximum(self._lambda, 0.0))

            self.global_step.assign_add(1)
            return (td_error_reward + td_error_cost) / 2, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/reward_loss', reward_loss],
                ['LOSS/cost_loss', cost_loss],
                ['LOSS/q_loss', q_loss],
                ['Statistics/q_reward_min', tf.reduce_min(q_reward)],
                ['Statistics/q_reward_mean', tf.reduce_mean(q_reward)],
                ['Statistics/q_reward_max', tf.reduce_max(q_reward)],
                ['Statistics/q_cost_min', tf.reduce_min(q_cost)],
                ['Statistics/q_cost_mean', tf.reduce_mean(q_cost)],
                ['Statistics/q_cost_max', tf.reduce_max(q_cost)],
                ['Statistics/_lambda', self._lambda],
                ['Statistics/lambda_update', lambda_update]
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, cost = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                    mu = self.actor_net(feat)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_reward = self.reward_critic_net(feat, a)
                q_target = self.reward_critic_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error_reward = q_reward - dc_r
                reward_loss = 0.5 * tf.reduce_mean(tf.square(td_error_reward) * isw) + crsty_loss

                q_cost = self.cost_critic_net(tf.stop_gradient(feat), a)
                q_target = self.cost_critic_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(cost + self.gamma * q_target * (1 - done))
                td_error_cost = q_cost - dc_r
                cost_loss = 0.5 * tf.reduce_mean(tf.square(td_error_cost) * isw) + crsty_loss

                q_loss = reward_loss + cost_loss

                reward_actor = self.reward_critic_net(feat, mu)
                cost_actor = self.cost_critic_net(feat, mu)
                actor_loss = -tf.reduce_mean(reward_actor - self._lambda * cost_actor)
            q_grads = tape.gradient(reward_loss, self.reward_critic_tv)
            self.optimizer_reward_critic.apply_gradients(
                zip(q_grads, self.reward_critic_tv)
            )
            q_grads = tape.gradient(cost_loss, self.cost_critic_net.trainable_variables)
            self.optimizer_cost_critic.apply_gradients(
                zip(q_grads, self.cost_critic_net.trainable_variables)
            )
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv)
            )

            # update dual variable
            lambda_update = tf.reduce_mean(cost_actor - self.cost_constraint)
            self._lambda.assign_add(self.lambda_lr * lambda_update)
            self._lambda.assign(tf.maximum(self._lambda, 0.0))

            self.global_step.assign_add(1)
            return (td_error_reward + td_error_cost) / 2, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/reward_loss', reward_loss],
                ['LOSS/cost_loss', cost_loss],
                ['LOSS/q_loss', q_loss],
                ['Statistics/q_reward_min', tf.reduce_min(q_reward)],
                ['Statistics/q_reward_mean', tf.reduce_mean(q_reward)],
                ['Statistics/q_reward_max', tf.reduce_max(q_reward)],
                ['Statistics/q_cost_min', tf.reduce_min(q_cost)],
                ['Statistics/q_cost_mean', tf.reduce_mean(q_cost)],
                ['Statistics/q_cost_max', tf.reduce_max(q_cost)],
                ['Statistics/_lambda', self._lambda],
                ['Statistics/lambda_update', lambda_update]
            ])

    def get_cost(self, s, visual_s, a, r, s_, visual_s_, done):
        return np.abs(s_)[:, :1]    # example cost signal: absolute cart position (CartPole)

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store_data requires the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store_data requires the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store_data requires done to be an np.ndarray"
        self._running_average(s)
        cost = self.get_cost(s, visual_s, a, r, s_, visual_s_, done)
        self.data.add(
            s,
            visual_s,
            a,
            r[:, np.newaxis],   # expand to [B, 1]
            s_,
            visual_s_,
            done[:, np.newaxis],  # expand to [B, 1]
            cost
        )

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "no_op_store requires the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "no_op_store requires the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "no_op_store requires done to be an np.ndarray"
        self._running_average(s)
        cost = self.get_cost(s, visual_s, a, r, s_, visual_s_, done)
        self.data.add(
            s,
            visual_s,
            a,
            r[:, np.newaxis],
            s_,
            visual_s_,
            done[:, np.newaxis],  # expand to [B, 1]
            cost
        )
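
The class above ties its reward and cost critics together through the Lagrange multiplier `_lambda`: the actor maximizes Q_reward − λ·Q_cost, while λ is nudged upward whenever the predicted cost exceeds `cost_constraint` and is clipped at zero. Below is a minimal NumPy sketch of that dual update, using made-up batch values and hypothetical names (`dual_update`, `fake_cost_q`) purely for illustration:

import numpy as np

def dual_update(lam, cost_q_values, cost_constraint, lambda_lr=1e-2):
    # Projected gradient ascent on the Lagrange multiplier: lambda grows when the
    # average predicted cost violates the constraint, and is clipped at zero so the
    # penalty term in the actor loss never changes sign.
    violation = np.mean(cost_q_values) - cost_constraint
    return max(lam + lambda_lr * violation, 0.0)

lam = 0.5
for step in range(3):
    fake_cost_q = np.random.uniform(0.0, 2.0, size=32)  # stand-in for cost_critic_net(feat, mu)
    lam = dual_update(lam, fake_cost_q, cost_constraint=1.0)
    print(step, round(lam, 4))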
Example #10
File: dddqn.py    Project: ncepuwwy97/RLs
class DDDQN(make_off_policy_class(mode='share')):
    '''
    Dueling Double DQN, https://arxiv.org/abs/1511.06581
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 hidden_units={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        assert not is_continuous, 'dueling double dqn only support discrete action space'
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net(): return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.dueling_net = _net()
        self.dueling_target_net = _net()
        self.critic_tv = self.dueling_net.trainable_variables + self.other_tv
        update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.dueling_net,
            optimizer=self.optimizer
        ))

    def show_logo(self):
        self.logger.info('''
   xxxxxxxx       xxxxxxxx       xxxxxxxx         xxxxxx      xxxx   xxxx  
    xxxxxxxx       xxxxxxxx       xxxxxxxx       xxx xxxx       xxx    x   
    xx    xxx      xx    xxx      xx    xxx     xxx   xxxx      xxxx   x   
    xx    xxx      xx    xxx      xx    xxx     xxx    xxx      xxxxx  x   
    xx     xx      xx     xx      xx     xx     xx     xxx      x xxxx x   
    xx     xx      xx     xx      xx     xx     xxx    xxx      x  xxxxx   
    xx    xxx      xx    xxx      xx    xxx     xxx    xxx      x   xxxx   
    xx   xxxx      xx   xxxx      xx   xxxx     xxx   xxx       x    xxx   
    xxxxxxxx       xxxxxxxx       xxxxxxxx       xxxxxxxx      xxx    xx   
   xxxxxxx        xxxxxxx        xxxxxxx          xxxxx                    
                                                    xxxx                   
                                                      xxx    
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.dueling_net(feat)
        return tf.argmax(q, axis=-1), cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights)
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q = self.dueling_net(feat)
                q_eval = tf.reduce_sum(tf.multiply(q, a), axis=1, keepdims=True)
                next_q = self.dueling_net(feat_)
                next_max_action = tf.argmax(next_q, axis=1, name='next_action_int')
                next_max_action_one_hot = tf.one_hot(next_max_action, self.a_dim, 1., 0., dtype=tf.float32)
                q_target = self.dueling_target_net(feat_)

                q_target_next_max = tf.reduce_sum(
                    tf.multiply(q_target, next_max_action_one_hot),
                    axis=1, keepdims=True)
                q_target = tf.stop_gradient(r + self.gamma * (1 - done) * q_target_next_max)
                td_error = q_eval - q_target
                q_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(
                zip(grads, self.critic_tv)
            )
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
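
The update above hinges on the Double-DQN target: the online dueling network selects argmax_a Q(s', a) and the target network evaluates that action. A small self-contained NumPy illustration of this decoupled target (function and variable names here are illustrative, not part of the class):

import numpy as np

def double_dqn_target(r, done, q_online_next, q_target_next, gamma=0.99):
    # r, done: [B, 1]; q_online_next, q_target_next: [B, A]
    a_star = np.argmax(q_online_next, axis=1)                      # action picked by the online net
    q_sel = np.take_along_axis(q_target_next, a_star[:, None], 1)  # value supplied by the target net
    return r + gamma * (1.0 - done) * q_sel

r = np.array([[1.0], [0.0]])
done = np.array([[0.0], [1.0]])
q_online_next = np.array([[0.2, 0.9], [0.5, 0.1]])
q_target_next = np.array([[0.3, 0.7], [0.4, 0.2]])
print(double_dqn_target(r, done, q_online_next, q_target_next))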
Example #11
class DQN(make_off_policy_class(mode='share')):
    '''
    Deep Q-learning Network, DQN, [2013](https://arxiv.org/pdf/1312.5602.pdf), [2015](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
    DQN + LSTM, https://arxiv.org/abs/1507.06527
    '''
    def __init__(self,
                 s_dim: Union[int, np.ndarray],
                 visual_sources: Union[int, np.ndarray],
                 visual_resolution: Union[List, np.ndarray],
                 a_dim: Union[int, np.ndarray],
                 is_continuous: Union[bool, np.ndarray],
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 hidden_units: List[int] = [32, 32],
                 **kwargs):
        assert not is_continuous, 'dqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))

    def show_logo(self) -> NoReturn:
        self.logger.info('''
       xxxxxxxx         xxxxxx      xxxx   xxxx  
        xxxxxxxx       xxx xxxx       xxx    x   
        xx    xxx     xxx   xxxx      xxxx   x   
        xx    xxx     xxx    xxx      xxxxx  x   
        xx     xx     xx     xxx      x xxxx x   
        xx     xx     xxx    xxx      x  xxxxx   
        xx    xxx     xxx    xxx      x   xxxx   
        xx   xxxx     xxx   xxx       x    xxx   
        xxxxxxxx       xxxxxxxx      xxx    xx   
       xxxxxxx          xxxxx                    
                          xxxx                   
                            xxx
        ''')

    def choose_action(self,
                      s: np.ndarray,
                      visual_s: np.ndarray,
                      evaluation: bool = False) -> np.ndarray:
        if np.random.uniform() < self.expl_expt_mng.get_esp(
                self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q_values = self.q_net(feat)
        return tf.argmax(q_values, axis=1), cell_state

    def learn(self, **kwargs) -> NoReturn:
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights,
                                          self.q_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q = self.q_net(feat)
                q_next = self.q_target_net(feat_)
                q_eval = tf.reduce_sum(tf.multiply(q, a),
                                       axis=1,
                                       keepdims=True)
                q_target = tf.stop_gradient(
                    r + self.gamma *
                    (1 - done) * tf.reduce_max(q_next, axis=1, keepdims=True))
                td_error = q_eval - q_target
                q_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
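
choose_action above blends random exploration with the greedy action according to ExplorationExploitationClass.get_esp, whose internals are not shown in this snippet. The sketch below assumes a simple two-phase schedule (linear from eps_init to eps_mid over init2mid_annealing_step, then on to eps_final); it is an illustrative guess, not the library's actual implementation:

import numpy as np

def epsilon(step, eps_init=1.0, eps_mid=0.2, eps_final=0.01,
            init2mid_annealing_step=1000, max_step=10000):
    # Hypothetical schedule: eps_init -> eps_mid, then eps_mid -> eps_final.
    if step < init2mid_annealing_step:
        frac = step / init2mid_annealing_step
        return eps_init + frac * (eps_mid - eps_init)
    frac = min(1.0, (step - init2mid_annealing_step) / max(1, max_step - init2mid_annealing_step))
    return eps_mid + frac * (eps_final - eps_mid)

rng = np.random.default_rng(0)
a_dim, n_agents = 4, 2
for step in (0, 500, 5000):
    if rng.uniform() < epsilon(step):
        a = rng.integers(0, a_dim, n_agents)  # explore, as in the random branch of choose_action
    else:
        a = np.zeros(n_agents, dtype=int)     # stand-in for the greedy tf.argmax branch
    print(step, round(epsilon(step), 3), a)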
Example #12
File: qrdqn.py    Project: ncepuwwy97/RLs
class QRDQN(make_off_policy_class(mode='share')):
    '''
    Quantile Regression DQN
    Distributional Reinforcement Learning with Quantile Regression, https://arxiv.org/abs/1710.10044
    No double, no dueling, no noisy net.
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 nums=20,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 hidden_units=[128, 128],
                 **kwargs):
        assert not is_continuous, 'qrdqn only support discrete action space'
        assert nums > 0
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.nums = nums
        self.huber_delta = huber_delta
        self.quantiles = tf.reshape(
            tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums),
                        dtype=tf.float32), [-1, self.nums])  # [1, N]
        self.batch_quantiles = tf.tile(self.quantiles,
                                       [self.a_dim, 1])  # [1, N] => [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net():
            return NetWork(self.feat_dim, self.a_dim, self.nums, hidden_units)

        self.q_dist_net = _net()
        self.q_target_dist_net = _net()
        self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_dist_net.weights,
                                  self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(
            dict(model=self.q_dist_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
     xxxxxx       xxxxxxx        xxxxxxxx         xxxxxx      xxxx   xxxx  
    xxx xxxx       xxxxxxx        xxxxxxxx       xxx xxxx       xxx    x   
   xxx   xxxx      xx  xxx        xx    xxx     xxx   xxxx      xxxx   x   
   xxx    xxx      xx  xxx        xx    xxx     xxx    xxx      xxxxx  x   
   xx     xxx      xxxxxx         xx     xx     xx     xxx      x xxxx x   
   xxx    xxx      xxxxxx         xx     xx     xxx    xxx      x  xxxxx   
   xxx    xxx      xx xxxx        xx    xxx     xxx    xxx      x   xxxx   
   xxx   xxx       xx  xxx        xx   xxxx     xxx   xxx       x    xxx   
    xxxxxxxx      xxxxx xxxx      xxxxxxxx       xxxxxxxx      xxx    xx   
     xxxxx        xxxxx xxxx     xxxxxxx          xxxxx                    
       xxxx                                         xxxx                   
         xxx                                          xxx     
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(
                self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q = self.get_q(feat)  # [B, A]
        return tf.argmax(q, axis=-1), cell_state  # [B, 1]

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_dist_net.weights,
                                          self.q_dist_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                indices = tf.reshape(tf.range(batch_size), [-1, 1])  # [B, 1]
                q_dist = self.q_dist_net(feat)  # [B, A, N]
                q_dist = tf.transpose(
                    tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a,
                                  axis=-1), [1, 0])  # [B, N]
                target_q_dist = self.q_target_dist_net(feat_)  # [B, A, N]
                target_q = tf.reduce_sum(self.batch_quantiles * target_q_dist,
                                         axis=-1)  # [B, A, N] => [B, A]
                a_ = tf.reshape(
                    tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32),
                    [-1, 1])  # [B, 1]
                target_q_dist = tf.gather_nd(target_q_dist,
                                             tf.concat([indices, a_], axis=-1))  # [B, N]
                target = tf.tile(r, tf.constant([1, self.nums])) \
                    + self.gamma * target_q_dist * (1.0 - tf.tile(done, tf.constant([1, self.nums])))  # [B, N]
                q_eval = tf.reduce_sum(q_dist * self.quantiles,
                                       axis=-1)  # [B, 1]
                q_target = tf.reduce_sum(target * self.quantiles,
                                         axis=-1)  # [B, 1]
                td_error = q_eval - q_target  # [B, 1]

                quantile_error = tf.expand_dims(
                    q_dist, axis=-1) - tf.expand_dims(
                        target, axis=1)  # [B, N, 1] - [B, 1, N] => [B, N, N]
                huber = huber_loss(quantile_error,
                                   delta=self.huber_delta)  # [B, N, N]
                huber_abs = tf.abs(
                    tf.reshape(self.quantiles, [1, self.nums, 1]) -
                    tf.where(quantile_error > 0, tf.ones_like(quantile_error),
                             tf.zeros_like(quantile_error))
                )  # |tau_i - 1{delta_ij < 0}|, delta = target - online; [1, N, 1] - [B, N, N] => [B, N, N]
                loss = tf.reduce_mean(huber_abs * huber,
                                      axis=-1)  # [B, N, N] => [B, N]
                loss = tf.reduce_sum(loss, axis=-1)  # [B, N] => [B, ]
                loss = tf.reduce_mean(loss * isw) + crsty_loss  # [B, ] => 1
            grads = tape.gradient(loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])

    @tf.function(experimental_relax_shapes=True)
    def get_q(self, feat):
        with tf.device(self.device):
            return tf.reduce_sum(self.batch_quantiles * self.q_dist_net(feat),
                                 axis=-1)  # [B, A, N] => [B, A]
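
The QR-DQN loss above pairs every online quantile with every target quantile and weights the Huber error by |τ − 1{δ < 0}|. The NumPy sketch below follows the paper's convention (δ = target − prediction), which differs in sign bookkeeping from the TensorFlow code above but shows the same penalty structure; names and shapes are chosen for illustration only:

import numpy as np

def quantile_huber_loss(theta, target, taus, kappa=1.0):
    # theta: [N] online quantile estimates, target: [M] target quantile samples, taus: [N] fractions.
    u = target[None, :] - theta[:, None]                    # [N, M] pairwise TD errors
    abs_u = np.abs(u)
    huber = np.where(abs_u <= kappa, 0.5 * u ** 2, kappa * (abs_u - 0.5 * kappa))
    weight = np.abs(taus[:, None] - (u < 0).astype(float))  # |tau_i - 1{u_ij < 0}|
    return np.sum(np.mean(weight * huber / kappa, axis=1))  # mean over target dim, sum over tau

N = 5
taus = (2 * np.arange(N) + 1) / (2.0 * N)        # same midpoints as self.quantiles above
theta = np.linspace(-1.0, 1.0, N)                # predicted quantiles for the taken action
target = 0.5 + 0.99 * np.linspace(-0.8, 1.2, N)  # r + gamma * target quantiles
print(quantile_huber_loss(theta, target, taus))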
Example #13
class IOC(make_off_policy_class(mode='share')):
    '''
    Learning Options with Interest Functions, https://www.aaai.org/ojs/index.php/AAAI/article/view/5114/4987 
    Options of Interest: Temporal Abstraction with Interest Functions, http://arxiv.org/abs/2001.00271
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            q_lr=5.0e-3,
            intra_option_lr=5.0e-4,
            termination_lr=5.0e-4,
            interest_lr=5.0e-4,
            boltzmann_temperature=1.0,
            options_num=4,
            ent_coff=0.01,
            double_q=False,
            use_baseline=True,
            terminal_mask=True,
            termination_regularizer=0.01,
            assign_interval=1000,
            hidden_units={
                'q': [32, 32],
                'intra_option': [32, 32],
                'termination': [32, 32],
                'interest': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature

        def _q_net():
            return Critic(self.feat_dim, self.options_num, hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.intra_option_net = OptionNet(self.feat_dim, self.a_dim,
                                          self.options_num,
                                          hidden_units['intra_option'])
        self.termination_net = Critic(self.feat_dim, self.options_num,
                                      hidden_units['termination'], 'sigmoid')
        self.interest_net = Critic(self.feat_dim, self.options_num,
                                   hidden_units['interest'], 'sigmoid')
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.actor_tv = self.intra_option_net.trainable_variables
        if self.is_continuous:
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(
                (self.options_num, self.a_dim), dtype=np.float32),
                                       trainable=True)  # [P, A]
            self.actor_tv += [self.log_std]
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)

        self.q_lr, self.intra_option_lr, self.termination_lr, self.interest_lr = map(
            self.init_lr, [q_lr, intra_option_lr, termination_lr, interest_lr])
        self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.)
        self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr,
                                                          clipvalue=5.)
        self.termination_optimizer = self.init_optimizer(self.termination_lr,
                                                         clipvalue=5.)
        self.interest_optimizer = self.init_optimizer(self.interest_lr,
                                                      clipvalue=5.)

        self.model_recorder(
            dict(q_net=self.q_net,
                 intra_option_net=self.intra_option_net,
                 termination_net=self.termination_net,
                 interest_net=self.interest_net,
                 q_optimizer=self.q_optimizer,
                 intra_option_optimizer=self.intra_option_optimizer,
                 termination_optimizer=self.termination_optimizer,
                 interest_optimizer=self.interest_optimizer))

    def show_logo(self):
        self.logger.info('''
      xxxx          xxxxxx         xxxxxxx   
       xx          xxx xxxx       xxxx xxx   
       xx         xxx   xxx      xxxx    x   
       xx         xx     xxx     xxx     x   
       xx         xx     xxx     xxx         
       xx         xx     xxx     xxx         
       xx         xx     xxx     xxx         
       xx         xxx   xxx       xxx    x   
      xxxx         xxxxxxxx       xxxxxxxx   
                    xxxxx           xxxxx 
        ''')

    def _generate_random_options(self):
        return tf.constant(np.random.randint(0, self.options_num,
                                             self.n_agents),
                           dtype=tf.int32)

    def choose_action(self, s, visual_s, evaluation=False):
        if not hasattr(self, 'options'):
            self.options = self._generate_random_options()
        self.last_options = self.options

        a, self.options, self.cell_state = self._get_action(
            s, visual_s, self.cell_state, self.options)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state, options):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q = self.q_net(feat)  # [B, P]
            pi = self.intra_option_net(feat)  # [B, P, A]
            options_onehot = tf.one_hot(options,
                                        self.options_num,
                                        dtype=tf.float32)  # [B, P]
            options_onehot_expanded = tf.expand_dims(options_onehot,
                                                     axis=-1)  # [B, P, 1]
            pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = tf.math.tanh(pi)
                a, _ = gaussian_clip_rsample(mu, log_std)
            else:
                pi = pi / self.boltzmann_temperature
                dist = tfp.distributions.Categorical(logits=pi)  # [B, ]
                a = dist.sample()
            interests = self.interest_net(feat)  # [B, P]
            op_logits = interests * q  # [B, P] or tf.nn.softmax(q)
            new_options = tfp.distributions.Categorical(
                logits=op_logits).sample()
        return a, new_options, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights,
                                          self.q_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'last_options', 'options'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'last_options', 'options'],
                'summary_dict': dict([
                    ['LEARNING_RATE/q_lr', self.q_lr(self.train_step)],
                    ['LEARNING_RATE/intra_option_lr', self.intra_option_lr(self.train_step)],
                    ['LEARNING_RATE/termination_lr', self.termination_lr(self.train_step)],
                    ['Statistics/option', self.options[0]]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, last_options, options = memories
        last_options = tf.cast(last_options, tf.int32)
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q = self.q_net(feat)  # [B, P]
                pi = self.intra_option_net(feat)  # [B, P, A]
                beta = self.termination_net(feat)  # [B, P]
                q_next = self.q_target_net(feat_)  # [B, P], [B, P, A], [B, P]
                beta_next = self.termination_net(feat_)  # [B, P]
                interests = self.interest_net(feat)  # [B, P]
                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B,] => [B, P]

                q_s = qu_eval = tf.reduce_sum(q * options_onehot,
                                              axis=-1,
                                              keepdims=True)  # [B, 1]
                beta_s_ = tf.reduce_sum(beta_next * options_onehot,
                                        axis=-1,
                                        keepdims=True)  # [B, 1]
                q_s_ = tf.reduce_sum(q_next * options_onehot,
                                     axis=-1,
                                     keepdims=True)  # [B, 1]
                if self.double_q:
                    q_ = self.q_net(feat_)  # [B, P]; Double-Q: the online net on s_ selects the option
                    max_a_idx = tf.one_hot(
                        tf.argmax(q_, axis=-1),
                        self.options_num,
                        dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                    q_s_max = tf.reduce_sum(q_next * max_a_idx,
                                            axis=-1,
                                            keepdims=True)  # [B, 1]
                else:
                    q_s_max = tf.reduce_max(q_next, axis=-1,
                                            keepdims=True)  # [B, 1]
                u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
                qu_target = tf.stop_gradient(r + self.gamma *
                                             (1 - done) * u_target)
                td_error = qu_target - qu_eval  # gradient : q
                q_loss = tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss  # [B, 1] => 1

                if self.use_baseline:
                    adv = tf.stop_gradient(qu_target - qu_eval)
                else:
                    adv = tf.stop_gradient(qu_target)
                options_onehot_expanded = tf.expand_dims(
                    options_onehot, axis=-1)  # [B, P] => [B, P, 1]
                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = tf.math.tanh(pi)
                    log_p = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    pi = pi / self.boltzmann_temperature
                    log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                    entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi,
                                             axis=1,
                                             keepdims=True)  # [B, 1]
                    log_p = tf.reduce_sum(a * log_pi, axis=-1,
                                          keepdims=True)  # [B, 1]
                pi_loss = tf.reduce_mean(
                    -(log_p * adv + self.ent_coff * entropy)
                )  # [B, 1] * [B, 1] => [B, 1] => 1

                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]
                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]

                pi_op = tf.nn.softmax(
                    interests *
                    tf.stop_gradient(q))  # [B, P] or tf.nn.softmax(q)
                interest_loss = -tf.reduce_mean(beta_s * tf.reduce_sum(
                    pi_op * options_onehot, axis=-1, keepdims=True) *
                                                q_s)  # [B, 1] => 1

                v_s = tf.reduce_sum(q * pi_op, axis=-1,
                                    keepdims=True)  # [B, P] * [B, P] => [B, 1]
                beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
                if self.terminal_mask:
                    beta_loss *= (1 - done)
                beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

            q_grads = tape.gradient(q_loss, self.critic_tv)
            intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
            termination_grads = tape.gradient(
                beta_loss, self.termination_net.trainable_variables)
            interest_grads = tape.gradient(
                interest_loss, self.interest_net.trainable_variables)
            self.q_optimizer.apply_gradients(zip(q_grads, self.critic_tv))
            self.intra_option_optimizer.apply_gradients(
                zip(intra_option_grads, self.actor_tv))
            self.termination_optimizer.apply_gradients(
                zip(termination_grads,
                    self.termination_net.trainable_variables))
            self.interest_optimizer.apply_gradients(
                zip(interest_grads, self.interest_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/q_loss', tf.reduce_mean(q_loss)],
                ['LOSS/pi_loss', tf.reduce_mean(pi_loss)],
                ['LOSS/beta_loss', tf.reduce_mean(beta_loss)],
                ['LOSS/interest_loss', tf.reduce_mean(interest_loss)],
                ['Statistics/q_option_max', tf.reduce_max(q_s)],
                ['Statistics/q_option_min', tf.reduce_min(q_s)],
                ['Statistics/q_option_mean', tf.reduce_mean(q_s)]
            ])

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store_data requires the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store_data requires the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store_data requires done to be an np.ndarray"
        self._running_average(s)
        self.data.add(
            s,
            visual_s,
            a,
            r[:, np.newaxis],  # expand to [B, 1]
            s_,
            visual_s_,
            done[:, np.newaxis],  # expand to [B, 1]
            self.last_options,
            self.options)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        pass
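
What separates IOC from plain option-critic is the interest function: new options are sampled from a categorical distribution whose logits are interest(s, o) · Q(s, o), so options with low interest in a state are rarely chosen there. A toy NumPy sketch of that option-selection step (array values and the helper name are illustrative, not the framework's API):

import numpy as np

def sample_option(q_values, interests, rng):
    # q_values, interests: [P]. Sample an option from softmax(interest * Q),
    # the same combination used for op_logits in _get_action above.
    logits = interests * q_values
    logits = logits - logits.max()               # numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return rng.choice(len(q_values), p=probs), probs

rng = np.random.default_rng(0)
q = np.array([1.0, 2.0, 0.5, 1.5])               # option values Q(s, .)
interest = np.array([0.9, 0.1, 0.8, 0.5])        # interest function output in [0, 1]
opt, probs = sample_option(q, interest, rng)
print(opt, np.round(probs, 3))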
Example #14
class OC(make_off_policy_class(mode='share')):
    '''
    The Option-Critic Architecture. http://arxiv.org/abs/1609.05140
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 q_lr=5.0e-3,
                 intra_option_lr=5.0e-4,
                 termination_lr=5.0e-4,
                 use_eps_greedy=False,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 boltzmann_temperature=1.0,
                 options_num=4,
                 ent_coff=0.01,
                 double_q=False,
                 use_baseline=True,
                 terminal_mask=True,
                 termination_regularizer=0.01,
                 assign_interval=1000,
                 hidden_units={
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature
        self.use_eps_greedy = use_eps_greedy

        def _q_net():
            return Critic(self.feat_dim, self.options_num, hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.intra_option_net = OptionNet(self.feat_dim, self.a_dim,
                                          self.options_num,
                                          hidden_units['intra_option'])
        self.termination_net = Critic(self.feat_dim, self.options_num,
                                      hidden_units['termination'], 'sigmoid')
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.actor_tv = self.intra_option_net.trainable_variables
        if self.is_continuous:
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(
                (self.options_num, self.a_dim), dtype=np.float32),
                                       trainable=True)  # [P, A]
            self.actor_tv += [self.log_std]
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)

        self.q_lr, self.intra_option_lr, self.termination_lr = map(
            self.init_lr, [q_lr, intra_option_lr, termination_lr])
        self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.)
        self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr,
                                                          clipvalue=5.)
        self.termination_optimizer = self.init_optimizer(self.termination_lr,
                                                         clipvalue=5.)

        self.model_recorder(
            dict(q_net=self.q_net,
                 intra_option_net=self.intra_option_net,
                 termination_net=self.termination_net,
                 q_optimizer=self.q_optimizer,
                 intra_option_optimizer=self.intra_option_optimizer,
                 termination_optimizer=self.termination_optimizer))

    def show_logo(self):
        self.logger.info('''
     xxxxxx         xxxxxxx   
    xxx xxxx       xxxx xxx   
   xxx   xxx      xxxx    x   
   xx     xxx     xxx     x   
   xx     xxx     xxx         
   xx     xxx     xxx         
   xx     xxx     xxx         
   xxx   xxx       xxx    x   
    xxxxxxxx       xxxxxxxx   
     xxxxx           xxxxx
        ''')

    def _generate_random_options(self):
        return tf.constant(np.random.randint(0, self.options_num,
                                             self.n_agents),
                           dtype=tf.int32)

    def choose_action(self, s, visual_s, evaluation=False):
        if not hasattr(self, 'options'):
            self.options = self._generate_random_options()
        self.last_options = self.options

        a, self.options, self.cell_state = self._get_action(
            s, visual_s, self.cell_state, self.options)
        if self.use_eps_greedy:
            if np.random.uniform() < self.expl_expt_mng.get_esp(
                    self.train_step, evaluation=evaluation):  # epsilon greedy
                self.options = self._generate_random_options()
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state, options):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q = self.q_net(feat)  # [B, P]
            pi = self.intra_option_net(feat)  # [B, P, A]
            beta = self.termination_net(feat)  # [B, P]
            options_onehot = tf.one_hot(options,
                                        self.options_num,
                                        dtype=tf.float32)  # [B, P]
            options_onehot_expanded = tf.expand_dims(options_onehot,
                                                     axis=-1)  # [B, P, 1]
            pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = tf.math.tanh(pi)
                a, _ = gaussian_clip_rsample(mu, log_std)
            else:
                pi = pi / self.boltzmann_temperature
                dist = tfp.distributions.Categorical(logits=pi)  # [B, ]
                a = dist.sample()
            max_options = tf.cast(tf.argmax(q, axis=-1),
                                  dtype=tf.int32)  # [B, P] => [B, ]
            if self.use_eps_greedy:
                new_options = max_options
            else:
                beta_probs = tf.reduce_sum(beta * options_onehot,
                                           axis=1)  # [B, P] => [B,]
                beta_dist = tfp.distributions.Bernoulli(probs=beta_probs)
                new_options = tf.where(beta_dist.sample() < 1, options,
                                       max_options)
        return a, new_options, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights,
                                          self.q_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'last_options', 'options'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'last_options', 'options'],
                'summary_dict': dict([
                    ['LEARNING_RATE/q_lr', self.q_lr(self.train_step)],
                    ['LEARNING_RATE/intra_option_lr', self.intra_option_lr(self.train_step)],
                    ['LEARNING_RATE/termination_lr', self.termination_lr(self.train_step)],
                    ['Statistics/option', self.options[0]]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, last_options, options = memories
        last_options = tf.cast(last_options, tf.int32)
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q = self.q_net(feat)  # [B, P]
                pi = self.intra_option_net(feat)  # [B, P, A]
                beta = self.termination_net(feat)  # [B, P]
                q_next = self.q_target_net(feat_)  # [B, P], [B, P, A], [B, P]
                beta_next = self.termination_net(feat_)  # [B, P]
                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B,] => [B, P]

                q_s = qu_eval = tf.reduce_sum(q * options_onehot,
                                              axis=-1,
                                              keepdims=True)  # [B, 1]
                beta_s_ = tf.reduce_sum(beta_next * options_onehot,
                                        axis=-1,
                                        keepdims=True)  # [B, 1]
                q_s_ = tf.reduce_sum(q_next * options_onehot,
                                     axis=-1,
                                     keepdims=True)  # [B, 1]
                # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94
                if self.double_q:
                    q_ = self.q_net(feat_)  # [B, P]; Double-Q: the online net on s_ selects the option
                    max_a_idx = tf.one_hot(
                        tf.argmax(q_, axis=-1),
                        self.options_num,
                        dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                    q_s_max = tf.reduce_sum(q_next * max_a_idx,
                                            axis=-1,
                                            keepdims=True)  # [B, 1]
                else:
                    q_s_max = tf.reduce_max(q_next, axis=-1,
                                            keepdims=True)  # [B, 1]
                u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
                qu_target = tf.stop_gradient(r + self.gamma *
                                             (1 - done) * u_target)
                td_error = qu_target - qu_eval  # gradient : q
                q_loss = tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss  # [B, 1] => 1

                # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130
                if self.use_baseline:
                    adv = tf.stop_gradient(qu_target - qu_eval)
                else:
                    adv = tf.stop_gradient(qu_target)
                options_onehot_expanded = tf.expand_dims(
                    options_onehot, axis=-1)  # [B, P] => [B, P, 1]
                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = tf.math.tanh(pi)
                    log_p = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    pi = pi / self.boltzmann_temperature
                    log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                    entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi,
                                             axis=1,
                                             keepdims=True)  # [B, 1]
                    log_p = tf.reduce_sum(a * log_pi, axis=-1,
                                          keepdims=True)  # [B, 1]
                pi_loss = tf.reduce_mean(
                    -(log_p * adv + self.ent_coff * entropy)
                )  # [B, 1] * [B, 1] => [B, 1] => 1

                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]
                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]
                if self.use_eps_greedy:
                    v_s = tf.reduce_max(
                        q, axis=-1,
                        keepdims=True) - self.termination_regularizer  # [B, 1]
                else:
                    v_s = (1 - beta_s) * q_s + beta_s * tf.reduce_max(
                        q, axis=-1, keepdims=True)  # [B, 1]
                    # v_s = tf.reduce_mean(q, axis=-1, keepdims=True)   # [B, 1]
                beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
                # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238
                if self.terminal_mask:
                    beta_loss *= (1 - done)
                beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

            q_grads = tape.gradient(q_loss, self.critic_tv)
            intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
            termination_grads = tape.gradient(
                beta_loss, self.termination_net.trainable_variables)
            self.q_optimizer.apply_gradients(zip(q_grads, self.critic_tv))
            self.intra_option_optimizer.apply_gradients(
                zip(intra_option_grads, self.actor_tv))
            self.termination_optimizer.apply_gradients(
                zip(termination_grads,
                    self.termination_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/q_loss', tf.reduce_mean(q_loss)],
                ['LOSS/pi_loss', tf.reduce_mean(pi_loss)],
                ['LOSS/beta_loss', tf.reduce_mean(beta_loss)],
                ['Statistics/q_option_max', tf.reduce_max(q_s)],
                ['Statistics/q_option_min', tf.reduce_min(q_s)],
                ['Statistics/q_option_mean', tf.reduce_mean(q_s)]
            ])

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store_data requires the action to be an np.ndarray"
        assert isinstance(r, np.ndarray), "store_data requires the reward to be an np.ndarray"
        assert isinstance(done, np.ndarray), "store_data requires done to be an np.ndarray"
        self._running_average(s)
        self.data.add(
            s,
            visual_s,
            a,
            r[:, np.newaxis],  # expand to [B, 1]
            s_,
            visual_s_,
            done[:, np.newaxis],  # expand to [B, 1]
            self.last_options,
            self.options)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        pass
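
Two quantities drive the option-critic update above: the option-value target U(s', o) = (1 − β(s', o))·Q(s', o) + β(s', o)·max_o' Q(s', o'), and the termination loss β(s, o)·(Q(s, o) − V(s) + ξ), which lowers β where the current option is still better than the state value. A scalar Python illustration of both (function names and the sample numbers are illustrative only):

def option_value_target(r, done, q_next_o, q_next_max, beta_next_o, gamma=0.99):
    # One-step target for Q(s, o): bootstrap with U(s', o).
    u_next = (1.0 - beta_next_o) * q_next_o + beta_next_o * q_next_max
    return r + gamma * (1.0 - done) * u_next

def termination_loss(beta_s_o, q_s_o, v_s, xi=0.01):
    # Minimizing beta * (Q - V + xi) pushes beta down (the option keeps running) while the
    # option is still advantageous; xi plays the role of termination_regularizer.
    return beta_s_o * (q_s_o - v_s + xi)

print(option_value_target(r=1.0, done=0.0, q_next_o=0.8, q_next_max=1.2, beta_next_o=0.3))
print(termination_loss(beta_s_o=0.4, q_s_o=0.8, v_s=1.1))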
Example #15
class IQN(make_off_policy_class(mode='share')):
    '''
    Implicit Quantile Networks, https://arxiv.org/abs/1806.06923
    Double DQN
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 online_quantiles=8,
                 target_quantiles=8,
                 select_quantiles=32,
                 quantiles_idx=64,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 hidden_units={
                     'q_net': [128, 64],
                     'quantile': [128, 64],
                     'tile': [64]
                 },
                 **kwargs):
        assert not is_continuous, 'iqn only support discrete action space'
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.pi = tf.constant(np.pi)
        self.online_quantiles = online_quantiles
        self.target_quantiles = target_quantiles
        self.select_quantiles = select_quantiles
        self.quantiles_idx = quantiles_idx
        self.huber_delta = huber_delta
        self.assign_interval = assign_interval
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)

        def _net(): return NetWork(self.feat_dim, self.a_dim, self.quantiles_idx, hidden_units)

        self.q_net = _net()
        self.q_target_net = _net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.q_net,
            optimizer=self.optimizer
        ))

    def show_logo(self):
        self.logger.info('''
    xxxxxxxx       xxxxxxx       xxx    xxx  
    xxxxxxxx      xxxxxxxxx      xxxx   xxx  
      xxx         xxxx  xxxx     xxxxx  xxx  
      xxx         xxx    xxx     xxxxx  xxx  
      xxx        xxxx    xxx     xxxxxx xxx  
      xxx        xxxx    xxx     xxxxxxxxxx  
      xxx        xxxx    xxx     xxx xxxxxx  
      xxx         xxxx  xxxx     xxx xxxxxx  
    xxxxxxxx      xxxxxxxxx      xxx  xxxxx  
    xxxxxxxx       xxxxxxx       xxx   xxxx  
                      xxxx                   
                       xxxx                  
                        xxxx                          
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        batch_size = tf.shape(s)[0]
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            _, select_quantiles_tiled = self._generate_quantiles(   # [N*B, 64]
                batch_size=batch_size,
                quantiles_num=self.select_quantiles,
                quantiles_idx=self.quantiles_idx
            )
            _, q_values = self.q_net(feat, select_quantiles_tiled, quantiles_num=self.select_quantiles)  # [B, A]
        return tf.argmax(q_values, axis=-1), cell_state  # [B,]
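    # Action selection is greedy over q_values, which NetWork presumably averages
    # over the select_quantiles sampled taus, i.e. an estimate of
    # Q(s, a) = E_tau[Z_tau(s, a)].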

    @tf.function
    def _generate_quantiles(self, batch_size, quantiles_num, quantiles_idx):
        with tf.device(self.device):
            _quantiles = tf.random.uniform([batch_size * quantiles_num, 1], minval=0, maxval=1)  # [N*B, 1]
            _quantiles_tiled = tf.tile(_quantiles, [1, quantiles_idx])  # [N*B, 1] => [N*B, 64]
            _quantiles_tiled = tf.cast(tf.range(quantiles_idx), tf.float32) * self.pi * _quantiles_tiled  # pi * i * tau [N*B, 64] * [64, ] => [N*B, 64]
            _quantiles_tiled = tf.cos(_quantiles_tiled)   # [N*B, 64]
            _quantiles = tf.reshape(_quantiles, [batch_size, quantiles_num, 1])    # [N*B, 1] => [B, N, 1]
            return _quantiles, _quantiles_tiled
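
    # _generate_quantiles builds the IQN cosine embedding
    # phi_j(tau) = cos(pi * j * tau), j = 0 .. quantiles_idx - 1, so each sampled
    # tau in [0, 1) becomes a quantiles_idx-dimensional feature that is fed to
    # NetWork alongside the state feature. For example, tau = 0.25 with
    # quantiles_idx = 4 gives phi = [cos(0), cos(pi/4), cos(pi/2), cos(3*pi/4)]
    # = [1.0, 0.707, 0.0, -0.707].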

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                quantiles, quantiles_tiled = self._generate_quantiles(   # [B, N, 1], [N*B, 64]
                    batch_size=batch_size,
                    quantiles_num=self.online_quantiles,
                    quantiles_idx=self.quantiles_idx
                )
                quantiles_value, q = self.q_net(feat, quantiles_tiled, quantiles_num=self.online_quantiles)    # [N, B, A], [B, A]
                _a = tf.reshape(tf.tile(a, [self.online_quantiles, 1]), [self.online_quantiles, -1, self.a_dim])  # [B, A] => [N*B, A] => [N, B, A]
                quantiles_value = tf.reduce_sum(quantiles_value * _a, axis=-1, keepdims=True)   # [N, B, A] => [N, B, 1]
                q_eval = tf.reduce_sum(q * a, axis=-1, keepdims=True)  # [B, A] => [B, 1]

                _, select_quantiles_tiled = self._generate_quantiles(   # [N*B, 64]
                    batch_size=batch_size,
                    quantiles_num=self.select_quantiles,
                    quantiles_idx=self.quantiles_idx
                )
                _, q_values = self.q_net(feat_, select_quantiles_tiled, quantiles_num=self.select_quantiles)  # [B, A]
                next_max_action = tf.argmax(q_values, axis=-1)   # [B,]
                next_max_action = tf.one_hot(tf.squeeze(next_max_action), self.a_dim, 1., 0., dtype=tf.float32)  # [B, A]
                _next_max_action = tf.reshape(tf.tile(next_max_action, [self.target_quantiles, 1]), [self.target_quantiles, -1, self.a_dim])  # [B, A] => [N'*B, A] => [N', B, A]
                _, target_quantiles_tiled = self._generate_quantiles(   # [N'*B, 64]
                    batch_size=batch_size,
                    quantiles_num=self.target_quantiles,
                    quantiles_idx=self.quantiles_idx
                )

                target_quantiles_value, target_q = self.q_target_net(feat_, target_quantiles_tiled, quantiles_num=self.target_quantiles)  # [N', B, A], [B, A]
                target_quantiles_value = tf.reduce_sum(target_quantiles_value * _next_max_action, axis=-1, keepdims=True)   # [N', B, A] => [N', B, 1]
                target_q = tf.reduce_sum(target_q * next_max_action, axis=-1, keepdims=True)  # Double DQN: evaluate the online net's argmax action with the target net, [B, A] => [B, 1]
                q_target = tf.stop_gradient(r + self.gamma * (1 - done) * target_q)   # [B, 1]
                td_error = q_eval - q_target    # [B, 1]

                _r = tf.reshape(tf.tile(r, [self.target_quantiles, 1]), [self.target_quantiles, -1, 1])  # [B, 1] => [N'*B, 1] => [N', B, 1]
                _done = tf.reshape(tf.tile(done, [self.target_quantiles, 1]), [self.target_quantiles, -1, 1])    # [B, 1] => [N'*B, 1] => [N', B, 1]

                quantiles_value_target = tf.stop_gradient(_r + self.gamma * (1 - _done) * target_quantiles_value)   # [N', B, 1]
                quantiles_value_target = tf.transpose(quantiles_value_target, [1, 2, 0])    # [B, 1, N']
                quantiles_value_online = tf.transpose(quantiles_value, [1, 0, 2])   # [B, N, 1]
                quantile_error = quantiles_value_online - quantiles_value_target    # [B, N, 1] - [B, 1, N'] => [B, N, N']
                huber = huber_loss(quantile_error, delta=self.huber_delta)  # [B, N, N']
                huber_abs = tf.abs(quantiles - tf.where(quantile_error < 0, tf.ones_like(quantile_error), tf.zeros_like(quantile_error)))   # [B, N, 1] - [B, N, N'] => [B, N, N']
                loss = tf.reduce_mean(huber_abs * huber, axis=-1)  # [B, N, N'] => [B, N]
                loss = tf.reduce_sum(loss, axis=-1)  # [B, N] => [B, ]
                loss = tf.reduce_mean(loss * isw) + crsty_loss  # [B, ] => 1
            grads = tape.gradient(loss, self.critic_tv)
            self.optimizer.apply_gradients(
                zip(grads, self.critic_tv)
            )
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
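
The loss built in IQN.train above is the quantile regression Huber loss from the IQN paper: every pairwise error between an online quantile value and a bootstrapped target quantile value goes through a Huber function and is weighted by |tau - 1{error < 0}|, then averaged over target quantiles and summed over online quantiles. Below is a minimal, self-contained sketch of that loss; the function name is illustrative and the Huber term is inlined rather than calling the repository's huber_loss helper.

import tensorflow as tf

def quantile_huber_loss(quantiles, online_values, target_values, delta=1.0):
    # quantiles:     [B, N, 1]  sampled taus paired with the online quantile values
    # online_values: [B, N, 1]  quantile values of the taken action (online net)
    # target_values: [B, 1, N'] r + gamma * quantile values of a* (target net)
    error = online_values - target_values                          # [B, N, N']
    abs_error = tf.abs(error)
    huber = tf.where(abs_error <= delta,
                     0.5 * tf.square(error),
                     delta * (abs_error - 0.5 * delta))             # Huber term
    weight = tf.abs(quantiles - tf.cast(error < 0, tf.float32))     # |tau - 1{error<0}|
    loss = tf.reduce_mean(weight * huber, axis=-1)                  # mean over N'
    return tf.reduce_sum(loss, axis=-1)                             # sum over N => [B, ]

# Example shapes, e.g. B=4, N=N'=8:
# quantile_huber_loss(tf.random.uniform([4, 8, 1]),
#                     tf.random.normal([4, 8, 1]),
#                     tf.random.normal([4, 1, 8]))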
Example #16
File: ac.py  Project: ncepuwwy97/RLs
class AC(make_off_policy_class(mode='share')):
    # off-policy actor-critic
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            hidden_units={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)

        if self.is_continuous:
            self.actor_net = ActorCts(self.feat_dim, self.a_dim,
                                      hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 *
                                       np.ones(self.a_dim, dtype=np.float32),
                                       trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim,
                                      hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
        self.critic_net = Critic(self.feat_dim, self.a_dim,
                                 hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))

    def show_logo(self):
        self.logger.info('''
       xx           xxxxxx    
      xxx          xxx  xx    
      xxx          xx    xx   
      x xx         xx         
     xx xx        xxx         
     xxxxxx       xxx         
    xx   xx        xx    xx   
    xx   xx        xxx  xxx   
   xxx  xxxxx       xxxxxx     
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, _lp, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = a.numpy()
        self._log_prob = _lp.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
            else:
                logits = self.actor_net(feat)
                norm_dist = tfp.distributions.Categorical(logits=logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
        return sample_op, log_prob, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "store_data needs action of type np.ndarray"
        assert isinstance(
            r, np.ndarray), "store_data needs reward of type np.ndarray"
        assert isinstance(
            done, np.ndarray), "store_data needs done of type np.ndarray"
        self._running_average(s)
        old_log_prob = self._log_prob
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis], old_log_prob)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "no_op_store needs action of type np.ndarray"
        assert isinstance(
            r, np.ndarray), "no_op_store needs reward of type np.ndarray"
        assert isinstance(
            done, np.ndarray), "no_op_store needs done of type np.ndarray"
        self._running_average(s)
        old_log_prob = np.ones_like(r)
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis], old_log_prob[:, np.newaxis])

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: None,
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]
                ]),
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'old_log_prob'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'old_log_prob']
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    next_mu = self.actor_net(feat_)
                    max_q_next = tf.stop_gradient(
                        self.critic_net(feat_, next_mu))
                else:
                    logits = self.actor_net(feat_)
                    max_a = tf.argmax(logits, axis=1)
                    max_a_one_hot = tf.one_hot(max_a,
                                               self.a_dim,
                                               dtype=tf.float32)
                    max_q_next = tf.stop_gradient(
                        self.critic_net(feat_, max_a_one_hot))
                q = self.critic_net(feat, a)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                             axis=1,
                                             keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q = self.critic_net(feat, a)
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_max', tf.reduce_max(q)],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/ratio', tf.reduce_mean(ratio)],
                ['Statistics/entropy', entropy]
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    next_mu = self.actor_net(feat_)
                    max_q_next = tf.stop_gradient(
                        self.critic_net(feat_, next_mu))
                    mu = self.actor_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat_)
                    max_a = tf.argmax(logits, axis=1)
                    max_a_one_hot = tf.one_hot(max_a, self.a_dim)
                    max_q_next = tf.stop_gradient(
                        self.critic_net(feat_, max_a_one_hot))
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                             axis=1,
                                             keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q = self.critic_net(feat, a)
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(
                    tf.square(td_error) * isw) + crsty_loss
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_max', tf.reduce_max(q)],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/ratio', tf.reduce_mean(ratio)],
                ['Statistics/entropy', entropy]
            ])
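
AC.train (and its train_persistent variant) updates the critic toward a one-step TD target and updates the actor with an importance-weighted policy gradient: the replayed action's log-probability under the current policy is scaled by the ratio pi(a|s) / pi_old(a|s), reconstructed from the stored old_log_prob, and by the critic's Q estimate, both wrapped in stop_gradient. A minimal, self-contained sketch of the discrete-action actor loss follows; the function and argument names are illustrative, not from the repository.

import tensorflow as tf

def off_policy_actor_loss(logits, a_one_hot, old_log_prob, q_value):
    # logits:       [B, A] current policy logits
    # a_one_hot:    [B, A] replayed actions, one-hot encoded
    # old_log_prob: [B, 1] log pi_old(a|s) recorded when the action was collected
    # q_value:      [B, 1] critic estimate Q(s, a)
    logp_all = tf.nn.log_softmax(logits)                                   # [B, A]
    log_prob = tf.reduce_sum(logp_all * a_one_hot, axis=1, keepdims=True)  # log pi(a|s)
    ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))              # pi / pi_old
    return -tf.reduce_mean(ratio * log_prob * tf.stop_gradient(q_value))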