Example #1
class PG(make_on_policy_class(mode='share')):
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 epoch=5,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.epoch = epoch
        # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1])
        if self.is_continuous:
            self.net = rls.actor_mu(self.feat_dim, self.a_dim,
                                    hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 *
                                       np.ones(self.a_dim, dtype=np.float32),
                                       trainable=True)
            self.net_tv = self.net.trainable_variables + [self.log_std
                                                          ] + self.other_tv
        else:
            self.net = rls.actor_discrete(self.feat_dim, self.a_dim,
                                          hidden_units['actor_discrete'])
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(model=self.net, optimizer=self.optimizer))

        self.initialize_data_buffer()

    def show_logo(self):
        self.recorder.logger.info('''
   xxxxxxxx        xxxxxx     
     xx  xx       xxx  xx     
     x   xxx      xx    x     
     x   xxx      xx          
     xxxxxx       x   xxxxx   
     x            xx   xxx    
     x            xx    x     
     x            xxx  xx     
   xxxxx           xxxxxx     
                     xx       
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu = self.net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
            else:
                logits = self.net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
        return sample_op, cell_state

    def calculate_statistics(self):
        self.data.cal_dc_r(self.gamma, 0., normalize=True)

    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(data, crsty_loss, cell_state):
            for _ in range(self.epoch):
                loss, entropy = self.train(data, crsty_loss, cell_state)
            summaries = dict([['LOSS/loss', loss],
                              ['Statistics/entropy', entropy]])
            return summaries

        self._learn(
            function_dict={
                'calculate_statistics': self.calculate_statistics,
                'train_function': _train,
                'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward'],
                'summary_dict': dict(
                    [['LEARNING_RATE/lr',
                      self.lr(self.episode)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu = self.net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                loss = -tf.reduce_mean(log_act_prob * dc_r) + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, entropy
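Below is a minimal, self-contained sketch of the REINFORCE-style objective that the train method above computes in its discrete branch. It assumes TensorFlow 2, and the batch values (logits, actions_onehot, returns) are made-up placeholders rather than anything produced by the RLs data pipeline.

# Minimal sketch, assuming TF2 and a toy batch of one-hot actions and discounted returns.
import tensorflow as tf

batch, a_dim = 4, 3
logits = tf.random.normal((batch, a_dim))                     # policy logits, [B, A]
actions_onehot = tf.one_hot([0, 2, 1, 0], a_dim)              # taken actions, [B, A]
returns = tf.constant([[1.0], [0.5], [-0.2], [0.8]])          # discounted returns, [B, 1]

logp_all = tf.nn.log_softmax(logits)                          # log pi(.|s), [B, A]
log_act_prob = tf.reduce_sum(actions_onehot * logp_all, axis=1, keepdims=True)  # log pi(a|s), [B, 1]
entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
loss = -tf.reduce_mean(log_act_prob * returns)                # REINFORCE objective (to be minimized)
print(float(loss), float(entropy))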
Example #2
File: cem.py Project: yyht/RLs
class CEM(make_on_policy_class(mode='share')):
    '''
    Cross-Entropy Method
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            hidden_units=[32, 32],
            frac=0.2,
            init_var=1,
            extra_std=1,
            extra_decay_eps=200,
            extra_var_last_multiplier=0.2,
            envs_per_popu=5,  # number of environments / number of models; the remainder must be 0
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.frac = frac
        self.hidden_units = hidden_units
        self.init_var = init_var
        self.extra_std = extra_std
        self.extra_decay_eps = extra_decay_eps
        self.envs_per_popu = envs_per_popu
        self.extra_var_last_multiplier = extra_var_last_multiplier

    def show_logo(self):
        self.recorder.logger.info('''
     xxxxxxx      xxxxxxxx      xxxx    xxxx 
    xxxx xxx       xxx  xx       xxx    xxx  
   xxxx    x       xxx   x       xxxx  xxxx  
   xxx     x       xxx  x        xxxx  xxxx  
   xxx             xxxxxx        x xx xxxxx  
   xxx             xxx  x        x xxxxxxxx  
   xxx             xxx  x x      x  xxx xxx  
    xxx    x       xxx   xx      x  xxx xxx  
    xxxxxxxx      xxxxxxxx      xxxxxx xxxxx 
      xxxxx                                  
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        self._check_agents(s)
        a = [
            model(s_).numpy() for model, s_ in zip(
                self.cem_models, np.split(s, self.populations, axis=0))
        ]
        if self.is_continuous:
            a = np.vstack(a)
        else:
            a = np.hstack(a)
        return a

    @tf.function
    def _get_action(self, s, visual_s):
        s, visual_s = self.cast(s, visual_s)
        with tf.device(self.device):
            pass

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        self.returns += r * (1 - self.dones)
        self.dones += done
        pass

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        rets = self.returns.reshape(-1, self.envs_per_popu).mean(axis=-1)
        elites_idxs = rets.argsort()[-self.n_elite:]
        elites_weights = np.array(self.models_weights)[elites_idxs, :]
        self.mu = np.mean(elites_weights, axis=0)
        self.sigma = np.var(elites_weights, axis=0)
        self._update_models_weights()
        self._reset_variables()
        self.write_training_summaries(
            self.train_step,
            dict([['Statistics/mu', self.mu.mean()],
                  ['Statistics/sigma', self.sigma.mean()],
                  ['Statistics/sample_std',
                   self.sample_std.mean()]]))

    def _check_agents(self, s):
        '''
        Assign the population-count attribute to the instance and initialize variables.
        params: state list s, containing one state vector per agent in an environment
        '''
        if not hasattr(self, 'populations'):
            assert s.shape[0] % self.envs_per_popu == 0, \
                'the number of environments must be divisible by envs_per_popu'
            self.populations = int(s.shape[0] / self.envs_per_popu)
            self._build()

    def _build(self):
        '''
        Build the concrete models and initialize variables.
        '''
        self.n_elite = max(int(np.round(self.populations * self.frac)), 1)
        self.cem_models = [
            Model(self.s_dim, self.a_dim, self.hidden_units,
                  self.is_continuous) for i in range(self.populations)
        ]
        self.mu = np.random.randn(self.cem_models[0].weights_total_nums)
        self.sigma = np.ones(
            self.cem_models[0].weights_total_nums) * self.init_var
        self._update_models_weights()
        self._reset_variables()

    def _reset_variables(self):
        '''
        Reset the return list and the done-flag list.
        '''
        self.returns = np.zeros(self.populations * self.envs_per_popu,
                                dtype=np.float32)
        self.dones = np.full(self.populations * self.envs_per_popu, False)

    def _update_models_weights(self):
        '''
        Re-assign sampled parameters to the models.
        '''
        extra_var_multiplier = max(
            (1.0 - self.train_step / self.extra_decay_eps),
            self.extra_var_last_multiplier)
        self.sample_std = np.sqrt(self.sigma + np.square(self.extra_std) *
                                  extra_var_multiplier)
        self.models_weights = [
            self.mu + self.sample_std * np.random.randn(self.mu.shape[0])
            for i in range(self.populations)
        ]
        [m.set_wb(wb) for m, wb in zip(self.cem_models, self.models_weights)]
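Below is a minimal NumPy sketch of the elite-refit step that learn and _update_models_weights perform above. The population size, parameter dimension, and toy fitness function are illustrative assumptions, not part of the project.

# Minimal sketch, assuming NumPy only; the reward model is a toy quadratic with its peak at w == 1.
import numpy as np

populations, n_params, frac = 10, 6, 0.2
mu, sigma = np.zeros(n_params), np.ones(n_params)

weights = [mu + np.sqrt(sigma) * np.random.randn(n_params) for _ in range(populations)]
returns = np.array([-np.sum(np.square(w - 1.0)) for w in weights])    # toy fitness per member

n_elite = max(int(np.round(populations * frac)), 1)
elite_idxs = returns.argsort()[-n_elite:]                             # best-performing members
elite_weights = np.array(weights)[elite_idxs, :]

mu = elite_weights.mean(axis=0)                                       # refit the sampling distribution
sigma = elite_weights.var(axis=0)
print(mu.round(2), sigma.round(2))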
Example #3
File: aoc.py Project: yyht/RLs
class AOC(make_on_policy_class(mode='share')):
    '''
    Asynchronous Advantage Option-Critic with Deliberation Cost (A2OC)
    When Waiting is not an Option: Learning Options with a Deliberation Cost, http://arxiv.org/abs/1709.04571
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            options_num=4,
            dc=0.01,
            terminal_mask=False,
            eps=0.1,
            epoch=4,
            pi_beta=1.0e-3,
            lr=5.0e-4,
            lambda_=0.95,
            epsilon=0.2,
            value_epsilon=0.2,
            kl_reverse=False,
            kl_target=0.02,
            kl_target_cutoff=2,
            kl_target_earlystop=4,
            kl_beta=[0.7, 1.3],
            kl_alpha=1.5,
            kl_coef=1.0,
            hidden_units={
                'share': [32, 32],
                'q': [32, 32],
                'intra_option': [32, 32],
                'termination': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.pi_beta = pi_beta
        self.epoch = epoch
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.value_epsilon = value_epsilon
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)

        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        self.options_num = options_num
        self.dc = dc
        self.terminal_mask = terminal_mask
        self.eps = eps

        self.net = rls.aoc_share(self.feat_dim, self.a_dim, self.options_num,
                                 hidden_units, self.is_continuous)
        if self.is_continuous:
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(
                (self.options_num, self.a_dim), dtype=np.float32),
                                       trainable=True)  # [P, A]
            self.net_tv = self.net.trainable_variables + [self.log_std
                                                          ] + self.other_tv
        else:
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.net, optimizer=self.optimizer))

        self.initialize_data_buffer(data_name_list=[
            's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
            'log_prob', 'beta_adv', 'last_options', 'options'
        ])

    def show_logo(self):
        self.recorder.logger.info('''
       xx           xxxxx          xxxxx          xxxxxx    
      xxx          xxx xxx         xx xxx        xxx  xx    
      xxx          xx   xx         xx xxx        xx    xx   
      x xx         xx   xxx           xx         xx         
     xx xx        xxx   xxx          xxx        xxx         
     xxxxxx        xx   xxx          xx         xxx         
    xx   xx        xx   xx          xx           xx    xx   
    xx   xx        xx  xxx         xx  x         xxx  xxx   
   xxx  xxxxx       xxxxx         xxxxxx          xxxxxx 
        ''')

    def reset(self):
        super().reset()
        self._done_mask = np.full(self.n_agents, True)

    def partial_reset(self, done):
        super().partial_reset(done)
        self._done_mask = done

    def _generate_random_options(self):
        return tf.constant(np.random.randint(0, self.options_num,
                                             self.n_agents),
                           dtype=tf.int32)

    def choose_action(self, s, visual_s, evaluation=False):
        if not hasattr(self, 'options'):
            self.options = self._generate_random_options()
        self.last_options = self.options
        if not hasattr(self, 'oc_mask'):
            self.oc_mask = tf.constant(np.zeros(self.n_agents), dtype=tf.int32)

        a, value, log_prob, beta_adv, new_options, max_options, self.cell_state = self._get_action(
            s, visual_s, self.cell_state, self.options)
        a = a.numpy()
        new_options = tf.where(self._done_mask, max_options, new_options)
        self._done_mask = np.full(self.n_agents, False)
        self._value = np.squeeze(value.numpy())
        self._log_prob = np.squeeze(log_prob.numpy()) + 1e-10
        self._beta_adv = np.squeeze(beta_adv.numpy()) + self.dc
        self.oc_mask = (
            new_options == self.options).numpy()  # equal means no change
        self.options = new_options
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state, options):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q, pi, beta = self.net(feat)  # [B, P], [B, P, A], [B, P]
            options_onehot = tf.one_hot(options,
                                        self.options_num,
                                        dtype=tf.float32)  # [B, P]
            options_onehot_expanded = tf.expand_dims(options_onehot,
                                                     axis=-1)  # [B, P, 1]
            pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = pi
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
            else:
                logits = pi
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
            q_o = tf.reduce_sum(q * options_onehot, axis=-1)  # [B, ]
            beta_adv = q_o - ((1 - self.eps) * tf.reduce_max(q, axis=-1) +
                              self.eps * tf.reduce_mean(q, axis=-1))  # [B, ]
            max_options = tf.cast(tf.argmax(q, axis=-1),
                                  dtype=tf.int32)  # [B, P] => [B, ]
            beta_probs = tf.reduce_sum(beta * options_onehot,
                                       axis=1)  # [B, P] => [B,]
            beta_dist = tfp.distributions.Bernoulli(probs=beta_probs)
                new_options = tf.where(beta_dist.sample() < 1, options,
                                       max_options)  # <1: keep the current option, ==1: switch the option
        return sample_op, q_o, log_prob, beta_adv, new_options, max_options, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(
            done, np.ndarray), "store_data need done type is np.ndarray"
        self._running_average(s)
        r -= (1 - self.oc_mask) * self.dc
        self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value,
                      self._log_prob, self._beta_adv, self.last_options,
                      self.options)
        self.oc_mask = tf.zeros_like(self.oc_mask)

    @tf.function
    def _get_value(self, feat, options):
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            options_onehot = tf.one_hot(options,
                                        self.options_num,
                                        dtype=tf.float32)  # [B, P]
            q, _, _ = self.net(feat)
            q_o = tf.reduce_sum(q * options_onehot, axis=-1)  # [B, ]
            return q_o

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(
            self._get_value(feat, self.data.buffer['options'][-1]).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            early_step = 0
            for i in range(self.epoch):
                loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train_share(
                    data, self.kl_coef, crsty_loss, cell_state)
                if kl > self.kl_stop:
                    early_step = i
                    break

            if kl > self.kl_high:
                self.kl_coef *= self.kl_alpha
            elif kl < self.kl_low:
                self.kl_coef /= self.kl_alpha

            summaries = dict([
                ['LOSS/loss', loss],
                ['LOSS/pi_loss', pi_loss],
                ['LOSS/q_loss', q_loss],
                ['LOSS/beta_loss', beta_loss],
                ['Statistics/kl', kl],
                ['Statistics/entropy', entropy],
                ['Statistics/kl_coef', self.kl_coef],
                ['Statistics/early_step', early_step],
            ])
            return summaries

        summary_dict = dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])

        self._learn(
            function_dict={
                'calculate_statistics':
                self.calculate_statistics,
                'train_function':
                _train,
                'train_data_list': [
                    's', 'visual_s', 'a', 'discounted_reward', 'log_prob',
                    'gae_adv', 'value', 'beta_adv', 'last_options', 'options'
                ],
                'summary_dict':
                summary_dict
            })

    @tf.function(experimental_relax_shapes=True)
    def train_share(self, memories, kl_coef, crsty_loss, cell_state):
        s, visual_s, a, dc_r, old_log_prob, advantage, old_value, beta_advantage, last_options, options = memories
        last_options = tf.reshape(tf.cast(last_options, tf.int32),
                                  (-1, ))  # [B, 1] => [B,]
        options = tf.reshape(tf.cast(options, tf.int32), (-1, ))
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                q, pi, beta = self.net(
                    feat)  # [B, P], [B, P, A], [B, P]

                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B, P]
                options_onehot_expanded = tf.expand_dims(options_onehot,
                                                         axis=-1)  # [B, P, 1]
                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]

                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                value = tf.reduce_sum(q * options_onehot,
                                      axis=1,
                                      keepdims=True)  # [B, 1]

                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = pi  # [B, A]
                    new_log_prob = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = pi  # [B, A]
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)

                if self.kl_reverse:
                    kl = tf.reduce_mean(new_log_prob - old_log_prob)
                else:
                    kl = tf.reduce_mean(
                        old_log_prob - new_log_prob
                    )  # a sample estimate for KL-divergence, easy to compute
                surrogate = ratio * advantage

                value_clip = old_value + tf.clip_by_value(
                    value - old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error),
                                       tf.square(td_error_clip))

                pi_loss = -tf.reduce_mean(
                    tf.minimum(
                        surrogate,
                        tf.clip_by_value(ratio, 1.0 - self.epsilon,
                                         1.0 + self.epsilon) * advantage))
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(
                    tf.maximum(0., kl - self.kl_cutoff))
                pi_loss = pi_loss + kl_loss + extra_loss
                q_loss = 0.5 * tf.reduce_mean(td_square)

                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]
                beta_loss = tf.reduce_mean(beta_s * beta_advantage)
                if self.terminal_mask:
                    beta_loss *= (1 - done)

                loss = pi_loss + 1.0 * q_loss + beta_loss - self.pi_beta * entropy + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, pi_loss, q_loss, beta_loss, entropy, kl
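Below is a minimal sketch (TensorFlow 2 plus TensorFlow Probability, toy shapes) of the option-termination rule used in _get_action above: terminate the current option with probability beta, then jump to the greedy option. All tensors are made-up placeholders.

# Minimal sketch, assuming TF2 and tensorflow_probability; q, beta, and options are toy values.
import tensorflow as tf
import tensorflow_probability as tfp

batch, options_num, eps = 4, 3, 0.1
q = tf.random.normal((batch, options_num))                        # option values Q(s, .), [B, P]
beta = tf.random.uniform((batch, options_num))                    # termination probabilities, [B, P]
options = tf.constant([0, 1, 2, 0], dtype=tf.int32)               # current options, [B]

options_onehot = tf.one_hot(options, options_num, dtype=tf.float32)
q_o = tf.reduce_sum(q * options_onehot, axis=-1)                  # Q(s, o) of the current option, [B]
beta_adv = q_o - ((1 - eps) * tf.reduce_max(q, axis=-1) + eps * tf.reduce_mean(q, axis=-1))
max_options = tf.cast(tf.argmax(q, axis=-1), tf.int32)            # greedy option, [B]

beta_o = tf.reduce_sum(beta * options_onehot, axis=-1)            # termination prob of the current option
terminate = tfp.distributions.Bernoulli(probs=beta_o).sample()    # 1 = terminate, 0 = keep
new_options = tf.where(terminate < 1, options, max_options)       # keep option unless it terminates
print(new_options.numpy(), beta_adv.numpy().round(2))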
Example #4
File: a2c.py Project: yyht/RLs
class A2C(make_on_policy_class(mode='share')):
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            epoch=5,
            beta=1.0e-3,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            hidden_units={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.beta = beta
        self.epoch = epoch

        # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1])
        if self.is_continuous:
            self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim,
                                          hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 *
                                       np.ones(self.a_dim, dtype=np.float32),
                                       trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim,
                                                hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
        self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))

        self.initialize_data_buffer()

    def show_logo(self):
        self.recorder.logger.info('''
       xx           xxxxx          xxxxxx    
      xxx           xx xxx        xxx  xx    
      xxx           xx xxx        xx    xx   
      x xx             xx         xx         
     xx xx            xxx        xxx         
     xxxxxx           xx         xxx         
    xx   xx          xx           xx    xx   
    xx   xx         xx  x         xxx  xxx   
   xxx  xxxxx      xxxxxx          xxxxxx    
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
            else:
                logits = self.actor_net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
        return sample_op, cell_state

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            for _ in range(self.epoch):
                actor_loss, critic_loss, entropy = self.train(
                    data, crsty_loss, cell_state)

            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/entropy', entropy],
            ])
            return summaries

        self._learn(
            function_dict={
                'calculate_statistics':
                self.calculate_statistics,
                'train_function':
                _train,
                'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward'],
                'summary_dict':
                dict([[
                    'LEARNING_RATE/actor_lr',
                    self.actor_lr(self.train_step)
                ], [
                    'LEARNING_RATE/critic_lr',
                    self.critic_lr(self.train_step)
                ]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                v = self.critic_net(feat)
                td_error = dc_r - v
                critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                v = self.critic_net(feat)
                advantage = tf.stop_gradient(dc_r - v)
                actor_loss = -(tf.reduce_mean(log_act_prob * advantage) +
                               self.beta * entropy)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return actor_loss, critic_loss, entropy

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                v = self.critic_net(feat)
                advantage = tf.stop_gradient(dc_r - v)
                td_error = dc_r - v
                critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
                actor_loss = -(tf.reduce_mean(log_act_prob * advantage) +
                               self.beta * entropy)
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return actor_loss, critic_loss, entropy
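Below is a minimal sketch of the separate critic and actor updates that train performs above, using plain Keras dense networks in place of the rls builders. Shapes, hyperparameters, and the batch data are illustrative assumptions.

# Minimal sketch, assuming TF2/Keras; the networks and the batch are placeholders, not the rls builders.
import numpy as np
import tensorflow as tf

feat_dim, a_dim, batch = 8, 3, 16
actor = tf.keras.Sequential([tf.keras.layers.Dense(32, activation='tanh'), tf.keras.layers.Dense(a_dim)])
critic = tf.keras.Sequential([tf.keras.layers.Dense(32, activation='tanh'), tf.keras.layers.Dense(1)])
opt_a, opt_c = tf.keras.optimizers.Adam(5e-4), tf.keras.optimizers.Adam(1e-3)

feat = tf.random.normal((batch, feat_dim))                        # extracted features
a = tf.one_hot(np.random.randint(0, a_dim, batch), a_dim)         # taken actions, one-hot
dc_r = tf.random.normal((batch, 1))                               # discounted returns

with tf.GradientTape() as tape:                                   # critic: regress V(s) toward the return
    v = critic(feat)
    critic_loss = tf.reduce_mean(tf.square(dc_r - v))
opt_c.apply_gradients(zip(tape.gradient(critic_loss, critic.trainable_variables), critic.trainable_variables))

with tf.GradientTape() as tape:                                   # actor: policy gradient with a value baseline
    logp_all = tf.nn.log_softmax(actor(feat))
    log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
    advantage = tf.stop_gradient(dc_r - critic(feat))
    actor_loss = -tf.reduce_mean(log_act_prob * advantage)
opt_a.apply_gradients(zip(tape.gradient(actor_loss, actor.trainable_variables), actor.trainable_variables))
print(float(actor_loss), float(critic_loss))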
Example #5
class TRPO(make_on_policy_class(mode='share')):
    '''
    Trust Region Policy Optimization, https://arxiv.org/abs/1502.05477
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            beta=1.0e-3,
            lr=5.0e-4,
            delta=0.01,
            lambda_=0.95,
            cg_iters=10,
            train_v_iters=10,
            damping_coeff=0.1,
            backtrack_iters=10,
            backtrack_coeff=0.8,
            epsilon=0.2,
            critic_lr=1e-3,
            hidden_units={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.beta = beta
        self.delta = delta
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.cg_iters = cg_iters
        self.damping_coeff = damping_coeff
        self.backtrack_iters = backtrack_iters
        self.backtrack_coeff = backtrack_coeff
        self.train_v_iters = train_v_iters

        # self.actor_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1])
        # self.critic_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [1])

        if self.is_continuous:
            self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim,
                                          hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 *
                                       np.ones(self.a_dim, dtype=np.float32),
                                       trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
            # self.Hx_TensorSpecs = [tf.TensorSpec(shape=flat_concat(self.actor_tv).shape, dtype=tf.float32)] \
            #     + get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [self.a_dim])
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim,
                                                hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
            # self.Hx_TensorSpecs = [tf.TensorSpec(shape=flat_concat(self.actor_tv).shape, dtype=tf.float32)] \
            #     + get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim])
        self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.critic_lr = self.init_lr(critic_lr)
        self.optimizer_critic = self.init_optimizer(self.critic_lr)

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.critic_net,
                 optimizer_critic=self.optimizer_critic))

        if self.is_continuous:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_mu', 'old_log_std'
            ]
        else:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_logp_all'
            ]
        self.initialize_data_buffer(data_name_list=data_name_list)

    def show_logo(self):
        self.recorder.logger.info('''
   xxxxxxxxx      xxxxxxxx       xxxxxxxx         xxxxx     
   xx  x  xx        xx xxx         xx  xx        xxx xxx    
   xx  x  xx        x   xxx        x   xxx       xx   xx    
       x            x   xx         x   xxx       xx   xxx   
       x            xxxxxx         xxxxxx       xxx   xxx   
       x            xx xxx         x             xx   xxx   
       x            x   xx         x             xx   xx    
       x            x   xxx        x             xx  xxx    
     xxxxx        xxxxx xxx      xxxxx            xxxxx 
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, _v, _lp, _morlpa, self.cell_state = self._get_action(
            s, visual_s, self.cell_state)
        a = a.numpy()
        self._value = np.squeeze(_v.numpy())
        self._log_prob = np.squeeze(_lp.numpy()) + 1e-10
        if self.is_continuous:
            self._mu = _morlpa.numpy()
        else:
            self._logp_all = _morlpa.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            value = self.critic_net(feat)
            if self.is_continuous:
                mu = self.actor_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
                return sample_op, value, log_prob, mu, cell_state
            else:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
                return sample_op, value, log_prob, logp_all, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(
            r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(
            done, np.ndarray), "store_data need done type is np.ndarray"
        self._running_average(s)
        if self.is_continuous:
            self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value,
                          self._log_prob, self._mu, self.log_std.numpy())
        else:
            self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value,
                          self._log_prob, self._logp_all)

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            if self.is_continuous:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_mu, old_log_std = data
                Hx_args = (s, visual_s, old_mu, old_log_std)
            else:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_logp_all = data
                Hx_args = (s, visual_s, old_logp_all)
            actor_loss, entropy, gradients = self.train_actor(
                (s, visual_s, a, old_log_prob, advantage), cell_state)

            x = self.cg(self.Hx, gradients.numpy(), Hx_args)
            x = tf.convert_to_tensor(x)
            alpha = np.sqrt(2 * self.delta /
                            (np.dot(x, self.Hx(x, *Hx_args)) + 1e-8))
            for i in range(self.backtrack_iters):
                assign_params_from_flat(alpha * x * (self.backtrack_coeff**i),
                                        self.actor_tv)

            for _ in range(self.train_v_iters):
                critic_loss = self.train_critic((s, visual_s, dc_r),
                                                crsty_loss, cell_state)

            summaries = dict([['LOSS/actor_loss', actor_loss],
                              ['LOSS/critic_loss', critic_loss],
                              ['Statistics/entropy', entropy]])
            return summaries

        if self.is_continuous:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob',
                'gae_adv', 'old_mu', 'old_log_std'
            ]
        else:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob',
                'gae_adv', 'old_logp_all'
            ]

        self._learn(
            function_dict={
                'calculate_statistics':
                self.calculate_statistics,
                'train_function':
                _train,
                'train_data_list':
                train_data_list,
                'summary_dict':
                dict([[
                    'LEARNING_RATE/critic_lr',
                    self.critic_lr(self.train_step)
                ]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train_actor(self, memories, cell_state):
        s, visual_s, a, old_log_prob, advantage = memories
        with tf.device(self.device):
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)
                actor_loss = -tf.reduce_mean(ratio * advantage)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            gradients = flat_concat(actor_grads)
            self.global_step.assign_add(1)
            return actor_loss, entropy, gradients

    @tf.function(experimental_relax_shapes=True)
    def Hx(self, x, *args):
        if self.is_continuous:
            s, visual_s, old_mu, old_log_std = args
        else:
            s, visual_s, old_logp_all = args
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat = self.get_feature(s, visual_s)
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    var0, var1 = tf.exp(2 * self.log_std), tf.exp(2 *
                                                                  old_log_std)
                    pre_sum = 0.5 * (
                        ((old_mu - mu)**2 + var0) /
                        (var1 + 1e-8) - 1) + old_log_std - self.log_std
                    all_kls = tf.reduce_sum(pre_sum, axis=1)
                    kl = tf.reduce_mean(all_kls)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    all_kls = tf.reduce_sum(tf.exp(old_logp_all) *
                                            (old_logp_all - logp_all),
                                            axis=1)
                    kl = tf.reduce_mean(all_kls)

                g = flat_concat(tape.gradient(kl, self.actor_tv))
                _g = tf.reduce_sum(g * x)
            hvp = flat_concat(tape.gradient(_g, self.actor_tv))
            if self.damping_coeff > 0:
                hvp += self.damping_coeff * x
            return hvp

    @tf.function(experimental_relax_shapes=True)
    def train_critic(self, memories, crsty_loss, cell_state):
        s, visual_s, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                value = self.critic_net(feat)
                td_error = dc_r - value
                value_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
            critic_grads = tape.gradient(value_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            return value_loss

    def cg(self, Ax, b, args):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy()  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(self.cg_iters):
            z = Ax(tf.convert_to_tensor(p), *args)
            alpha = r_dot_old / (np.dot(p, z) + 1e-8)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x
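Below is the same conjugate-gradient routine as cg above, exercised on an explicit symmetric positive-definite matrix instead of the Fisher-vector product Hx, and checked against a dense solve. The matrix and tolerance are illustrative assumptions.

# Minimal sketch, assuming NumPy; Ax is a dense SPD matrix here rather than a Hessian-vector product.
import numpy as np

def cg(Ax, b, iters=10):
    x = np.zeros_like(b)
    r = b.copy()                       # residual b - A x, with x = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

rng = np.random.default_rng(0)
M = rng.normal(size=(5, 5))
A = M @ M.T + 5 * np.eye(5)            # symmetric positive-definite system
b = rng.normal(size=5)
x = cg(lambda v: A @ v, b)
print(np.allclose(x, np.linalg.solve(A, b), atol=1e-4))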
Example #6
class PPO(make_on_policy_class(mode='share')):
    '''
    Proximal Policy Optimization, https://arxiv.org/abs/1707.06347
    Emergence of Locomotion Behaviours in Rich Environments, http://arxiv.org/abs/1707.02286, DPPO
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 policy_epoch=4,
                 value_epoch=4,
                 beta=1.0e-3,
                 lr=5.0e-4,
                 lambda_=0.95,
                 epsilon=0.2,
                 value_epsilon=0.2,
                 share_net=True,
                 actor_lr=3e-4,
                 critic_lr=1e-3,
                 kl_reverse=False,
                 kl_target=0.02,
                 kl_target_cutoff=2,
                 kl_target_earlystop=4,
                 kl_beta=[0.7, 1.3],
                 kl_alpha=1.5,
                 kl_coef=1.0,
                 hidden_units={
                     'share': {
                         'continuous': {
                             'share': [32, 32],
                             'mu': [32, 32],
                             'v': [32, 32]
                         },
                         'discrete': {
                             'share': [32, 32],
                             'logits': [32, 32],
                             'v': [32, 32]
                         }
                     },
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'critic': [32, 32]
                 },
                 **kwargs):
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.beta = beta
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.value_epsilon = value_epsilon
        self.share_net = share_net
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)

        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        if self.is_continuous:
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
        if self.share_net:
            # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1], [1])
            if self.is_continuous:
                self.net = rls.a_c_v_continuous(self.feat_dim, self.a_dim, hidden_units['share']['continuous'])
                self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
            else:
                self.net = rls.a_c_v_discrete(self.feat_dim, self.a_dim, hidden_units['share']['discrete'])
                self.net_tv = self.net.trainable_variables + self.other_tv
            self.lr = self.init_lr(lr)
            self.optimizer = self.init_optimizer(self.lr)
            self.model_recorder(dict(
                model=self.net,
                optimizer=self.optimizer
                ))
        else:
            # self.actor_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1])
            # self.critic_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [1])
            if self.is_continuous:
                self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
                self.actor_net_tv = self.actor_net.trainable_variables + [self.log_std]
            else:
                self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
                self.actor_net_tv = self.actor_net.trainable_variables
            self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
            self.critic_tv = self.critic_net.trainable_variables + self.other_tv
            self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
            self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
            self.model_recorder(dict(
                actor=self.actor_net,
                critic=self.critic_net,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic
                ))
            
        self.initialize_data_buffer(
            data_name_list=['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob'])

    def show_logo(self):
        self.recorder.logger.info('''
   xxxxxxxx       xxxxxxxx         xxxxx     
     xx  xx         xx  xx        xxx xxx    
     x   xxx        x   xxx       xx   xx    
     x   xxx        x   xxx       xx   xxx   
     xxxxxx         xxxxxx       xxx   xxx   
     x              x             xx   xxx   
     x              x             xx   xx    
     x              x             xx  xxx    
   xxxxx          xxxxx            xxxxx  
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, value, log_prob, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        self._value = np.squeeze(value.numpy())
        self._log_prob = np.squeeze(log_prob.numpy()) + 1e-10
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                if self.share_net:
                    mu, value = self.net(feat)
                else:
                    mu = self.actor_net(feat)
                    value = self.critic_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
            else:
                if self.share_net:
                    logits, value = self.net(feat)
                else:
                    logits = self.actor_net(feat)
                    value = self.critic_net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
        return sample_op, value, log_prob, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value, self._log_prob)

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            if self.share_net:
                _, value = self.net(feat)
            else:
                value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(), self.data.last_visual_s(), cell_state=self.cell_state, record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    # @show_graph(name='ppo_net')
    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(data, crsty_loss, cell_state):
            early_step = 0
            if self.share_net:
                for i in range(self.policy_epoch):
                    actor_loss, critic_loss, entropy, kl = self.train_share(
                        data,
                        self.kl_coef,
                        crsty_loss,
                        cell_state
                        )
                    if kl > self.kl_stop:
                        early_step = i
                        break
            else:
                for i in range(self.policy_epoch):
                    s, visual_s, a, dc_r, old_log_prob, advantage, old_value = data
                    actor_loss, entropy, kl = self.train_actor(
                        (s, visual_s, a, old_log_prob, advantage),
                        self.kl_coef,
                        cell_state
                    )
                    if kl > self.kl_stop:
                        early_step = i
                        break
                    
                for _ in range(self.value_epoch):  
                    critic_loss = self.train_critic(
                        (s, visual_s, dc_r, old_value),
                        crsty_loss,
                        cell_state
                    )

            # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L93
            if kl > self.kl_high:
                self.kl_coef *= self.kl_alpha
            elif kl < self.kl_low:
                self.kl_coef /= self.kl_alpha

            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/kl', kl],
                ['Statistics/kl_coef', self.kl_coef],
                ['Statistics/early_step', early_step],
                ['Statistics/entropy', entropy]
            ])
            return summaries

        if self.share_net:
            summary_dict = dict([['LEARNING_RATE/lr', self.lr(self.episode)]])
        else:
            summary_dict = dict([
                ['LEARNING_RATE/actor_lr', self.actor_lr(self.episode)],
                ['LEARNING_RATE/critic_lr', self.critic_lr(self.episode)]
            ])

        self._learn(function_dict={
                        'calculate_statistics': self.calculate_statistics,
                        'train_function': _train,
                        'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv', 'value'],
                        'summary_dict': summary_dict
                    })

    @tf.function(experimental_relax_shapes=True)
    def train_share(self, memories, kl_coef, crsty_loss, cell_state):
        s, visual_s, a, dc_r, old_log_prob, advantage, old_value = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu, value = self.net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits, value = self.net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)

                # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
                if self.kl_reverse:
                    kl = tf.reduce_mean(new_log_prob - old_log_prob)
                else:
                    kl = tf.reduce_mean(old_log_prob - new_log_prob)    # a sample estimate for KL-divergence, easy to compute
                surrogate = ratio * advantage
                
                # https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                value_clip = old_value + tf.clip_by_value(value - old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))

                pi_loss = -tf.reduce_mean(
                    tf.minimum(
                        surrogate,
                        tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * advantage
                    ))
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
                actor_loss = pi_loss + kl_loss + extra_loss
                value_loss = 0.5 * tf.reduce_mean(td_square)
                loss = actor_loss + 1.0 * value_loss - self.beta * entropy + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(
                zip(loss_grads, self.net_tv)
            )
            self.global_step.assign_add(1)
            return actor_loss, value_loss, entropy, kl

    @tf.function(experimental_relax_shapes=True)
    def train_actor(self, memories, kl_coef, cell_state):
        s, visual_s, a, old_log_prob, advantage = memories
        with tf.device(self.device):
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)
                kl = tf.reduce_mean(old_log_prob - new_log_prob)
                surrogate = ratio * advantage
                min_adv = tf.where(advantage > 0, (1 + self.epsilon) * advantage, (1 - self.epsilon) * advantage)
                pi_loss = -(tf.reduce_mean(tf.minimum(surrogate, min_adv)) + self.beta * entropy)

                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
                actor_loss = pi_loss + kl_loss + extra_loss

            actor_grads = tape.gradient(actor_loss, self.actor_net_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net_tv)
            )
            self.global_step.assign_add(1)
            return actor_loss, entropy, kl

    @tf.function(experimental_relax_shapes=True)
    def train_critic(self, memories, crsty_loss, cell_state):
        s, visual_s, dc_r, old_value = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                value = self.critic_net(feat)

                value_clip = old_value + tf.clip_by_value(value-old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))

                value_loss = 0.5 * tf.reduce_mean(td_square) + crsty_loss
            critic_grads = tape.gradient(value_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv)
            )
            return value_loss
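Below is a minimal sketch of the clipped surrogate objective and the sample KL estimate minimized in train_share and train_actor above. The log-probabilities, advantages, and epsilon are made-up values.

# Minimal sketch, assuming TF2; the batch values are illustrative, not produced by the RLs pipeline.
import tensorflow as tf

epsilon = 0.2
new_log_prob = tf.constant([[-0.9], [-1.6], [-0.3], [-2.0]])
old_log_prob = tf.constant([[-1.0], [-1.2], [-0.5], [-1.8]])
advantage = tf.constant([[1.5], [-0.7], [0.3], [0.9]])

ratio = tf.exp(new_log_prob - old_log_prob)                           # pi_new / pi_old
surrogate = ratio * advantage
clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
pi_loss = -tf.reduce_mean(tf.minimum(surrogate, clipped))             # PPO-clip objective (to be minimized)
kl = tf.reduce_mean(old_log_prob - new_log_prob)                      # sample KL estimate used for early stopping
print(float(pi_loss), float(kl))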