Exemplo n.º 1
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 epoch=5,
                 beta=1.0e-3,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 hidden_units=None,
                 **kwargs):
        """Build the actor/critic networks, their optimizers, and the buffer.

        Args:
            s_dim: vector-observation dimension (forwarded to the base class).
            visual_sources: number of visual observation sources (base class).
            visual_resolution: resolution of visual observations (base class).
            a_dim: action dimension.
            is_continuous: True for a continuous action space.
            epoch: stored as self.epoch; number of update epochs per train
                call (presumably -- the training loop is not visible here).
            beta: stored as self.beta; NOTE(review): looks like an entropy
                regularization weight -- confirm against the loss code.
            actor_lr: actor learning rate, passed through self.init_lr.
            critic_lr: critic learning rate, passed through self.init_lr.
            hidden_units: dict of hidden-layer sizes for 'actor_continuous',
                'actor_discrete' and 'critic'; defaults to [32, 32] each.
        """
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        # Build the default per call: a mutable dict literal in the signature
        # would be shared across all instances.
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            }
        self.beta = beta
        self.epoch = epoch

        if self.is_continuous:
            # Gaussian policy: the network outputs the mean; a single trainable
            # state-independent log-std vector is kept alongside it.
            self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
        self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
        # other_tv presumably holds shared (e.g. encoder) variables trained
        # together with the critic -- defined in the base class.
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(dict(
            actor=self.actor_net,
            critic=self.critic_net,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic
        ))

        self.initialize_data_buffer()
Exemplo n.º 2
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units=None,
                 **kwargs):
        """Build a deterministic-style actor, a Q critic and their optimizers.

        Args:
            s_dim: vector-observation dimension (forwarded to the base class).
            visual_sources: number of visual observation sources (base class).
            visual_resolution: resolution of visual observations (base class).
            a_dim: action dimension.
            is_continuous: True for a continuous action space.
            actor_lr: actor learning rate, passed through self.init_lr.
            critic_lr: critic learning rate, passed through self.init_lr.
            discrete_tau: stored as self.discrete_tau; NOTE(review): looks
                like a Gumbel-softmax temperature for the discrete branch --
                confirm in the training code.
            hidden_units: dict of hidden-layer sizes for 'actor_continuous',
                'actor_discrete' and 'q'; defaults to [32, 32] each.
        """
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # Avoid the shared-mutable-default pitfall: build the dict per call.
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'q': [32, 32]
            }
        self.discrete_tau = discrete_tau

        if self.is_continuous:
            # Ornstein-Uhlenbeck noise: temporally correlated exploration.
            self.action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
            self.actor_net = ActorCts(self.feat_dim, self.a_dim,
                                      hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim,
                                      hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables

        self.q_net = Critic(self.feat_dim, self.a_dim, hidden_units['q'])
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic=self.q_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))
Exemplo n.º 3
0
Arquivo: pg.py Projeto: ncepuwwy97/RLs
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 epoch=5,
                 hidden_units=None,
                 **kwargs):
        """Vanilla policy-gradient setup: one policy network, one optimizer.

        Args:
            s_dim: vector-observation dimension (forwarded to the base class).
            visual_sources: number of visual observation sources (base class).
            visual_resolution: resolution of visual observations (base class).
            a_dim: action dimension.
            is_continuous: True for a continuous action space.
            lr: learning rate, passed through self.init_lr.
            epoch: stored as self.epoch; number of update epochs per train
                call (presumably -- the training loop is not visible here).
            hidden_units: dict of hidden-layer sizes for 'actor_continuous'
                and 'actor_discrete'; defaults to [32, 32] each.
        """
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        # Per-call default: never share a mutable dict across instances.
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32]
            }
        self.epoch = epoch
        if self.is_continuous:
            # Gaussian policy with a trainable state-independent log-std.
            self.net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
        else:
            self.net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.net,
            optimizer=self.optimizer
        ))

        self.initialize_data_buffer()
Exemplo n.º 4
0
    def __init__(
            self,
            s_dim: Union[int, np.ndarray],
            visual_sources: Union[int, np.ndarray],
            visual_resolution: Union[List, np.ndarray],
            a_dim: Union[int, np.ndarray],
            is_continuous: Union[bool, np.ndarray],
            policy_epoch: int = 4,
            value_epoch: int = 4,
            beta: float = 1.0e-3,
            lr: float = 5.0e-4,
            lambda_: float = 0.95,
            epsilon: float = 0.2,
            value_epsilon: float = 0.2,
            share_net: bool = True,
            actor_lr: float = 3e-4,
            critic_lr: float = 1e-3,
            kl_reverse: bool = False,
            kl_target: float = 0.02,
            kl_target_cutoff: float = 2,
            kl_target_earlystop: float = 4,
            kl_beta: Union[List[float], None] = None,
            kl_alpha: float = 1.5,
            kl_coef: float = 1.0,
            hidden_units: Union[Dict, None] = None,
            **kwargs):
        """PPO-style setup with either a shared actor/critic net or two nets.

        Args:
            policy_epoch / value_epoch: update epochs per train call (stored).
            beta: stored as self.beta; NOTE(review): looks like an entropy
                coefficient -- confirm against the loss code.
            lr: learning rate for the shared network (used only when
                share_net is True).
            lambda_: stored as self.lambda_ (GAE lambda, presumably).
            epsilon / value_epsilon: clip ranges stored on self.
            share_net: if True, one shared actor/critic network and a single
                optimizer; otherwise separate actor and critic nets with
                actor_lr / critic_lr.
            kl_reverse, kl_target, kl_target_cutoff, kl_target_earlystop,
            kl_beta, kl_alpha, kl_coef: KL-penalty schedule knobs; derived
                thresholds kl_cutoff/kl_stop/kl_low/kl_high are precomputed
                below. kl_beta defaults to [0.7, 1.3].
            hidden_units: nested dict of layer sizes; defaults to [32, 32]
                layers throughout (see the literal below).
        """
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # Mutable-default fix: build list/dict defaults per call instead of
        # sharing one literal between all instances.
        if kl_beta is None:
            kl_beta = [0.7, 1.3]
        if hidden_units is None:
            hidden_units = {
                'share': {
                    'continuous': {
                        'share': [32, 32],
                        'mu': [32, 32],
                        'v': [32, 32]
                    },
                    'discrete': {
                        'share': [32, 32],
                        'logits': [32, 32],
                        'v': [32, 32]
                    }
                },
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            }
        self.beta = beta
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.value_epsilon = value_epsilon
        self.share_net = share_net
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)

        # Precomputed KL thresholds used by the adaptive penalty schedule.
        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        if self.is_continuous:
            # Trainable state-independent log-std for the Gaussian policy.
            self.log_std = tf.Variable(initial_value=-0.5 *
                                       np.ones(self.a_dim, dtype=np.float32),
                                       trainable=True)
        if self.share_net:
            if self.is_continuous:
                self.net = ACCtsShare(self.feat_dim, self.a_dim,
                                      hidden_units['share']['continuous'])
                self.net_tv = self.net.trainable_variables + [self.log_std
                                                              ] + self.other_tv
            else:
                self.net = ACDcsShare(self.feat_dim, self.a_dim,
                                      hidden_units['share']['discrete'])
                self.net_tv = self.net.trainable_variables + self.other_tv
            self.lr = self.init_lr(lr)
            self.optimizer = self.init_optimizer(self.lr)
            self.model_recorder(dict(model=self.net, optimizer=self.optimizer))
        else:
            if self.is_continuous:
                self.actor_net = ActorCts(self.feat_dim, self.a_dim,
                                          hidden_units['actor_continuous'])
                self.actor_net_tv = self.actor_net.trainable_variables + [
                    self.log_std
                ]
            else:
                self.actor_net = ActorDcs(self.feat_dim, self.a_dim,
                                          hidden_units['actor_discrete'])
                self.actor_net_tv = self.actor_net.trainable_variables
            self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
            self.critic_tv = self.critic_net.trainable_variables + self.other_tv
            self.actor_lr, self.critic_lr = map(self.init_lr,
                                                [actor_lr, critic_lr])
            self.optimizer_actor, self.optimizer_critic = map(
                self.init_optimizer, [self.actor_lr, self.critic_lr])
            self.model_recorder(
                dict(actor=self.actor_net,
                     critic=self.critic_net,
                     optimizer_actor=self.optimizer_actor,
                     optimizer_critic=self.optimizer_critic))

        # PPO needs stored values and log-probs alongside transitions.
        self.initialize_data_buffer(data_name_list=[
            's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
            'log_prob'
        ])
Exemplo n.º 5
0
 def _actor_net(agent_idx):
     # Build a continuous-action actor for one separately-controlled agent,
     # using that agent's own state/action dimensions.
     return ActorCts(self.s_dim[agent_idx], self.a_dim[agent_idx], hidden_units['actor'])
 # One independent actor per separately-controlled agent.
 self.actor_nets = {idx: _actor_net(idx) for idx in range(self.agent_sep_ctls)}
Exemplo n.º 6
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 beta=1.0e-3,
                 lr=5.0e-4,
                 delta=0.01,
                 lambda_=0.95,
                 cg_iters=10,
                 train_v_iters=10,
                 damping_coeff=0.1,
                 backtrack_iters=10,
                 backtrack_coeff=0.8,
                 epsilon=0.2,
                 critic_lr=1e-3,
                 hidden_units=None,
                 **kwargs):
        """Trust-region-style setup: actor without an optimizer, critic with one.

        Args:
            beta: stored as self.beta; NOTE(review): looks like an entropy
                coefficient -- confirm against the loss code.
            lr: unused in this constructor (only critic_lr drives an
                optimizer); kept for interface compatibility.
            delta: trust-region size stored on self (presumably the KL bound).
            lambda_: stored as self.lambda_ (GAE lambda, presumably).
            cg_iters: conjugate-gradient iterations (stored).
            train_v_iters: value-function update iterations per train call.
            damping_coeff: CG damping coefficient (stored).
            backtrack_iters / backtrack_coeff: line-search schedule (stored).
            epsilon: stored as self.epsilon.
            critic_lr: critic learning rate, passed through self.init_lr.
            hidden_units: dict of hidden-layer sizes for 'actor_continuous',
                'actor_discrete' and 'critic'; defaults to [32, 32] each.
        """
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        # Build the default per call to avoid a shared mutable dict literal.
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            }
        self.beta = beta
        self.delta = delta
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.cg_iters = cg_iters
        self.damping_coeff = damping_coeff
        self.backtrack_iters = backtrack_iters
        self.backtrack_coeff = backtrack_coeff
        self.train_v_iters = train_v_iters

        if self.is_continuous:
            # Gaussian policy with a trainable state-independent log-std.
            self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
        # Only the critic is trained by gradient descent; the actor is
        # updated by the trust-region step elsewhere.
        self.critic_net = Critic(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.critic_lr = self.init_lr(critic_lr)
        self.optimizer_critic = self.init_optimizer(self.critic_lr)

        self.model_recorder(dict(
            actor=self.actor_net,
            critic=self.critic_net,
            optimizer_critic=self.optimizer_critic
        ))

        # The buffer stores the old policy parameters needed to recompute
        # KL terms: mu/log_std for continuous, full logp for discrete.
        if self.is_continuous:
            data_name_list = ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob', 'old_mu', 'old_log_std']
        else:
            data_name_list = ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob', 'old_logp_all']
        self.initialize_data_buffer(
            data_name_list=data_name_list)
Exemplo n.º 7
0
 def _actor_net(agent_idx):
     # Continuous-action actor for one agent, sized from its own dims.
     return ActorCts(self.s_dim[agent_idx],
                     self.a_dim[agent_idx],
                     network_settings['actor'])
Exemplo n.º 8
0
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            discrete_tau=1.0,
            log_std_bound=None,
            hidden_units=None,
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            curl_lr=5.0e-4,
            img_size=64,
            **kwargs):
        """SAC-style agent with a contrastive (CURL) visual encoder.

        Args:
            alpha: initial entropy temperature (used when auto_adaption is
                False).
            annealing: with auto_adaption off, anneal alpha from `alpha` to
                `last_alpha` via LinearAnnealing (schedule length 1.0e6 --
                units as defined by LinearAnnealing).
            last_alpha: final annealed temperature.
            ployak: soft-update (Polyak) coefficient, stored on self.
            discrete_tau: stored; NOTE(review): looks like a Gumbel-softmax
                temperature for discrete actions -- confirm in training code.
            log_std_bound: [min, max] bounds for the policy log-std; defaults
                to [-20, 2].
            hidden_units: network layer sizes; defaults mirror the original
                literal (128-unit trunks, 128-dim encoder).
            auto_adaption: if True, log_alpha is a trainable variable.
            actor_lr / critic_lr / alpha_lr / curl_lr: learning rates.
            img_size: square side length of the visual input.
        """
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # Per-call defaults: list/dict literals in the signature are shared
        # between instances (mutable-default pitfall).
        if log_std_bound is None:
            log_std_bound = [-20, 2]
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128],
                'encoder': 128
            }
        # This agent only supports a single visual stream.
        assert self.visual_sources == 1
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        self.img_size = img_size
        self.img_dim = [img_size, img_size, self.visual_dim[-1]]
        self.vis_feat_size = hidden_units['encoder']

        if self.auto_adaption:
            # Learnable temperature, initialized at log(1) = 0.
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha,
                                                       1.0e6)

        # The policy consumes [state features ++ encoded visual features].
        if self.is_continuous:
            self.actor_net = ActorCts(self.s_dim + self.vis_feat_size,
                                      self.a_dim,
                                      hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.s_dim + self.vis_feat_size,
                                      self.a_dim,
                                      hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        def _q_net():
            # Factory for one Q head over the concatenated feature space.
            return Critic(self.s_dim + self.vis_feat_size, self.a_dim,
                          hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)

        # Online and target visual encoders.
        self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder'])
        self.encoder_target = VisualEncoder(self.img_dim,
                                            hidden_units['encoder'])

        # Bilinear weight matrix for the CURL similarity score.
        self.curl_w = tf.Variable(
            initial_value=tf.random.normal(shape=(self.vis_feat_size,
                                                  self.vis_feat_size)),
            name='curl_w',
            dtype=tf.float32,
            trainable=True)

        # The encoder is trained jointly with the critic.
        self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables

        # Copy online weights into the target critic + target encoder.
        update_target_net_weights(
            self.critic_target_net.weights +
            self.encoder_target.trainable_variables,
            self.critic_net.weights + self.encoder.trainable_variables)
        self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                critic_net=self.critic_net,
                curl_w=self.curl_w,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
                optimizer_curl=self.optimizer_curl,
            ))
Exemplo n.º 9
0
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            entropic_index=1.5,
            discrete_tau=1.0,
            log_std_bound=None,
            hidden_units=None,
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            **kwargs):
        """SAC-style agent with a generalized (Tsallis-like) entropy index.

        Args:
            alpha: initial entropy temperature (used when auto_adaption is
                False).
            annealing: with auto_adaption off, anneal alpha from `alpha` to
                `last_alpha` via LinearAnnealing (schedule length 1e6).
            ployak: soft-update (Polyak) coefficient, stored on self.
            entropic_index: stored transformed as 2 - entropic_index;
                NOTE(review): presumably the Tsallis q-index -- confirm the
                transform against the loss code.
            discrete_tau: stored; presumably a Gumbel-softmax temperature.
            log_std_bound: [min, max] bounds for the policy log-std; defaults
                to [-20, 2].
            hidden_units: network layer sizes; defaults mirror the original
                literal (128-unit trunks).
            auto_adaption: if True, log_alpha is a trainable variable.
            actor_lr / critic_lr / alpha_lr: learning rates.
        """
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # Per-call defaults instead of shared mutable literals.
        if log_std_bound is None:
            log_std_bound = [-20, 2]
        if hidden_units is None:
            hidden_units = {
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128]
            }
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        if self.auto_adaption:
            # Learnable temperature, initialized at log(1) = 0.
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)

        if self.is_continuous:
            self.actor_net = ActorCts(self.feat_dim, self.a_dim,
                                      hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim,
                                      hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        def _q_net():
            # Factory for one Q head; DoubleQ builds the twin critics.
            return CriticQ1(self.feat_dim, self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        # Copy online critic weights into the target critic at start.
        update_target_net_weights(self.critic_target_net.weights,
                                  self.critic_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                critic_net=self.critic_net,
                log_alpha=self.log_alpha,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
            ))
Exemplo n.º 10
0
 def _actor_net():
     # Factory: fresh continuous-action actor over the shared feature space.
     return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
Exemplo n.º 11
0
 def _actor_net():
     # Factory: continuous-action actor over the shared feature space.
     return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
 # Ornstein-Uhlenbeck noise: temporally correlated exploration signal.
 self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim),
                                                  sigma=0.2 * np.ones(self.a_dim))
Exemplo n.º 12
0
 def _low_actor_net():
     # Low-level actor: conditions on the state concatenated with a sub-goal.
     return ActorCts(self.s_dim + self.sub_goal_dim,
                     self.a_dim, hidden_units['low_actor'])
Exemplo n.º 13
0
 def _high_actor_net():
     # High-level actor: maps the state to a sub-goal proposal.
     return ActorCts(self.s_dim, self.sub_goal_dim, hidden_units['high_actor'])