Пример #1
0
    def __init__(self,
                 s_dim,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 n=1,
                 i=0,
                 hidden_units={
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        assert is_continuous, 'matd3 only support continuous action space'
        raise Exception('MA系列存在问题,还未修复')
        super().__init__(s_dim=s_dim,
                         visual_sources=0,
                         visual_resolution=0,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.n = n
        self.i = i
        self.ployak = ployak

        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))

        def _actor_net():
            return rls.actor_dpg(self.s_dim, 0, self.a_dim,
                                 hidden_units['actor'])

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()

        def _q_net():
            return rls.critic_q_one((self.s_dim) * self.n, 0,
                                    (self.a_dim) * self.n, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_actor=self.optimizer_actor))

        self.recorder.logger.info(self.action_noise)
Пример #2
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'maxsqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        self.log_alpha = alpha if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net():
            return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.update_target_net_weights(self.critic_target_net.weights,
                                       self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))
Пример #3
0
Файл: tac.py Проект: yyht/RLs
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 ployak=0.995,
                 entropic_index=1.5,
                 discrete_tau=1.0,
                 log_std_bound=[-20, 2],
                 hidden_units={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128]
                 },
                 auto_adaption=True,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 **kwargs):
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)

        if self.is_continuous:
            self.actor_net = rls.actor_continuous(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (self.a_dim if self.is_continuous else np.log(self.a_dim))

        def _q_net(): return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q'])
        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr])

        self.model_recorder(dict(
            actor=self.actor_net,
            critic_net=self.critic_net,
            log_alpha=self.log_alpha,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic,
            optimizer_alpha=self.optimizer_alpha,
        ))
Пример #4
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 delay_num=2,
                 noise_type='gaussian',
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:

            def _actor_net():
                return rls.actor_dpg(self.feat_dim, self.a_dim,
                                     hidden_units['actor_continuous'])

            if noise_type == 'gaussian':
                self.action_noise = rls.ClippedNormalActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=self.gaussian_noise_sigma * np.ones(self.a_dim),
                    bound=self.gaussian_noise_bound)
            elif noise_type == 'ou':
                self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:

            def _actor_net():
                return rls.actor_discrete(self.feat_dim, self.a_dim,
                                          hidden_units['actor_discrete'])

            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net():
            return rls.critic_q_one(self.feat_dim, self.a_dim,
                                    hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))
Пример #5
0
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                      sigma=1.0 *
                                                      np.ones(self.a_dim),
                                                      bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(
            self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                'low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
Пример #6
0
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            discrete_tau=1.0,
            log_std_bound=[-20, 2],
            hidden_units={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128],
                'encoder': 128
            },
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            curl_lr=5.0e-4,
            img_size=64,
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        assert self.visual_sources == 1
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        self.img_size = img_size
        self.img_dim = [img_size, img_size, self.visual_dim[-1]]
        self.vis_feat_size = hidden_units['encoder']

        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha,
                                                       1.0e6)

        if self.is_continuous:
            self.actor_net = rls.actor_continuous(
                self.s_dim + self.vis_feat_size, self.a_dim,
                hidden_units['actor_continuous'])
        else:
            self.actor_net = rls.actor_discrete(
                self.s_dim + self.vis_feat_size, self.a_dim,
                hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        def _q_net():
            return rls.critic_q_one(self.s_dim + self.vis_feat_size,
                                    self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)

        self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder'])
        self.encoder_target = VisualEncoder(self.img_dim,
                                            hidden_units['encoder'])

        self.curl_w = tf.Variable(
            initial_value=tf.random.normal(shape=(self.vis_feat_size,
                                                  self.vis_feat_size)),
            name='curl_w',
            dtype=tf.float32,
            trainable=True)

        self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables

        self.update_target_net_weights(
            self.critic_target_net.weights +
            self.encoder_target.trainable_variables,
            self.critic_net.weights + self.encoder.trainable_variables)
        self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                critic_net=self.critic_net,
                curl_w=self.curl_w,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
                optimizer_curl=self.optimizer_curl,
            ))