Example #1
File: ma_ddpg.py  Project: yyht/RLs
    def __init__(self,
                 s_dim,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 n=1,
                 i=0,
                 hidden_units={
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        assert is_continuous, 'maddpg only support continuous action space'
        raise Exception('The MA-series algorithms are known to be broken and have not been fixed yet')
        super().__init__(s_dim=s_dim,
                         visual_sources=0,
                         visual_resolution=0,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.n = n
        self.i = i
        self.ployak = ployak

        self.rnn_net = self._rnn_net(self.visual_net.hdim)

        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))

        def _actor_net():
            return rls.actor_dpg(self.s_dim, 0, self.a_dim,
                                 hidden_units['actor'])

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()

        def _q_net():
            return rls.critic_q_one((self.s_dim) * self.n, 0,
                                    (self.a_dim) * self.n, hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.update_target_net_weights(
            self.actor_target_net.weights + self.q_target_net.weights,
            self.actor_net.weights + self.q_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 q=self.q_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_actor=self.optimizer_actor))
        self.recorder.logger.info(self.action_noise)
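Each of these constructors pairs every online network with a target copy and stores ployak=0.995, which suggests polyak-averaged (soft) target updates during training. The update routine itself is not shown in these snippets; a minimal numpy sketch of such a soft update, using a hypothetical soft_update helper, is:

import numpy as np

def soft_update(target_weights, source_weights, ployak=0.995):
    # Polyak-averaged (soft) target update: target <- ployak * target + (1 - ployak) * source.
    # A generic sketch; the repo's update_target_net_weights may differ in detail.
    return [ployak * t + (1.0 - ployak) * s
            for t, s in zip(target_weights, source_weights)]

# Toy usage with plain numpy arrays standing in for layer weights.
target = [np.zeros(3)]
source = [np.ones(3)]
target = soft_update(target, source, ployak=0.995)
print(target[0])  # [0.005 0.005 0.005]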
Example #2
File: ddpg.py  Project: yyht/RLs
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau

        if self.is_continuous:
            def _actor_net(): return rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = rls.OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:
            def _actor_net(): return rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net(): return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q'])
        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.update_target_net_weights(
            self.actor_target_net.weights + self.q_target_net.weights,
            self.actor_net.weights + self.q_net.weights
        )
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(dict(
            actor=self.actor_net,
            critic=self.q_net,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic
        ))
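In the discrete branch the constructor keeps tfp.distributions.Gumbel(0, 1) together with a temperature discrete_tau, which points at Gumbel-Softmax style action sampling so that discrete actions remain differentiable. A minimal numpy sketch of that sampling step (the name gumbel_softmax_sample is illustrative, not from the repo):

import numpy as np

def gumbel_softmax_sample(logits, tau=1.0, rng=np.random.default_rng(0)):
    # Add Gumbel(0, 1) noise to the logits, then apply a temperature-controlled
    # softmax; in an autodiff framework this keeps the sample differentiable.
    gumbel_noise = -np.log(-np.log(rng.uniform(size=logits.shape)))
    y = (logits + gumbel_noise) / tau
    y = y - y.max()  # numerical stability
    probs = np.exp(y) / np.exp(y).sum()
    return probs

print(gumbel_softmax_sample(np.array([1.0, 0.5, -0.2]), tau=1.0))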
Example #3
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 delay_num=2,
                 noise_type='gaussian',
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:
            _actor_net = lambda: rls.actor_dpg(
                self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            if noise_type == 'gaussian':
                self.action_noise = rls.ClippedNormalActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=self.gaussian_noise_sigma * np.ones(self.a_dim),
                    bound=self.gaussian_noise_bound)
            elif noise_type == 'ou':
                self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=0.2 * np.exp(-self.episode / 10) *
                    np.ones(self.a_dim))
        else:
            _actor_net = lambda: rls.actor_discrete(
                self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        _q_net = lambda: rls.critic_q_one(self.feat_dim, self.a_dim,
                                          hidden_units['q'])
        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))
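Example #3 is a TD3-style variant: twin critics wrapped in DoubleQ, a delayed-update count delay_num=2, and either Ornstein-Uhlenbeck or clipped Gaussian exploration noise. The internals of rls.ClippedNormalActionNoise are not shown here; a plausible numpy sketch of clipped Gaussian noise driven by gaussian_noise_sigma and gaussian_noise_bound:

import numpy as np

def clipped_gaussian_noise(a_dim, sigma=0.2, bound=0.2, rng=np.random.default_rng(0)):
    # Sample zero-mean Gaussian noise per action dimension and clip it to
    # [-bound, bound], as in TD3-style exploration / target-policy smoothing.
    noise = rng.normal(loc=0.0, scale=sigma, size=a_dim)
    return np.clip(noise, -bound, bound)

print(clipped_gaussian_noise(a_dim=3, sigma=0.2, bound=0.2))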
Example #4
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            actor_lr=5.0e-4,
            reward_critic_lr=1.0e-3,
            cost_critic_lr=1.0e-3,
            lambda_lr=5.0e-4,
            discrete_tau=1.0,
            cost_constraint=1.0,
            hidden_units={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'reward': [32, 32],
                'cost': [32, 32]
            },
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self._lambda = tf.Variable(0.0, dtype=tf.float32)
        self.cost_constraint = cost_constraint  # long-term cost <= d

        if self.is_continuous:
            _actor_net = lambda: rls.actor_dpg(
                self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.a_dim),
                sigma=0.2 * np.exp(-self.episode / 10) * np.ones(self.a_dim))
        else:
            _actor_net = lambda: rls.actor_discrete(
                self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        _critic_net = lambda hiddens: rls.critic_q_one(self.feat_dim,
                                                       self.a_dim, hiddens)
        self.reward_critic_net = _critic_net(hidden_units['reward'])
        self.reward_critic_target_net = _critic_net(hidden_units['reward'])
        self.cost_critic_net = _critic_net(hidden_units['cost'])
        self.cost_critic_target_net = _critic_net(hidden_units['cost'])

        self.reward_critic_tv = self.reward_critic_net.trainable_variables + self.other_tv
        self.update_target_net_weights(
            self.actor_target_net.weights +
            self.reward_critic_target_net.weights +
            self.cost_critic_target_net.weights, self.actor_net.weights +
            self.reward_critic_net.weights + self.cost_critic_net.weights)
        self.lambda_lr = lambda_lr
        self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(
            self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr])
        self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(
            self.init_optimizer,
            [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 reward_critic=self.reward_critic_net,
                 cost_critic=self.cost_critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_reward_critic=self.optimizer_reward_critic,
                 optimizer_cost_critic=self.optimizer_cost_critic))
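Example #4 adds a Lagrange multiplier self._lambda and a cost_constraint (long-term cost <= d), i.e. a constrained variant in which the actor trades reward Q against lambda-weighted cost Q and lambda is adjusted by dual ascent at rate lambda_lr. Those update rules live outside this constructor; a hedged sketch of what they typically look like, with hypothetical helper names:

def lagrangian_actor_objective(q_reward, q_cost, lam):
    # The actor maximizes reward Q minus a lambda-weighted cost Q.
    return q_reward - lam * q_cost

def lambda_update(lam, avg_cost, cost_constraint, lambda_lr=5.0e-4):
    # Dual ascent on lambda: grow it when average cost exceeds the constraint d,
    # shrink it otherwise, and keep it non-negative.
    return max(0.0, lam + lambda_lr * (avg_cost - cost_constraint))

lam = 0.0
lam = lambda_update(lam, avg_cost=1.5, cost_constraint=1.0)
print(lam)  # 0.00025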