Example #1
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 polyak=0.995,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.polyak = polyak

        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training

        self.memory = MemoryNP(capacity=10000,
                               observation_shape=self.observation_shape,
                               action_shape=self.action_space.shape)

        self.actor_model, self.critic_model = self._build_network()
        self.target_actor_model, self.target_critic_model = self._build_network(
        )
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model.set_weights(self.critic_model.get_weights())

        self.step_count = 0
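The constructor above stores transitions in a MemoryNP replay buffer that is not included in these examples. As a rough sketch of the assumed interface (capacity, observation_shape, action_shape, store_transition, sample_batch), a minimal NumPy ring buffer could look like the following; the batch_size default and the array layouts are assumptions, not the original implementation:

import numpy as np

class MemoryNP:
    """Minimal ring-buffer replay memory (illustrative sketch, not the original MemoryNP)."""
    def __init__(self, capacity, observation_shape, action_shape, batch_size=64):
        self.capacity = capacity
        self.batch_size = batch_size
        self.observations = np.zeros((capacity, *observation_shape))
        self.actions = np.zeros((capacity, *action_shape))
        self.rewards = np.zeros((capacity, 1))
        self.terminals = np.zeros((capacity, 1), dtype=bool)
        self.next_observations = np.zeros((capacity, *observation_shape))
        self.size = 0    # number of transitions currently stored
        self.index = 0   # next write position

    def store_transition(self, observation, action, reward, terminal,
                         next_observation):
        i = self.index
        self.observations[i] = observation
        self.actions[i] = action
        self.rewards[i] = reward
        self.terminals[i] = terminal
        self.next_observations[i] = next_observation
        self.index = (self.index + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample_batch(self):
        # sample indices uniformly from the filled part of the buffer
        idx = np.random.randint(0, self.size, size=self.batch_size)
        return (self.observations[idx], self.actions[idx], self.rewards[idx],
                self.terminals[idx], self.next_observations[idx])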
Example #2
    def __init__(self,
                 observation_space: Box,
                 action_space: Discrete,
                 train_policy=None,
                 test_policy=None,
                 lr=3e-4,
                 gamma=0.99,
                 memory_size=10000,
                 target_model_update=0.99,
                 training=True,
                 enable_double_dqn=True,
                 dueling_type=None):

        super().__init__(observation_space, action_space)

        # learning rate
        self.lr = lr
        # discount factor
        self.gamma = gamma
        # target-model update setting: if `target_model_update < 1` a soft update is used,
        # if `target_model_update >= 1` a hard update is used
        self.target_model_update = target_model_update
        # policy used during training
        if train_policy is None:
            self.train_policy = DecayEpsGreedyQPolicy()
        else:
            self.train_policy = train_policy

        # policy used during testing
        if test_policy is None:
            self.test_policy = GreedyQPolicy()
        else:
            self.test_policy = test_policy

        # marks the agent's mode: True for training, False for testing;
        # the active policy is chosen according to this flag
        self.training = training
        self.policy = None
        self.switch_mode(self.training)

        # whether to use double DQN
        self.enable_double_dqn = enable_double_dqn
        # whether to use a dueling DQN, and which aggregation type
        self.dueling_type = dueling_type

        # number of discrete actions
        self.nb_actions = action_space.n
        # since actions are discrete, a single value is enough to represent one
        self.action_shape = (1, )
        # shape of the observations
        self.observation_shape = observation_space.shape

        # replay buffer
        self.memory = MemoryNP(capacity=memory_size,
                               action_shape=self.action_shape,
                               observation_shape=self.observation_shape)

        # build the DNN model and the target model, and initialise their weights
        self.model, self.target_model = self.build_all_models()

        # step counter
        self.step_count = 0
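DecayEpsGreedyQPolicy and GreedyQPolicy are referenced above but not defined in these examples. Judging from how they are used later (policy.select_action(q_values)), each policy only needs a select_action method; the sketch below is an illustration under that assumption, and the epsilon decay schedule with its defaults is made up for the example:

import numpy as np

class GreedyQPolicy:
    """Always pick the action with the largest Q value (sketch of the assumed interface)."""
    def select_action(self, q_values):
        return int(np.argmax(q_values))

class DecayEpsGreedyQPolicy:
    """Epsilon-greedy action selection with a decaying epsilon (illustrative sketch)."""
    def __init__(self, eps=1.0, eps_min=0.05, decay=0.995):
        self.eps = eps
        self.eps_min = eps_min
        self.decay = decay

    def select_action(self, q_values):
        if np.random.rand() < self.eps:
            action = np.random.randint(len(q_values))
        else:
            action = int(np.argmax(q_values))
        # decay epsilon towards its floor after every call
        self.eps = max(self.eps_min, self.eps * self.decay)
        return action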
Example #3
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 sigma=0.3,
                 polyak=0.995,
                 pi_lr=0.001,
                 q_lr=0.001,
                 batch_size=100,
                 action_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 policy_delay=2,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.sigma = sigma
        self.polyak = polyak
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.action_noise = action_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay

        self.action_space = action_space
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training

        self.memory = MemoryNP(
            capacity=10000,
            observation_shape=self.observation_shape,
            action_shape=self.action_space.shape
        )

        self.actor_model, self.critic_model1, self.critic_model2 = self._build_network()

        self.target_actor_model, self.target_critic_model1, self.target_critic_model2 = self._build_network()

        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model1.set_weights(self.critic_model1.get_weights())
        self.target_critic_model2.set_weights(self.critic_model2.get_weights())

        self.step_count = 0
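Every example calls super().__init__(observation_space, action_space) on an Agent base class that is not shown here. A minimal placeholder consistent with how the subclasses use it (they read self.observation_space / self.action_space and override forward, backward and switch_mode) might be:

class Agent:
    """Assumed minimal base class; the real Agent is not part of these examples."""
    def __init__(self, observation_space, action_space):
        self.observation_space = observation_space
        self.action_space = action_space

    def forward(self, observation):
        """Return an action for the given observation."""
        raise NotImplementedError

    def backward(self, observation, action, reward, terminal, next_observation):
        """Consume one transition and (optionally) train."""
        raise NotImplementedError

    def switch_mode(self, training=None):
        """Toggle or set training/testing mode."""
        raise NotImplementedError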
Example #4
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 alpha=0.2,
                 polyak=0.995,
                 value_network_lr=3e-4,
                 soft_q_network_lr=3e-4,
                 policy_network_lr=3e-4,
                 log_std_min=-20,
                 log_std_max=2):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.alpha = alpha
        self.polyak = polyak
        self.nb_steps_warm_up = nb_steps_warm_up
        self.value_network_lr = value_network_lr
        self.soft_q_network_lr = soft_q_network_lr
        self.policy_network_lr = policy_network_lr
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        self.min_entropy = -self.action_space.shape[0]

        self.memory = MemoryNP(capacity=10000,
                               observation_shape=observation_space.shape,
                               action_shape=action_space.shape)

        self.value_net = self._build_value_network()
        self.target_value_net = self._build_value_network()
        self.target_value_net.set_weights(self.value_net.get_weights())

        self.soft_q_net1 = self._build_soft_q_network()
        self.soft_q_net2 = self._build_soft_q_network()

        self.policy_net = self._build_policy_network()

        self.step_count = 0
        self.training = True
Example #5
class DQNAgent(Agent):
    def __init__(self,
                 observation_space: Box,
                 action_space: Discrete,
                 train_policy=None,
                 test_policy=None,
                 lr=3e-4,
                 gamma=0.99,
                 memory_size=10000,
                 target_model_update=0.99,
                 training=True,
                 enable_double_dqn=True,
                 dueling_type=None):

        super().__init__(observation_space, action_space)

        # learning rate
        self.lr = lr
        # discount factor
        self.gamma = gamma
        # target-model update setting: if `target_model_update < 1` a soft update is used,
        # if `target_model_update >= 1` a hard update is used
        self.target_model_update = target_model_update
        # policy used during training
        if train_policy is None:
            self.train_policy = DecayEpsGreedyQPolicy()
        else:
            self.train_policy = train_policy

        # policy used during testing
        if test_policy is None:
            self.test_policy = GreedyQPolicy()
        else:
            self.test_policy = test_policy

        # marks the agent's mode: True for training, False for testing;
        # the active policy is chosen according to this flag
        self.training = training
        self.policy = None
        self.switch_mode(self.training)

        # whether to use double DQN
        self.enable_double_dqn = enable_double_dqn
        # whether to use a dueling DQN, and which aggregation type
        self.dueling_type = dueling_type

        # number of discrete actions
        self.nb_actions = action_space.n
        # since actions are discrete, a single value is enough to represent one
        self.action_shape = (1, )
        # shape of the observations
        self.observation_shape = observation_space.shape

        # replay buffer
        self.memory = MemoryNP(capacity=memory_size,
                               action_shape=self.action_shape,
                               observation_shape=self.observation_shape)

        # build the DNN model and the target model, and initialise their weights
        self.model, self.target_model = self.build_all_models()

        # step counter
        self.step_count = 0

    def build_all_models(self):
        model = self.build_q_net()
        target_model = self.build_q_net()

        model = self.use_dueling_network(model)
        target_model = self.use_dueling_network(target_model)

        target_model.set_weights(model.get_weights())

        model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.lr),
                      metrics=['mse'],
                      loss=tf.keras.losses.mean_squared_error)

        return model, target_model

    def use_dueling_network(self, model):
        layer = model.layers[-2]
        y = tf.keras.layers.Dense(self.nb_actions + 1,
                                  activation='linear')(layer.output)
        if self.dueling_type == 'avg':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.
                reduce_mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(self.nb_actions, ))(y)
        elif self.dueling_type == 'max':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.
                reduce_max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(self.nb_actions, ))(y)
        elif self.dueling_type == 'naive':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:],
                output_shape=(self.nb_actions, ))(y)
        else:
            output_layer = model.layers[-1].output

        model = tf.keras.models.Model(inputs=model.input, outputs=output_layer)

        return model

    def build_q_net(self):
        '''
        Build the deep Q network. To change the network's capacity, override this
        method, but keep the input and output shapes the same.
        :return:  q_net
        '''
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128,
                                  activation='relu',
                                  input_shape=self.observation_shape),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(self.nb_actions, activation='linear')
        ])

        return model

    def switch_mode(self, training=None):
        '''
        Switch the action policy:
        :param training:  the agent's mode,
            training=True: training mode
            training=False: testing mode
        '''
        if training is None:
            self.training = not self.training
        else:
            self.training = training

        if self.training:
            self.policy = self.train_policy
            print("Switch to train mode.")
        else:
            self.policy = self.test_policy
            print("Switch to test mode.")

    def forward(self, observation):
        '''
        Select an action based on the current observation.
        :param observation: the observation
        :return: the selected action
        '''
        observation = np.expand_dims(observation, axis=0)
        q_values = self.model.predict(observation).squeeze(0)
        action = self.policy.select_action(q_values)
        return action

    def backward(self, observation, action, reward, terminal,
                 next_observation):
        '''
        Each interaction with the environment produces one MDP transition tuple.
        Store the tuple in the replay buffer, then sample a batch and train.
        :param observation:
        :param action:
        :param reward:
        :param terminal:
        :param next_observation:
        :return:
        '''
        # store the transition
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)

        # update the Q network
        if self.enable_double_dqn:
            self.update_model_double_dqn()
        else:
            self.update_model()

        # update the target Q network
        self.update_target_model()

    def update_model(self):
        # sample a batch from the replay buffer
        observations, actions, rewards, terminals, next_observations = self.memory.sample_batch(
        )

        # compute the target Q values
        target_q_values = np.max(self.target_model.predict(next_observations),
                                 axis=1,
                                 keepdims=True)
        actions = tf.keras.utils.to_categorical(
            actions, num_classes=self.nb_actions).astype(bool)
        q_values = self.model.predict(observations)
        q_values[
            actions,
            np.newaxis] = rewards + self.gamma * target_q_values * (~terminals)

        # update the Q network
        self.model.fit(observations, q_values, verbose=0)

    def update_model_double_dqn(self):
        # sample a batch from the replay buffer
        observations, actions, rewards, terminals, next_observations = self.memory.sample_batch(
        )

        # compute the target Q values: the online model selects the action,
        # the target model evaluates it
        q_values = self.model.predict(observations)
        q_values_next = self.model.predict(next_observations)

        target_action = tf.keras.utils.to_categorical(
            np.argmax(q_values_next,
                      axis=1), num_classes=self.nb_actions).astype(bool)

        target_q_values = self.target_model.predict(
            next_observations)[target_action].reshape(-1, 1)

        actions = tf.keras.utils.to_categorical(
            actions, num_classes=self.nb_actions).astype(bool)
        q_values[
            actions,
            np.newaxis] = rewards + self.gamma * target_q_values * (~terminals)

        # update the Q network
        self.model.fit(observations, q_values, verbose=0)

    def update_target_model(self):
        if self.target_model_update < 1.:
            # soft update: w'(t+1) = w'(t) * lambda + w(t) * (1 - lambda)
            new_target_model_weights = polyak_averaging(
                weights_list=self.model.get_weights(),
                target_weights_list=self.target_model.get_weights(),
                polyak=self.target_model_update)
            self.target_model.set_weights(new_target_model_weights)
        else:
            # hard update: w'(t+1) = w(t)
            self.step_count += 1
            if self.step_count % int(self.target_model_update) == 0:
                self.target_model.set_weights(self.model.get_weights())
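update_target_model above, as well as the DDPG, SAC and TD3 agents below, calls a module-level polyak_averaging helper that is not included in these snippets. Based on the soft-update formula in the comment, w'(t+1) = w'(t) * lambda + w(t) * (1 - lambda), a matching sketch would be:

def polyak_averaging(weights_list, target_weights_list, polyak):
    """Blend online weights into target weights: w' <- polyak * w' + (1 - polyak) * w."""
    return [
        polyak * target_weights + (1.0 - polyak) * weights
        for weights, target_weights in zip(weights_list, target_weights_list)
    ]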
Example #6
class DDPGAgent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 polyak=0.995,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.polyak = polyak

        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training

        self.memory = MemoryNP(capacity=10000,
                               observation_shape=self.observation_shape,
                               action_shape=self.action_space.shape)

        self.actor_model, self.critic_model = self._build_network()
        self.target_actor_model, self.target_critic_model = self._build_network(
        )
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model.set_weights(self.critic_model.get_weights())

        self.step_count = 0

    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions, ),
                                              dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(
            shape=self.observation_shape, dtype=tf.float64)

        # build the actor model
        y = tf.keras.layers.Dense(32, activation='relu')(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)

        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(optimizer=tf.keras.optimizers.Adam(lr=3e-4),
                            loss='mse')

        # build the critic model
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)

        critic_model = tf.keras.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(optimizer=tf.keras.optimizers.Adam(lr=3e-4),
                             loss='mse')

        return actor_model, critic_model

    def forward(self, observation):
        self.step_count += 1

        if self.step_count < self.nb_steps_warm_up:
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                # add zero-mean Gaussian exploration noise
                action = action + 0.3 * np.random.randn(self.nb_actions)
            return action

    def backward(self, observation, action, reward, terminal,
                 next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)

        if self.step_count < self.nb_steps_warm_up:
            return
        else:
            self._update()

    def _update(self):
        observations, actions, rewards, terminals, next_observations = self.memory.sample_batch(
        )

        self._update_critic(observations, actions, rewards, terminals,
                            next_observations)
        self._update_actor(observations)

        # update the critic's target network
        new_target_critic_weights_list = polyak_averaging(
            self.critic_model.get_weights(),
            self.target_critic_model.get_weights(), self.polyak)
        self.target_critic_model.set_weights(new_target_critic_weights_list)

        # update the actor's target network
        new_target_actor_weights_list = polyak_averaging(
            self.actor_model.get_weights(),
            self.target_actor_model.get_weights(), self.polyak)
        self.target_actor_model.set_weights(new_target_actor_weights_list)

    def polyak_averaging(self, weights_list, target_weights_list):
        new_target_weights_list = []
        for weights, target_weights in zip(weights_list, target_weights_list):
            new_target_weights = self.polyak * target_weights + (
                1 - self.polyak) * weights
            new_target_weights_list.append(new_target_weights)
        return new_target_weights_list

    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model(
                [observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)

        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(
            zip(actor_grads, self.actor_model.trainable_weights))

    def _update_critic(self, observations, actions, rewards, terminals,
                       next_observations):
        # DDPG target: r + gamma * (1 - done) * Q'(s', mu'(s'))
        q_values_next = self.target_critic_model(
            [next_observations,
             self.target_actor_model(next_observations)])
        target_q_values = rewards + self.gamma * q_values_next * (1.0 - terminals)
        self.critic_model.fit([observations, actions],
                              target_q_values,
                              verbose=0)

    def switch_mode(self, training=None):
        """
        :param training:  agent所处的模式,
            training=True: 训练模式
            training=False: 测试模式
        """
        if training is None:
            self.training = ~self.training
        else:
            self.training = training

        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")
Example #7
class SACAgent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 alpha=0.2,
                 polyak=0.995,
                 value_network_lr=3e-4,
                 soft_q_network_lr=3e-4,
                 policy_network_lr=3e-4,
                 log_std_min=-20,
                 log_std_max=2):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.alpha = alpha
        self.polyak = polyak
        self.nb_steps_warm_up = nb_steps_warm_up
        self.value_network_lr = value_network_lr
        self.soft_q_network_lr = soft_q_network_lr
        self.policy_network_lr = policy_network_lr
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        self.min_entropy = -self.action_space.shape[0]

        self.memory = MemoryNP(capacity=10000,
                               observation_shape=observation_space.shape,
                               action_shape=action_space.shape)

        self.value_net = self._build_value_network()
        self.target_value_net = self._build_value_network()
        self.target_value_net.set_weights(self.value_net.get_weights())

        self.soft_q_net1 = self._build_soft_q_network()
        self.soft_q_net2 = self._build_soft_q_network()

        self.policy_net = self._build_policy_network()

        self.step_count = 0
        self.training = True

    def forward(self, observation):
        self.step_count += 1

        if self.step_count < self.nb_steps_warm_up:
            return self.action_space.sample()
        else:
            if observation.ndim == 1:
                observation = np.expand_dims(observation, axis=0)

            mean, log_std = self.policy_net.predict(observation)

            std = tf.math.exp(log_std)
            action = mean + tf.random.normal(tf.shape(mean)) * std

            return action

    def backward(self, observation, action, reward, terminal,
                 next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)

        if self.step_count >= self.nb_steps_warm_up:
            self._update()

            new_target_weights = polyak_averaging(
                self.value_net.get_weights(),
                self.target_value_net.get_weights(), self.polyak)
            self.target_value_net.set_weights(new_target_weights)

    def _update(self):
        observations, actions, rewards, terminals, next_observations = self.memory.sample_batch(
        )

        # soft Q target: r + gamma * (1 - done) * V'(s')
        target_q_value = rewards + self.gamma * self.target_value_net.predict(
            next_observations) * (1.0 - terminals)

        soft_actions, log_probs = self.evaluate(observations)

        soft_q_value1 = self.soft_q_net1.predict([observations, soft_actions])
        soft_q_value2 = self.soft_q_net2.predict([observations, soft_actions])

        target_value = tf.minimum(soft_q_value1,
                                  soft_q_value2) - self.alpha * log_probs

        # Update soft Q network
        self.soft_q_net1.fit([observations, actions],
                             target_q_value,
                             verbose=0)
        self.soft_q_net2.fit([observations, actions],
                             target_q_value,
                             verbose=0)

        # Update value network
        self.value_net.fit(observations, target_value, verbose=0)

        # Update policy network
        with tf.GradientTape() as tape:
            tape.watch(self.policy_net.trainable_weights)

            soft_actions, log_probs = self.evaluate(observations)

            soft_q_value = self.soft_q_net1([observations, soft_actions])

            loss = -tf.reduce_mean(soft_q_value - self.alpha * log_probs)

        actor_grads = tape.gradient(loss, self.policy_net.trainable_weights)
        self.policy_net.optimizer.apply_gradients(
            zip(actor_grads, self.policy_net.trainable_weights))

    def evaluate(self, observations):
        mean, log_std = self.policy_net(observations)

        std = tf.math.exp(log_std)
        z = mean + tf.random.normal(tf.shape(mean)) * std
        action = tf.math.tanh(z)
        log_prob = gaussian_likelihood(z, mean, log_std)
        log_prob -= tf.math.reduce_sum(tf.math.log(1 - action**2 + 1e-6),
                                       axis=1)

        action = tf.cast(action, dtype=tf.float64)

        return action, log_prob

    def _build_value_network(self):
        observation_shape = self.observation_space.shape

        layers = tf.keras.layers

        model = tf.keras.models.Sequential([
            layers.Dense(32, activation='relu', input_shape=observation_shape),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)
        ])
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(lr=self.value_network_lr))

        return model

    def _build_soft_q_network(self):
        observation_shape = self.observation_space.shape
        nb_actions = self.action_space.shape[0]

        layers = tf.keras.layers
        observation_tensor = layers.Input(shape=observation_shape)
        action_tensor = layers.Input(shape=(nb_actions, ))

        y = layers.Concatenate()([observation_tensor, action_tensor])
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(1)(y)

        model = tf.keras.models.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(lr=self.soft_q_network_lr),
        )

        return model

    def _build_policy_network(self):
        observation_shape = self.observation_space.shape
        nb_actions = self.action_space.shape[0]

        layers = tf.keras.layers
        observation_tensor = layers.Input(shape=observation_shape)
        y = layers.Dense(32, activation='relu')(observation_tensor)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)

        mean = layers.Dense(nb_actions, activation='tanh')(y)
        log_std = layers.Dense(nb_actions, activation='tanh')(y)

        log_std = self.log_std_min + 0.5 * (self.log_std_max -
                                            self.log_std_min) * (log_std + 1)

        model = tf.keras.models.Model(inputs=observation_tensor,
                                      outputs=[mean, log_std])
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(
                          self.policy_network_lr))

        return model

    def switch_mode(self, training=None):
        """
        :param training:  agent所处的模式,
            training=True: 训练模式
            training=False: 测试模式
        """
        if training is None:
            self.training = ~self.training
        else:
            self.training = training

        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")
Example #8
class TD3Agent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 sigma=0.3,
                 polyak=0.995,
                 pi_lr=0.001,
                 q_lr=0.001,
                 batch_size=100,
                 action_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 policy_delay=2,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.sigma = sigma
        self.polyak = polyak
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.action_noise = action_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        
        self.action_space = action_space
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training
        
        self.memory = MemoryNP(
            capacity=10000,
            observation_shape=self.observation_shape,
            action_shape=self.action_space.shape
        )
        
        self.actor_model, self.critic_model1, self.critic_model2 = self._build_network()
        
        self.target_actor_model, self.target_critic_model1, self.target_critic_model2 = self._build_network()
        
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model1.set_weights(self.critic_model1.get_weights())
        self.target_critic_model2.set_weights(self.critic_model2.get_weights())
        
        self.step_count = 0
    
    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions,), dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(shape=self.observation_shape, dtype=tf.float64)
        
        # build the actor model
        y = tf.keras.layers.Dense(32, activation='relu')(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)
        
        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.pi_lr), loss='mse')
        
        # build the first critic model
        critic_model1 = self._build_critic_network(observation_tensor, action_tensor)
        # build the second critic model
        critic_model2 = self._build_critic_network(observation_tensor, action_tensor)
        
        return actor_model, critic_model1, critic_model2
    
    def _build_critic_network(self, observation_tensor, action_tensor):
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)
        
        critic_model = tf.keras.Model(inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.q_lr), loss='mse')
        return critic_model
    
    def forward(self, observation):
        self.step_count += 1
        
        if self.step_count < self.nb_steps_warm_up:
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                action = action + np.clip(
                    np.random.normal(0.0, self.action_noise, self.nb_actions),
                    -self.noise_clip,
                    self.noise_clip
                )
            return action
    
    def backward(self, observation, action, reward, terminal, next_observation):
        self.memory.store_transition(observation, action, reward, terminal, next_observation)
        
        if self.step_count < self.nb_steps_warm_up:
            return
        else:
            self._update()
    
    def _update(self):
        observations, actions, rewards, terminals, next_observations = self.memory.sample_batch()

        self._update_critic(observations, actions, rewards, terminals, next_observations)

        # delayed policy and target-network updates
        if self.step_count % self.policy_delay == 0:
            self._update_actor(observations)

            # update the critics' target networks
            new_target_critic_weights_list = polyak_averaging(
                self.critic_model1.get_weights(), self.target_critic_model1.get_weights(), self.polyak)
            self.target_critic_model1.set_weights(new_target_critic_weights_list)
            new_target_critic_weights_list = polyak_averaging(
                self.critic_model2.get_weights(), self.target_critic_model2.get_weights(), self.polyak)
            self.target_critic_model2.set_weights(new_target_critic_weights_list)

            # update the actor's target network
            new_target_actor_weights_list = polyak_averaging(
                self.actor_model.get_weights(), self.target_actor_model.get_weights(), self.polyak)
            self.target_actor_model.set_weights(new_target_actor_weights_list)
    
    def _update_critic(self, observations, actions, rewards, terminals, next_observations):
        # target policy smoothing: add clipped noise to the target actions
        target_actions = self.target_actor_model(next_observations)
        noise = tf.clip_by_value(
            tf.random.normal(mean=0.0, stddev=self.target_noise,
                             shape=tf.shape(target_actions), dtype=tf.float64),
            -self.noise_clip, self.noise_clip
        )
        target_actions = tf.clip_by_value(target_actions + noise, -1.0, 1.0)

        # clipped double-Q target: r + gamma * (1 - done) * min(Q1', Q2')
        q_values_next1 = self.target_critic_model1([next_observations, target_actions])
        q_values_next2 = self.target_critic_model2([next_observations, target_actions])
        q_values_next = tf.minimum(q_values_next1, q_values_next2)
        target_q_values = rewards + self.gamma * q_values_next * (1.0 - terminals)

        self.critic_model1.fit([observations, actions], target_q_values, verbose=0)
        self.critic_model2.fit([observations, actions], target_q_values, verbose=0)
    
    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model1([observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)
        
        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(zip(actor_grads, self.actor_model.trainable_weights))
    
    def switch_mode(self, training=None):
        """
        :param training:  agent所处的模式,
            training=True: 训练模式
            training=False: 测试模式
        """
        if training is None:
            self.training = ~self.training
        else:
            self.training = training
        
        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")