class DQNAgent(Agent):
    def __init__(self,
                 observation_space: Box,
                 action_space: Discrete,
                 train_policy=None,
                 test_policy=None,
                 lr=3e-4,
                 gamma=0.99,
                 memory_size=10000,
                 target_model_update=0.99,
                 training=True,
                 enable_double_dqn=True,
                 dueling_type=None):
        super().__init__(observation_space, action_space)
        # Learning rate
        self.lr = lr
        # Discount factor
        self.gamma = gamma
        # Target-model update schedule: `target_model_update < 1` means soft
        # (Polyak) updates, `target_model_update >= 1` means hard updates.
        self.target_model_update = target_model_update
        # Policy used during training
        if train_policy is None:
            self.train_policy = DecayEpsGreedyQPolicy()
        else:
            self.train_policy = train_policy
        # Policy used during testing
        if test_policy is None:
            self.test_policy = GreedyQPolicy()
        else:
            self.test_policy = test_policy
        # Marks the agent's mode: True = training, False = testing.
        # The active policy is selected according to this flag.
        self.training = training
        self.policy = None
        self.switch_mode(self.training)
        # Whether to use double DQN
        self.enable_double_dqn = enable_double_dqn
        # Whether to use dueling DQN ('avg', 'max', 'naive' or None)
        self.dueling_type = dueling_type
        # Number of discrete actions
        self.nb_actions = action_space.n
        # A discrete action can be represented by a single value
        self.action_shape = (1, )
        # Shape of the observations
        self.observation_shape = observation_space.shape
        # Replay buffer
        self.memory = MemoryNP(capacity=memory_size,
                               action_shape=self.action_shape,
                               observation_shape=self.observation_shape)
        # Build the Q network and the target network, and initialise their weights
        self.model, self.target_model = self.build_all_models()
        # Step counter
        self.step_count = 0

    def build_all_models(self):
        model = self.build_q_net()
        target_model = self.build_q_net()
        model = self.use_dueling_network(model)
        target_model = self.use_dueling_network(target_model)
        target_model.set_weights(model.get_weights())
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
                      metrics=['mse'],
                      loss=tf.keras.losses.mean_squared_error)
        return model, target_model

    def use_dueling_network(self, model):
        layer = model.layers[-2]
        y = tf.keras.layers.Dense(self.nb_actions + 1,
                                  activation='linear')(layer.output)
        if self.dueling_type == 'avg':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] -
                tf.reduce_mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(self.nb_actions, ))(y)
        elif self.dueling_type == 'max':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] -
                tf.reduce_max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(self.nb_actions, ))(y)
        elif self.dueling_type == 'naive':
            output_layer = tf.keras.layers.Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:],
                output_shape=(self.nb_actions, ))(y)
        else:
            output_layer = model.layers[-1].output
        model = tf.keras.models.Model(inputs=model.input, outputs=output_layer)
        return model

    def build_q_net(self):
        '''
        Build the deep Q network. To change the capacity of the network,
        override this method, keeping the input and output shapes unchanged.
        :return: q_net
        '''
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128,
                                  activation='relu',
                                  input_shape=self.observation_shape),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(self.nb_actions, activation='linear')
        ])
        return model

    def switch_mode(self, training=None):
        '''
        Switch the action policy.
        :param training: the agent's mode,
            training=True: training mode
            training=False: testing mode
        '''
        if training is None:
            self.training = not self.training
        else:
            self.training = training
        if self.training:
            self.policy = self.train_policy
            print("Switch to train mode.")
        else:
            self.policy = self.test_policy
            print("Switch to test mode.")

    def forward(self, observation):
        '''
        Select an action given an observation.
        :param observation: the observation
        :return: the selected action
        '''
        observation = np.expand_dims(observation, axis=0)
        q_values = self.model.predict(observation).squeeze(0)
        action = self.policy.select_action(q_values)
        return action

    def backward(self, observation, action, reward, terminal, next_observation):
        '''
        Each interaction with the environment yields one MDP transition tuple.
        Store it in the replay buffer, then sample a batch and train.
        :param observation:
        :param action:
        :param reward:
        :param terminal:
        :param next_observation:
        :return:
        '''
        # Store the transition
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)
        # Update the Q network
        if self.enable_double_dqn:
            self.update_model_double_dqn()
        else:
            self.update_model()
        # Update the target Q network
        self.update_target_model()

    def update_model(self):
        # Sample a batch from the replay buffer
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()
        # Compute the target Q values
        target_q_values = np.max(self.target_model.predict(next_observations),
                                 axis=1,
                                 keepdims=True)
        actions = tf.keras.utils.to_categorical(
            actions, num_classes=self.nb_actions).astype(bool)
        q_values = self.model.predict(observations)
        q_values[actions, np.newaxis] = \
            rewards + self.gamma * target_q_values * (~terminals)
        # Update the Q network
        self.model.fit(observations, q_values, verbose=0)

    def update_model_double_dqn(self):
        # Sample a batch from the replay buffer
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()
        # Compute the target Q values: select the greedy next action with the
        # online network, evaluate it with the target network
        q_values = self.model.predict(observations)
        q_values_next = self.model.predict(next_observations)
        target_action = tf.keras.utils.to_categorical(
            np.argmax(q_values_next, axis=1),
            num_classes=self.nb_actions).astype(bool)
        target_q_values = self.target_model.predict(
            next_observations)[target_action].reshape(-1, 1)
        actions = tf.keras.utils.to_categorical(
            actions, num_classes=self.nb_actions).astype(bool)
        q_values[actions, np.newaxis] = \
            rewards + self.gamma * target_q_values * (~terminals)
        # Update the Q network
        self.model.fit(observations, q_values, verbose=0)

    def update_target_model(self):
        if self.target_model_update < 1.:
            # Soft update: w'(t+1) = w'(t) * lambda + w(t) * (1 - lambda)
            new_target_model_weights = polyak_averaging(
                weights_list=self.model.get_weights(),
                target_weights_list=self.target_model.get_weights(),
                polyak=self.target_model_update)
            self.target_model.set_weights(new_target_model_weights)
        else:
            # Hard update: w'(t+1) = w(t)
            self.step_count += 1
            if self.step_count % int(self.target_model_update) == 0:
                self.target_model.set_weights(self.model.get_weights())
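The DQNAgent above is driven entirely through forward (action selection) and backward (storing the transition and training). The loop below is a minimal sketch of that interaction, assuming the pre-0.26 Gym API and the CartPole-v1 environment; the episode budget and the choice of dueling_type are illustrative assumptions, not taken from the listing.

# Minimal training-loop sketch for DQNAgent (assumed environment: CartPole-v1,
# pre-0.26 Gym API; episode count is arbitrary).
import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(observation_space=env.observation_space,
                 action_space=env.action_space,
                 enable_double_dqn=True,
                 dueling_type='avg')

for episode in range(200):
    observation = env.reset()
    episode_reward, done = 0.0, False
    while not done:
        action = agent.forward(observation)        # epsilon-greedy in training mode
        next_observation, reward, done, _ = env.step(action)
        agent.backward(observation, action, reward, done, next_observation)
        observation = next_observation
        episode_reward += reward
    print(episode, episode_reward)

agent.switch_mode(training=False)                  # greedy policy for evaluation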
class DDPGAgent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 polyak=0.995,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.polyak = polyak
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training
        self.memory = MemoryNP(capacity=10000,
                               observation_shape=self.observation_shape,
                               action_shape=self.action_space.shape)
        self.actor_model, self.critic_model = self._build_network()
        self.target_actor_model, self.target_critic_model = self._build_network()
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model.set_weights(self.critic_model.get_weights())
        self.step_count = 0

    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions, ),
                                              dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(
            shape=self.observation_shape, dtype=tf.float64)
        # Build the actor model
        y = tf.keras.layers.Dense(32, activation='relu')(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)
        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
                            loss='mse')
        # Build the critic model
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)
        critic_model = tf.keras.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
                             loss='mse')
        return actor_model, critic_model

    def forward(self, observation):
        self.step_count += 1
        if self.step_count < self.nb_steps_warm_up:
            # Explore with random actions until the warm-up phase is over
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                # Add exploration noise during training
                action = action + 0.3 * np.random.random()
            return action

    def backward(self, observation, action, reward, terminal, next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)
        if self.step_count < self.nb_steps_warm_up:
            return
        else:
            self._update()

    def _update(self):
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()
        self._update_critic(observations, actions, rewards, terminals,
                            next_observations)
        self._update_actor(observations)
        # Update the critic target network
        new_target_critic_weights_list = polyak_averaging(
            self.critic_model.get_weights(),
            self.target_critic_model.get_weights(), self.polyak)
        self.target_critic_model.set_weights(new_target_critic_weights_list)
        # Update the actor target network
        new_target_actor_weights_list = polyak_averaging(
            self.actor_model.get_weights(),
            self.target_actor_model.get_weights(), self.polyak)
        self.target_actor_model.set_weights(new_target_actor_weights_list)

    def polyak_averaging(self, weights_list, target_weights_list):
        # Polyak (exponential moving average) update of the target weights
        new_target_weights_list = []
        for weights, target_weights in zip(weights_list, target_weights_list):
            new_target_weights = self.polyak * target_weights + (
                1 - self.polyak) * weights
            new_target_weights_list.append(new_target_weights)
        return new_target_weights_list

    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model(
                [observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)
        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(
            zip(actor_grads, self.actor_model.trainable_weights))

    def _update_critic(self, observations, actions, rewards, terminals,
                       next_observations):
        q_values_next = self.target_critic_model(
            [next_observations, self.actor_model(next_observations)])
        target_q_values = rewards + self.gamma * q_values_next
        self.critic_model.fit([observations, actions],
                              target_q_values,
                              verbose=0)

    def switch_mode(self, training=None):
        """
        :param training: the agent's mode,
            training=True: training mode
            training=False: testing mode
        """
        if training is None:
            self.training = not self.training
        else:
            self.training = training
        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")
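The _update methods of the agents above call a module-level polyak_averaging(weights_list, target_weights_list, polyak) helper that is not shown in this listing. A plausible implementation, mirroring the logic of DDPGAgent.polyak_averaging but with the Polyak coefficient passed explicitly, is sketched below.

# Sketch of the assumed module-level polyak_averaging helper:
# w_target <- polyak * w_target + (1 - polyak) * w_online, per weight array.
def polyak_averaging(weights_list, target_weights_list, polyak):
    new_target_weights_list = []
    for weights, target_weights in zip(weights_list, target_weights_list):
        new_target_weights_list.append(polyak * target_weights +
                                       (1 - polyak) * weights)
    return new_target_weights_list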
class SACAgent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 alpha=0.2,
                 polyak=0.995,
                 value_network_lr=3e-4,
                 soft_q_network_lr=3e-4,
                 policy_network_lr=3e-4,
                 log_std_min=-20,
                 log_std_max=2):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.alpha = alpha
        self.polyak = polyak
        self.nb_steps_warm_up = nb_steps_warm_up
        self.value_network_lr = value_network_lr
        self.soft_q_network_lr = soft_q_network_lr
        self.policy_network_lr = policy_network_lr
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        self.min_entropy = -self.action_space.shape[0]
        self.memory = MemoryNP(capacity=10000,
                               observation_shape=observation_space.shape,
                               action_shape=action_space.shape)
        self.value_net = self._build_value_network()
        self.target_value_net = self._build_value_network()
        self.target_value_net.set_weights(self.value_net.get_weights())
        self.soft_q_net1 = self._build_soft_q_network()
        self.soft_q_net2 = self._build_soft_q_network()
        self.policy_net = self._build_policy_network()
        self.step_count = 0
        self.training = True

    def forward(self, observation):
        self.step_count += 1
        if self.step_count < self.nb_steps_warm_up:
            # Explore with random actions until the warm-up phase is over
            return self.action_space.sample()
        else:
            if observation.ndim == 1:
                observation = np.expand_dims(observation, axis=0)
            mean, log_std = self.policy_net.predict(observation)
            std = tf.math.exp(log_std)
            action = mean + tf.random.normal(tf.shape(mean)) * std
            return action

    def backward(self, observation, action, reward, terminal, next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)
        if self.step_count >= self.nb_steps_warm_up:
            self._update()
            new_target_weights = polyak_averaging(
                self.value_net.get_weights(),
                self.target_value_net.get_weights(), self.polyak)
            self.target_value_net.set_weights(new_target_weights)

    def _update(self):
        observations, actions, rewards, _, next_observations = \
            self.memory.sample_batch()
        target_q_value = rewards + self.gamma * self.target_value_net.predict(
            next_observations)
        soft_actions, log_probs = self.evaluate(observations)
        soft_q_value1 = self.soft_q_net1.predict([observations, soft_actions])
        soft_q_value2 = self.soft_q_net2.predict([observations, soft_actions])
        target_value = tf.minimum(soft_q_value1,
                                  soft_q_value2) - self.alpha * log_probs
        # Update the soft Q networks
        self.soft_q_net1.fit([observations, actions],
                             target_q_value,
                             verbose=0)
        self.soft_q_net2.fit([observations, actions],
                             target_q_value,
                             verbose=0)
        # Update the value network
        self.value_net.fit(observations, target_value, verbose=0)
        # Update the policy network
        with tf.GradientTape() as tape:
            tape.watch(self.policy_net.trainable_weights)
            soft_actions, log_probs = self.evaluate(observations)
            soft_q_value = self.soft_q_net1([observations, soft_actions])
            loss = -tf.reduce_mean(soft_q_value - self.alpha * log_probs)
        actor_grads = tape.gradient(loss, self.policy_net.trainable_weights)
        self.policy_net.optimizer.apply_gradients(
            zip(actor_grads, self.policy_net.trainable_weights))

    def evaluate(self, observations):
        # Sample an action with the reparameterisation trick and squash it with tanh
        mean, log_std = self.policy_net(observations)
        std = tf.math.exp(log_std)
        z = mean + tf.random.normal(tf.shape(mean)) * std
        action = tf.math.tanh(z)
        log_prob = gaussian_likelihood(z, mean, log_std)
        # Correct the log-probability for the tanh squashing
        log_prob -= tf.math.reduce_sum(tf.math.log(1 - action**2 + 1e-6),
                                       axis=1)
        action = tf.cast(action, dtype=tf.float64)
        return action, log_prob

    def _build_value_network(self):
        observation_shape = self.observation_space.shape
        layers = tf.keras.layers
        model = tf.keras.models.Sequential([
            layers.Dense(32, activation='relu',
                         input_shape=observation_shape),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)
        ])
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(
                          learning_rate=self.value_network_lr))
        return model

    def _build_soft_q_network(self):
        observation_shape = self.observation_space.shape
        nb_actions = self.action_space.shape[0]
        layers = tf.keras.layers
        observation_tensor = layers.Input(shape=observation_shape)
        action_tensor = layers.Input(shape=(nb_actions, ))
        y = layers.Concatenate()([observation_tensor, action_tensor])
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(1)(y)
        model = tf.keras.models.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(
                          learning_rate=self.soft_q_network_lr))
        return model

    def _build_policy_network(self):
        observation_shape = self.observation_space.shape
        nb_actions = self.action_space.shape[0]
        layers = tf.keras.layers
        observation_tensor = layers.Input(shape=observation_shape)
        y = layers.Dense(32, activation='relu')(observation_tensor)
        y = layers.Dense(32, activation='relu')(y)
        y = layers.Dense(32, activation='relu')(y)
        mean = layers.Dense(nb_actions, activation='tanh')(y)
        log_std = layers.Dense(nb_actions, activation='tanh')(y)
        # Rescale log_std from [-1, 1] to [log_std_min, log_std_max]
        log_std = self.log_std_min + 0.5 * (self.log_std_max -
                                            self.log_std_min) * (log_std + 1)
        model = tf.keras.models.Model(inputs=observation_tensor,
                                      outputs=[mean, log_std])
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(
                          learning_rate=self.policy_network_lr))
        return model

    def switch_mode(self, training=None):
        """
        :param training: the agent's mode,
            training=True: training mode
            training=False: testing mode
        """
        if training is None:
            self.training = not self.training
        else:
            self.training = training
        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")
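SACAgent.evaluate relies on a gaussian_likelihood(z, mean, log_std) helper that is not defined in this listing. The sketch below assumes the standard diagonal-Gaussian log-likelihood summed over action dimensions; the exact reduction (and whether the result keeps a trailing axis) would need to match what the surrounding code expects.

# Sketch of the assumed gaussian_likelihood helper:
# log p(x) = -0.5 * sum_i [ ((x_i - mu_i) / sigma_i)^2 + 2 * log sigma_i + log(2 * pi) ]
import numpy as np
import tensorflow as tf

def gaussian_likelihood(x, mean, log_std):
    pre_sum = -0.5 * (((x - mean) / (tf.exp(log_std) + 1e-8))**2 +
                      2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)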
class TD3Agent(Agent):
    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 nb_steps_warm_up=2000,
                 sigma=0.3,
                 polyak=0.995,
                 pi_lr=0.001,
                 q_lr=0.001,
                 batch_size=100,
                 action_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 policy_delay=2,
                 training=True):
        super().__init__(observation_space, action_space)
        self.gamma = gamma
        self.sigma = sigma
        self.polyak = polyak
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.action_noise = action_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.action_space = action_space
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warm_up = nb_steps_warm_up
        self.training = training
        self.memory = MemoryNP(capacity=10000,
                               observation_shape=self.observation_shape,
                               action_shape=self.action_space.shape)
        self.actor_model, self.critic_model1, self.critic_model2 = \
            self._build_network()
        self.target_actor_model, self.target_critic_model1, self.target_critic_model2 = \
            self._build_network()
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model1.set_weights(self.critic_model1.get_weights())
        self.target_critic_model2.set_weights(self.critic_model2.get_weights())
        self.step_count = 0

    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions, ),
                                              dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(
            shape=self.observation_shape, dtype=tf.float64)
        # Build the actor model
        y = tf.keras.layers.Dense(32, activation='relu')(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)
        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.pi_lr),
                            loss='mse')
        # Build critic model 1
        critic_model1 = self._build_critic_network(observation_tensor,
                                                   action_tensor)
        # Build critic model 2
        critic_model2 = self._build_critic_network(observation_tensor,
                                                   action_tensor)
        return actor_model, critic_model1, critic_model2

    def _build_critic_network(self, observation_tensor, action_tensor):
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)
        critic_model = tf.keras.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.q_lr),
                             loss='mse')
        return critic_model

    def forward(self, observation):
        self.step_count += 1
        if self.step_count < self.nb_steps_warm_up:
            # Explore with random actions until the warm-up phase is over
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                # Add clipped Gaussian exploration noise during training
                action = action + np.clip(
                    np.random.normal(0.0, self.action_noise, self.nb_actions),
                    -self.noise_clip, self.noise_clip)
            return action

    def backward(self, observation, action, reward, terminal, next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)
        if self.step_count < self.nb_steps_warm_up:
            return
        else:
            self._update()

    def _update(self):
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()
        self._update_critic(observations, actions, rewards, terminals,
                            next_observations)
        self._update_actor(observations)
        if self.step_count % self.policy_delay == 0:
            # Update the target networks of the critics
            new_target_critic_weights_list = polyak_averaging(
                self.critic_model1.get_weights(),
                self.target_critic_model1.get_weights(), self.polyak)
            self.target_critic_model1.set_weights(new_target_critic_weights_list)
            new_target_critic_weights_list = polyak_averaging(
                self.critic_model2.get_weights(),
                self.target_critic_model2.get_weights(), self.polyak)
            self.target_critic_model2.set_weights(new_target_critic_weights_list)
            # Update the target network of the actor
            new_target_actor_weights_list = polyak_averaging(
                self.actor_model.get_weights(),
                self.target_actor_model.get_weights(), self.polyak)
            self.target_actor_model.set_weights(new_target_actor_weights_list)

    def _update_critic(self, observations, actions, rewards, terminals,
                       next_observations):
        batch_size = observations.shape[0]
        q_values_next1 = self.target_critic_model1(
            [next_observations, self.actor_model(next_observations)])
        target1_noise = tf.clip_by_value(
            tf.random.normal(mean=0.0,
                             stddev=self.target_noise,
                             shape=(batch_size, 1),
                             dtype=tf.float64), -self.noise_clip,
            self.noise_clip)
        target_q_values1 = rewards + self.gamma * q_values_next1 + target1_noise
        q_values_next2 = self.target_critic_model2(
            [next_observations, self.actor_model(next_observations)])
        target2_noise = tf.clip_by_value(
            tf.random.normal(mean=0.0,
                             stddev=self.target_noise,
                             shape=(batch_size, 1),
                             dtype=tf.float64), -self.noise_clip,
            self.noise_clip)
        target_q_values2 = rewards + self.gamma * q_values_next2 + target2_noise
        # Clipped double-Q: train both critics on the minimum of the two targets
        target_q_values = tf.minimum(target_q_values1, target_q_values2)
        self.critic_model1.fit([observations, actions],
                               target_q_values,
                               verbose=0)
        self.critic_model2.fit([observations, actions],
                               target_q_values,
                               verbose=0)

    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model1(
                [observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)
        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(
            zip(actor_grads, self.actor_model.trainable_weights))

    def switch_mode(self, training=None):
        """
        :param training: the agent's mode,
            training=True: training mode
            training=False: testing mode
        """
        if training is None:
            self.training = not self.training
        else:
            self.training = training
        if self.training:
            print("Switch to train mode.")
        else:
            print("Switch to test mode.")
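Like DDPGAgent and SACAgent, TD3Agent is driven through the same forward/backward interface, with random exploration until the warm-up phase ends. The loop below is a minimal sketch of that interaction on a continuous-control task, assuming the pre-0.26 Gym API; Pendulum-v0 and the step budget are illustrative assumptions.

# Minimal interaction-loop sketch for the continuous-control agents above
# (TD3Agent here; DDPGAgent and SACAgent expose the same interface).
import gym

env = gym.make('Pendulum-v0')
agent = TD3Agent(observation_space=env.observation_space,
                 action_space=env.action_space,
                 nb_steps_warm_up=2000)

observation = env.reset()
for step in range(50000):
    action = agent.forward(observation)            # random until warm-up ends
    next_observation, reward, done, _ = env.step(action)
    agent.backward(observation, action, reward, done, next_observation)
    observation = env.reset() if done else next_observation

agent.switch_mode(training=False)                  # noise-free actions at test time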