# --- Setup: actor + two critics, each with a frozen target network ---
# (two critics with separate targets — presumably a clipped double-Q /
# TD3-style scheme; confirm against the update rule elsewhere in the file)

# Environment dimensions and action bound (Gym-style continuous env).
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# Small positive constant — presumably used to avoid log(0)/div-by-zero
# in the loss computation; TODO confirm at the use site.
min_val = paddle.to_tensor(1e-7).astype('float32')

# Actor and its target network; the target starts as an exact copy and
# is kept in eval mode (no dropout/batch-norm updates).
actor = Actor(state_dim, action_dim, max_action)
target_actor = Actor(state_dim, action_dim, max_action)
target_actor.eval()
target_actor.load_dict(actor.state_dict())
actor_optimizer = paddle.optimizer.RMSProp(parameters=actor.parameters(),
                                           learning_rate=learning_rate)

# First critic and its target copy.
critic_1 = Critic(state_dim, action_dim)
target_critic_1 = Critic(state_dim, action_dim)
target_critic_1.eval()
target_critic_1.load_dict(critic_1.state_dict())

# Second critic and its target copy.
critic_2 = Critic(state_dim, action_dim)
target_critic_2 = Critic(state_dim, action_dim)
target_critic_2.eval()
target_critic_2.load_dict(critic_2.state_dict())

# One optimizer per critic, sharing the same learning rate.
critic_1_optimizer = paddle.optimizer.RMSProp(parameters=critic_1.parameters(),
                                              learning_rate=learning_rate)
critic_2_optimizer = paddle.optimizer.RMSProp(parameters=critic_2.parameters(),
                                              learning_rate=learning_rate)

# Experience replay buffer.
rpm = ReplayMemory(memory_size)


def train():
    """Run one training episode.

    NOTE(review): this definition is truncated in the visible chunk —
    only its first two statements are shown; the rest of the body is
    not reconstructed here.
    """
    global epoch
    total_reward = 0
# --- Setup variant: actor + Q network + state-value critic with target ---
# NOTE(review): `state_dim` is read below but never assigned in this
# chunk; its assignment presumably sits just before the visible region.

action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# Small positive constant — presumably guards log(0)/div-by-zero in the
# loss computation; TODO confirm at the use site.
min_val = paddle.to_tensor(1e-7).astype('float32')

# Policy network and its optimizer.
actor = Actor(state_dim, action_dim, max_action)
actor_optimizer = paddle.optimizer.RMSProp(parameters=actor.parameters(),
                                           learning_rate=learning_rate)

# Action-value network Q(s, a).
Q_net = Q(state_dim, action_dim)
Q_optimizer = paddle.optimizer.RMSProp(parameters=Q_net.parameters(),
                                       learning_rate=learning_rate)

# State-value critic V(s) with a frozen target copy (eval mode, weights
# initialised from the online critic).
critic = Critic(state_dim)
target_critic = Critic(state_dim)
target_critic.eval()
target_critic.load_dict(critic.state_dict())
critic_optimizer = paddle.optimizer.RMSProp(parameters=critic.parameters(),
                                            learning_rate=learning_rate)

# Experience replay buffer.
rpm = ReplayMemory(memory_size)


def train():
    """Run one training episode by interacting with the environment.

    NOTE(review): the loop body is truncated at the chunk boundary —
    everything after `env.render()` is not visible and is not
    reconstructed here.
    """
    global epoch
    total_reward = 0
    # Reset the game state to start a new episode.
    state = env.reset()
    while True:
        action = actor.select_action(state)
        next_state, reward, done, info = env.step(action)
        env.render()