def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
) -> Tuple[float, List[GymImg]]:
    """Run the game with the given agent for num_episode episodes (each
    spanning get_eval_lives() lives) and return the average reward together
    with the captured frames (frames are only collected when render=True)."""
    self.__env = self.__env_eval
    ep_rewards = []
    frames = []
    for _ in range(self.get_eval_lives() * num_episode):
        observations, ep_reward, _frames = self.reset(render=render)  # initialise the evaluation environment
        for obs in observations:
            obs_queue.append(obs)
        if render:
            frames.extend(_frames)
        done = False
        while not done:  # play until the episode ends
            state = self.make_state(obs_queue).to(self.__device).float()
            action = agent.run(state)  # ask the agent for its next action
            obs, reward, done = self.step(action)  # next observation, reward, and terminal flag
            ep_reward += reward
            obs_queue.append(obs)
            if render:
                frames.append(self.get_frame())
        ep_rewards.append(ep_reward)  # record the reward collected in this life
    self.__env = self.__env_train
    return np.sum(ep_rewards) / num_episode, frames  # average reward per episode
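# Illustrative sketch only, not the project's actual implementation: one of
# the loop variants below notes that obs_queue holds the last five
# preprocessed observations and that only the latest four form the state.
# The frame shape (1, 84, 84) and the helper name make_state_sketch are
# assumptions made for this example.
import torch
from collections import deque

def make_state_sketch(obs_queue: deque) -> torch.Tensor:
    # Keep only the four most recent observations and stack them along the
    # channel dimension; the oldest frame in the queue is dropped.
    frames = list(obs_queue)[-4:]               # each assumed to be (1, 84, 84)
    return torch.cat(frames, dim=0).unsqueeze(0)  # shape: (1, 4, 84, 84)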
done = True
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False, unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # warm-up: only start learning once enough transitions are stored

    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # add the transition to the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # learn from previously stored transitions

    if step % TARGET_UPDATE == 0:
        agent.sync()  # copy the policy network weights into the target network

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
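# Minimal sketch of the replay-memory interface used above. The real memory
# class is not shown here, so the internals (capacity, eviction policy) are
# assumptions; the folded-state convention (folded[:4] is the state,
# folded[1:] the next state) is taken from a comment in one of the loop
# variants below.
import random
from collections import deque

class ReplayMemorySketch:
    def __init__(self, capacity: int = 100_000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted automatically

    def push(self, folded_state, action, reward, done):
        self.buffer.append((folded_state, action, reward, done))

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        folded, actions, rewards, dones = zip(*batch)
        # For each stored tensor: state = folded[:4], next_state = folded[1:]
        return folded, actions, rewards, dones

    def __len__(self):
        return len(self.buffer)  # drives the len(memory) > WARM_STEPS warm-up check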
done = True
preaction = 0
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False, unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS

    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training, preaction)
    preaction = action
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)

    if step % TARGET_UPDATE == 0:
        agent.sync()

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
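# Sketch of the hard target-network update performed by agent.sync() above
# ("copy the policy network weights into the target network"). The class and
# attribute names policy_net / target_net are assumptions for illustration.
import torch.nn as nn

class AgentSyncSketch:
    def __init__(self, policy_net: nn.Module, target_net: nn.Module):
        self.policy_net = policy_net
        self.target_net = target_net

    def sync(self):
        # Hard update: overwrite the target network with the current policy
        # weights so the TD targets stay fixed between syncs.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()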
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False, unit="b")  # progress bar
for step in progressive:
    if done:  # start a new episode
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # len(memory) grows with step; store WARM_STEPS (50_000) transitions before training starts

    state = env.make_state(obs_queue).to(device).float()  # the oldest observation is dropped when building the state
    action = agent.run(state, training)  # epsilon-greedy: random action with probability eps, otherwise chosen by the Q-network
    obs, reward, done = env.step(action)  # execute action a, obtaining the reward and the next state s'
    obs_queue.append(obs)  # enqueue the new observation
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store (s, a, r) in the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # train the policy network

    if step % TARGET_UPDATE == 0:
        agent.sync()  # copy the policy network weights into the target network

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)  # let the agent play a few episodes and record the reward
        with open("rewards.txt", "a") as fp:  # log rewards to a file for plotting later
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)  # directory for this evaluation's frames
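# Sketch of the epsilon-greedy selection described in the comments above.
# EPS_START / EPS_END / EPS_DECAY appear in one of the variants below; the
# linear annealing schedule, the function name, and the argument layout are
# assumptions, not the project's actual agent.run.
import random
import torch
import torch.nn as nn

def run_sketch(policy_net: nn.Module, state: torch.Tensor, step: int,
               n_actions: int, training: bool,
               eps_start: float = 1.0, eps_end: float = 0.1,
               eps_decay: int = 1_000_000) -> int:
    # Linearly anneal epsilon from eps_start to eps_end over eps_decay steps.
    eps = max(eps_end, eps_start - (eps_start - eps_end) * step / eps_decay)
    if training and random.random() < eps:
        return random.randrange(n_actions)  # explore
    with torch.no_grad():
        return int(policy_net(state).argmax(dim=1).item())  # exploit: greedy w.r.t. Q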
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False, unit="b")  # progress bar
for step in progressive:
    if done:  # the previous episode ended, so reset the environment
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS

    state = env.make_state(obs_queue).to(device).float()  # build the state from the 5-observation queue (only the latest 4 are used)
    action = agent.run(state, training)  # get the current action from the policy network
    obs, reward, done = env.step(action)  # run one environment step
    obs_queue.append(obs)  # the head is popped; the queue keeps the latest 4 observations plus the new one
    memory.store(env.make_folded_state(obs_queue), action, reward, done)  # folded_state: [:4] is the state, [1:] is the next state

    if step % POLICY_UPDATE == 0 and training:  # once training has started, update the policy network every POLICY_UPDATE steps
        agent.learn(memory, step)

    if step % TARGET_UPDATE == 0:  # update the target network every TARGET_UPDATE steps
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # evaluate every EVALUATE_FREQ steps
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
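# Conceptual sketch of the one-step DQN update that agent.learn performs:
# sample a batch, bootstrap the TD target from the target network, and take
# one gradient step. The names policy_net / target_net / optimizer, the
# sample() layout noted below, and the smooth L1 loss are assumptions; the
# project's learn() may differ (one variant above even passes step instead
# of a batch size).
import torch
import torch.nn.functional as F

def learn_sketch(policy_net, target_net, optimizer, memory, batch_size, gamma=0.99):
    # Assumed sample() layout: states (B,4,84,84), actions (B,1) int64,
    # rewards (B,1), next_states (B,4,84,84), dones (B,1) in {0,1}.
    states, actions, rewards, next_states, dones = memory.sample(batch_size)
    q_values = policy_net(states).gather(1, actions)              # Q(s, a)
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1, keepdim=True).values
        targets = rewards + gamma * next_q * (1.0 - dones)        # r + gamma * max_a' Q_target(s', a')
    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return float(loss.item())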
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False, unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS

    state = env.make_state(obs_queue).to(device).float()  # current state S
    action, value_this = agent.run(state, training)  # from the state and the DQN, get the Q-value and the action A
    if stable and COUNT_SHADE:
        action_queue.append(action)
    obs, reward, done = env.step(action)  # execute the action, obtaining R and S'
    obs_queue.append(obs)
    if version.find('PER') != -1:
        state_next = env.make_state(obs_queue).to(device).float()
        value_next = agent.get_target_value(state_next)
        td_error = GAMMA * value_next + reward - value_this
        memory.push(env.make_folded_state(obs_queue), action, reward, done,
                    td_error)  # how to encode the TD error?
    else:
        memory.push(env.make_folded_state(obs_queue), action, reward, done)
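# Sketch of the prioritized-replay bookkeeping implied by the extra td_error
# argument above: new transitions are stored with priority (|TD error| + eps)
# raised to alpha, and sampling is proportional to priority. A flat NumPy
# array is used here for clarity; the class name, alpha exponent, and the
# absence of a sum tree are assumptions about the real memory class.
import numpy as np

class PrioritizedMemorySketch:
    def __init__(self, capacity: int = 100_000, alpha: float = 0.6, eps: float = 1e-2):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.data = []
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def push(self, folded_state, action, reward, done, td_error):
        priority = (abs(float(td_error)) + self.eps) ** self.alpha
        if len(self.data) < self.capacity:
            self.data.append((folded_state, action, reward, done))
        else:
            self.data[self.pos] = (folded_state, action, reward, done)
        self.priorities[self.pos] = priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size: int):
        probs = self.priorities[:len(self.data)]
        probs = probs / probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        return [self.data[i] for i in idx], idx, probs[idx]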
    EPS_START, EPS_END, EPS_DECAY, "./model_weights_b"  ####*************
)
memory = choosememory(c)
Reward = []

for step in tqdm(range(MAX_STEPS), ncols=50, leave=False, unit="b"):
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS

    state = env.make_state(obs_queue).to(device).float()
    action, value_this = agent.run(state, training)  # the selected action and its Q-value
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    # memory.push(env.make_folded_state(obs_queue), action, reward, done)
    # pynvml.nvmlInit()
    # handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    # print(meminfo.used)
    if c == 1:  # with PER, the TD error sets the priority of the new transition
        state_next = env.make_state(obs_queue).to(device).float()
        value_next = agent.get_target_value(state_next)
        td_error = abs(GAMMA * value_next + reward - value_this)  # the absolute TD error is used as the priority
        memory.push(env.make_folded_state(obs_queue), action, reward, done, td_error)
    else:
        memory.push(env.make_folded_state(obs_queue), action, reward, done)
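# Sketch of what agent.get_target_value above might compute: the maximum
# target-network Q-value of the next state, used to form the one-step TD
# target GAMMA * value_next + reward. The parameter name target_net is an
# assumption for illustration.
import torch
import torch.nn as nn

def get_target_value_sketch(target_net: nn.Module, state: torch.Tensor) -> float:
    with torch.no_grad():
        return float(target_net(state).max(dim=1).values.item())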
progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,  # width of the progress bar
    leave=False,
    unit="b")
for step in progressive:
    if done:  # the previous episode ended, start a new one
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)  # put the initial observations into obs_queue
        # print(len(obs_queue))

    training = len(memory) > WARM_STEPS  # check whether memory holds more than WARM_STEPS transitions

    state = env.make_state(obs_queue).to(device).float()  # turn the observations into a state
    action = agent.run(state, training)  # epsilon-greedy action selection
    obs, reward, done = env.step(action)  # execute the action, obtaining the reward and the next observation
    # print(len(obs))  # obs has length 1
    obs_queue.append(obs)  # append the new observation to obs_queue
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store the transition

    if step % POLICY_UPDATE == 0 and training:  # train every POLICY_UPDATE = 4 steps once memory exceeds WARM_STEPS = 50_000
        agent.learn(memory, BATCH_SIZE)  # sample BATCH_SIZE = 32 transitions from memory for the update

    if step % TARGET_UPDATE == 0:  # every TARGET_UPDATE = 10_000 steps, copy the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # log the current performance
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
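# The loops above append lines of the form "<eval index> <step> <avg reward>"
# to rewards.txt for later plotting. A small illustrative script (matplotlib
# usage and output filename are assumptions, not part of the project) for
# turning that log into a learning curve:
import matplotlib.pyplot as plt

steps, rewards = [], []
with open("rewards.txt") as fp:
    for line in fp:
        _, step_str, reward_str = line.split()
        steps.append(int(step_str))
        rewards.append(float(reward_str))

plt.plot(steps, rewards)
plt.xlabel("environment step")
plt.ylabel("average evaluation reward")
plt.savefig("reward_curve.png")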