                   total=MAX_STEPS, ncols=50, leave=False, unit="b")

for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # the first WARM_STEPS steps are warm-up
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # add the transition to the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # learn from previously stored transitions
    if step % TARGET_UPDATE == 0:
        agent.sync()  # copy the policy network weights into the target network
    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step // EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step // EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
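# Illustrative sketch (not from the original code): one plausible way agent.run
# could pick actions given the training flag above. During warm-up (training is
# False) it acts uniformly at random; afterwards it is epsilon-greedy over the
# policy network. The attribute names eps, action_dim and policy are assumptions.
import random
import torch

def run_sketch(agent, state, training):
    if (not training) or random.random() < agent.eps:
        return random.randrange(agent.action_dim)  # explore / fill the warm-up memory
    with torch.no_grad():
        # exploit: pick the action with the highest predicted Q-value
        return int(agent.policy(state).argmax(dim=1).item())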
    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # current state S
    action, value_this = agent.run(state, training)  # from the state and the DQN, get the Q-value and the action A
    if stable and COUNT_SHADE:
        action_queue.append(action)
    obs, reward, done = env.step(action)  # execute the action, getting R and S'
    obs_queue.append(obs)

    if version.find('PER') != -1:
        state_next = env.make_state(obs_queue).to(device).float()
        value_next = agent.get_target_value(state_next)
        td_error = reward + GAMMA * value_next - value_this
        memory.push(env.make_folded_state(obs_queue), action, reward, done,
                    td_error)  # how should the TD error be encoded?
    else:
        memory.push(env.make_folded_state(obs_queue), action, reward, done)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)
        if step > STABLE_STEPS and stable:
            agent.stable_learn(env.make_folded_state(obs_queue), action, reward, done)
    if step % TARGET_UPDATE == 0:
        agent.sync()
    if step % EVALUATE_FREQ == 0:
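# The question above ("how should the TD error be encoded?") is usually answered
# with proportional prioritization: priority = (|td_error| + eps)^alpha, and
# transitions are sampled with probability proportional to their priority. A
# minimal list-based sketch follows; it is illustrative only (the repo's memory
# class is not shown here), and a real implementation would use a sum tree for
# O(log n) sampling.
import random

class PERMemorySketch:
    def __init__(self, capacity, alpha=0.6, eps=1e-2):
        self.capacity = capacity
        self.alpha = alpha
        self.eps = eps
        self.data = []        # stored transitions
        self.priorities = []  # one priority per transition

    def push(self, folded_state, action, reward, done, td_error):
        priority = (abs(float(td_error)) + self.eps) ** self.alpha
        if len(self.data) >= self.capacity:  # drop the oldest transition
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append((folded_state, action, reward, done))
        self.priorities.append(priority)

    def sample(self, batch_size):
        # indices drawn with probability proportional to their priority
        idxs = random.choices(range(len(self.data)), weights=self.priorities, k=batch_size)
        return [self.data[i] for i in idxs], idxs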
                   leave=False, unit="b")  # progress bar

for step in progressive:
    if done:  # done marks the end of an episode, so the environment must be reset
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # build the state from the 5-observation queue (only the last 4 obs are used)
    action = agent.run(state, training)  # pick the current action from the policy network
    obs, reward, done = env.step(action)  # run one environment step
    obs_queue.append(obs)  # the head is popped, leaving the last 4 old obs plus the new one
    memory.store(env.make_folded_state(obs_queue), action, reward, done)  # folded_state: [:4] is state, [1:] is next_state

    if step % POLICY_UPDATE == 0 and training:  # while training, update the policy network every POLICY_UPDATE steps
        agent.learn(memory, step)
    if step % TARGET_UPDATE == 0:  # update the target network every TARGET_UPDATE steps
        agent.sync()
    if step % EVALUATE_FREQ == 0:  # evaluate every EVALUATE_FREQ steps
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step // EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")  # the learning curve can be plotted from rewards.txt
        if RENDER:  # if RENDER is set, save the evaluation frames
            prefix = f"eval_{step // EVALUATE_FREQ:03d}"
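# Sketch of the queue-to-tensor mapping described in the comments above: with
# frames o_{t-4} ... o_t in obs_queue, the state stacks the newest 4 frames and
# the folded state stacks all 5, so folded[:4] is S and folded[1:] is S'.
# The (84, 84) frame shape and torch.stack layout are assumptions for illustration.
import torch

def make_state_sketch(obs_queue):
    # obs_queue: deque of 5 tensors, each of shape (84, 84)
    return torch.stack(list(obs_queue)[1:], dim=0).unsqueeze(0)  # (1, 4, 84, 84)

def make_folded_state_sketch(obs_queue):
    return torch.stack(list(obs_queue), dim=0)  # (5, 84, 84): [:4] = S, [1:] = S'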
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                   ncols=50, leave=False, unit="b")

for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done,
                agent.getexp(env.make_folded_state(obs_queue), action, reward, done))
    # agent.update_memory(memory)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)
    if step % TARGET_UPDATE == 0:
        agent.sync()
    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step // EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step // EVALUATE_FREQ:03d}"
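# Hedged guess at what agent.getexp computes for the push above: the absolute
# one-step TD error of the new transition, used as its initial priority. The
# folded-state slicing follows the [:4] = S, [1:] = S' convention noted earlier;
# the policy / target attribute names and the gamma default are assumptions,
# not the repo's actual API.
import torch

def getexp_sketch(agent, folded_state, action, reward, done, gamma=0.99):
    with torch.no_grad():
        state = folded_state[:4].unsqueeze(0).float()       # S  -> (1, 4, 84, 84)
        next_state = folded_state[1:].unsqueeze(0).float()  # S' -> (1, 4, 84, 84)
        q_sa = agent.policy(state)[0, action]
        q_next = agent.target(next_state).max(dim=1).values[0]
        target = reward + gamma * q_next * (1.0 - float(done))
        return (target - q_sa).abs().item()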