for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # the first WARM_STEPS steps are warm-up (no learning yet)
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward,
                done)  # push the transition into the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # learn from previously stored transitions

    if step % TARGET_UPDATE == 0:  # copy the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")
        agent.save(
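
This loop, and the near-identical variants in the following examples, assumes some shared scaffolding. A minimal sketch of that setup is shown below; the hyperparameter values are illustrative (several match the values quoted in the comments of Example #5), and the commented-out MyEnv/Agent/ReplayMemory constructors are assumptions inferred from the identifiers used in the loops, not code from the original project.

from collections import deque

import torch
from tqdm import tqdm

# Hyperparameters referenced by the loops (values shown here are illustrative).
MAX_STEPS = 50_000_000   # total number of environment steps
WARM_STEPS = 50_000      # collect experience only, without learning, for this many steps
POLICY_UPDATE = 4        # learn from the replay memory every 4 steps
TARGET_UPDATE = 10_000   # sync the target network every 10_000 steps
EVALUATE_FREQ = 100_000  # evaluate (and optionally render) periodically
BATCH_SIZE = 32
RENDER = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# env, agent and memory are project-specific objects; only the identifiers
# used by the loops are reproduced here, and the constructors are placeholders.
# env = MyEnv(device)
# agent = Agent(env.get_action_dim(), device, ...)
# memory = ReplayMemory(...)

obs_queue = deque(maxlen=5)  # rolling window of the last 5 observations
done = True                  # forces env.reset() on the first iteration
progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, unit="step")
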
Example #2
                   unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)

    if step % TARGET_UPDATE == 0:
        agent.sync()

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open(REWARD_PATH, "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")
        agent.save(
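
The memory.push(...) and agent.learn(memory, BATCH_SIZE) pair used in the two loops above implies a plain, uniformly sampled replay buffer. A minimal sketch of such a buffer follows; the class name UniformReplayMemory and the method signatures are assumptions chosen to line up with the calls in the loops, not the project's actual implementation.

import random
from collections import deque


class UniformReplayMemory:
    """Fixed-capacity transition buffer with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped first

    def push(self, folded_state, action, reward, done):
        # The folded state holds 5 stacked frames, so both the state ([:4])
        # and the next state ([1:]) can be sliced out of it when sampling.
        self.buffer.append((folded_state, action, reward, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        # len(memory) > WARM_STEPS is how the loops decide when to start learning.
        return len(self.buffer)
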
Example #3

            # Optional GPU-memory debugging via pynvml:
            # pynvml.nvmlInit()
            # handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # print(meminfo.used)
            if c == 1:  # with PER, use the TD error as the priority when pushing to memory
                state_next = env.make_state(obs_queue).to(device).float()
                value_next = agent.get_target_value(state_next)
                td_error = abs(GAMMA * value_next + reward -
                               value_this)  # the absolute value serves as the TD error
                memory.push(env.make_folded_state(obs_queue), action, reward,
                            done, td_error)
            else:
                memory.push(env.make_folded_state(obs_queue), action, reward,
                            done)
            if step % POLICY_UPDATE == 0 and training:
                agent.learn(memory, BATCH_SIZE, c)  # with PER, the training loss is used to update the priorities

            if step % TARGET_UPDATE == 0:
                agent.sync()

            if step % EVALUATE_FREQ == 0:
                avg_reward, frames = env.evaluate(obs_queue,
                                                  agent,
                                                  render=RENDER)
                with open("rewards.txt", "a") as fp:
                    fp.write(
                        f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n"
                    )
                if RENDER:
                    prefix = f"eval_{step//EVALUATE_FREQ:03d}"
                    os.mkdir(prefix)
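
The snippet above switches to prioritized experience replay (PER) when c == 1: each new transition is pushed with its absolute TD error as the initial priority, and agent.learn later refreshes the priorities from the training loss. A minimal proportional-priority sketch under those assumptions (the class and method names are made up for illustration) is:

import random


class PrioritizedReplayMemory:
    """Simplified, list-based proportional prioritized replay."""

    def __init__(self, capacity, eps=1e-2):
        self.capacity = capacity
        self.eps = eps            # keeps every priority strictly positive
        self.transitions = []
        self.priorities = []
        self.pos = 0

    def push(self, folded_state, action, reward, done, td_error=None):
        # New transitions use the absolute TD error as their priority; if no
        # TD error is given, reuse the current maximum so they get sampled.
        if td_error is not None:
            priority = abs(td_error) + self.eps
        else:
            priority = max(self.priorities, default=1.0)
        item = (folded_state, action, reward, done)
        if len(self.transitions) < self.capacity:
            self.transitions.append(item)
            self.priorities.append(priority)
        else:
            self.transitions[self.pos] = item
            self.priorities[self.pos] = priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        # Sample indices proportionally to their priorities.
        total = sum(self.priorities)
        weights = [p / total for p in self.priorities]
        indices = random.choices(range(len(self.transitions)),
                                 weights=weights, k=batch_size)
        return indices, [self.transitions[i] for i in indices]

    def update_priorities(self, indices, losses):
        # Called after a gradient step to refresh priorities from the loss.
        for i, loss in zip(indices, losses):
            self.priorities[i] = abs(loss) + self.eps
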
Example #4
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # build the state from the length-5 observation queue (only the last 4 obs are used)
    action = agent.run(state, training)  # get the current action from the policy network
    obs, reward, done = env.step(action)  # run one environment step
    obs_queue.append(obs)  # the oldest obs is popped; the queue keeps the last 4 obs plus the new one

    memory.store(env.make_folded_state(obs_queue), action, reward, done)  # folded_state: [:4] is the state, [1:] is the next_state

    if step % POLICY_UPDATE == 0 and training:  # while training, update the policy network every POLICY_UPDATE steps
        agent.learn(memory, step)

    if step % TARGET_UPDATE == 0:  # update the target network every TARGET_UPDATE steps
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # evaluate every EVALUATE_FREQ steps
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step // EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n"
                     )  # 可以从rewards.txt中画出学习曲线
        if RENDER:  # 如果RENDER,就绘图
            prefix = f"eval_{step // EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")
Example #5
for step in progressive:  # step is an int
    if done:  # has the previous episode ended? if so, start a new one
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)  # put the reset observations into obs_queue
#         print(len(obs_queue))
    training = len(memory) > WARM_STEPS  # only start learning once memory has grown past WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # convert the queued observations into a state
    action = agent.run(state, training)  # epsilon-greedy action selection
    obs, reward, done = env.step(action)  # execute the action to get the reward and the next observation
    #     print(len(obs))  (running this shows obs has length 1)
    obs_queue.append(obs)  # append the new observation to obs_queue
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store the transition

    if step % POLICY_UPDATE == 0 and training:  # train every POLICY_UPDATE = 4 steps, and only once memory holds more than WARM_STEPS = 50_000 transitions
        agent.learn(memory, BATCH_SIZE)  # randomly sample BATCH_SIZE = 32 transitions from memory for the update

    if step % TARGET_UPDATE == 0:  # every TARGET_UPDATE = 10_000 steps, copy the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # periodically evaluate and record the current performance
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n"
                     )  # 3d表示3位数的表达形式比如“000”
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")