Example #1
    def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
    ) -> Tuple[float, List[GymImg]]:  # run the game a few times with the given agent (3 episodes × 5 lives) and return the average reward and the captured frames (frames stay empty unless rendering)
        """evaluate uses the given agent to run the game for a few episodes and
        returns the average reward and the captured frames."""
        self.__env = self.__env_eval
        ep_rewards = []
        frames = []
        for _ in range(self.get_eval_lives() * num_episode):
            observations, ep_reward, _frames = self.reset(render=render)  # initialize the evaluation environment
            for obs in observations:
                obs_queue.append(obs)
            if render:
                frames.extend(_frames)
            done = False

            while not done:  # start testing
                state = self.make_state(obs_queue).to(self.__device).float()
                action = agent.run(state)  # get the agent's next action
                obs, reward, done = self.step(action)  # get the next observation, reward, and done flag

                ep_reward += reward
                obs_queue.append(obs)
                if render:
                    frames.append(self.get_frame())

            ep_rewards.append(ep_reward)  # record the episode reward

        self.__env = self.__env_train
        return np.sum(ep_rewards) / num_episode, frames  # return the average reward
Example #2
    def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
    ) -> Tuple[float, List[GymImg]]:
        """evaluate uses the given agent to run the game for a few episodes and
        returns the average reward and the captured frames."""
        self.__env = self.__env_eval
        ep_rewards = []
        frames = []
        for _ in range(self.get_eval_lives() * num_episode):
            observations, ep_reward, _frames = self.reset(render=render)
            for obs in observations:
                obs_queue.append(obs)
            if render:
                frames.extend(_frames)
            done = False

            while not done:
                state = self.make_state(obs_queue).to(self.__device).float()
                action = agent.run(state)
                obs, reward, done = self.step(action)

                ep_reward += reward
                obs_queue.append(obs)
                if render:
                    frames.append(self.get_frame())

            ep_rewards.append(ep_reward)

        self.__env = self.__env_train
        return np.sum(ep_rewards) / num_episode, frames
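The evaluate snippets above lean on an observation-queue convention (obs_queue, make_state, make_folded_state) that is not shown. A minimal sketch of how it could work, assuming each queued observation is an 84×84 torch tensor and the queue holds 5 of them; the function names mirror the calls above but are illustrative, not the project's actual implementation:

from collections import deque

import torch


def make_state(obs_queue: deque) -> torch.Tensor:
    # the last 4 observations form the current state, shape [1, 4, 84, 84]
    return torch.stack(list(obs_queue)[1:], dim=0).unsqueeze(0)


def make_folded_state(obs_queue: deque) -> torch.Tensor:
    # all 5 observations; [:4] is the state and [1:] is the next state
    return torch.stack(list(obs_queue), dim=0).unsqueeze(0)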
Example #3
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # the first WARM_STEPS steps are a warm-up
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # push the transition into the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # learn from previously stored transitions

    if step % TARGET_UPDATE == 0:  # sync the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
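Examples 3 to 6 and 9 all call agent.run(state, training). A minimal sketch of what that call typically does, epsilon-greedy action selection over the policy network's Q-values, assuming a linearly decaying epsilon; the class and attribute names are assumptions, not the Agent class used above:

import random

import torch


class EpsilonGreedyAgent:
    def __init__(self, policy_net, num_actions,
                 eps_start=1.0, eps_end=0.1, eps_decay=1e-6):
        self.policy = policy_net
        self.num_actions = num_actions
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def run(self, state: torch.Tensor, training: bool = False) -> int:
        if training:
            # decay epsilon linearly towards eps_end while training
            self.eps = max(self.eps_end, self.eps - self.eps_decay)
        if training and random.random() < self.eps:
            return random.randrange(self.num_actions)  # explore
        with torch.no_grad():
            return int(self.policy(state).argmax(dim=1).item())  # exploit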
Example #4
done = True
preaction = 0
progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training, preaction)
    preaction = action
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)

    if step % TARGET_UPDATE == 0:
        agent.sync()

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
Example #5
progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")  # 可视化进度条
for step in progressive:
    if done:  # start a new episode
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # warm-up: store WARM_STEPS = 50_000 transitions before training starts
    state = env.make_state(obs_queue).to(device).float()  # build the state, dropping the oldest observation
    action = agent.run(state, training)  # select action a (random with probability eps, otherwise chosen by the Q-network)
    obs, reward, done = env.step(action)  # execute action a, obtaining the reward and the next state s'
    obs_queue.append(obs)  # append the new observation
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store (s, a, r, done) in the replay memory
    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # train
    if step % TARGET_UPDATE == 0:
        agent.sync()  # copy the policy network weights into the target network
    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)  # run a few evaluation episodes and record the reward
        with open("rewards.txt", "a") as fp:  # log the rewards to a file for later plotting
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:  # only when rendering is enabled
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
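The Example #5 snippet breaks off right after creating the evaluation directory. A plausible continuation, sketched under the assumption that each captured frame behaves like a PIL image (the GymImg type from Examples #1 and #2); the filename pattern is illustrative:

import os


def save_eval_frames(frames, prefix):
    # write every captured evaluation frame into the freshly created directory
    for ind, frame in enumerate(frames):
        frame.save(os.path.join(prefix, f"{ind:05d}.png"))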
Example #6
progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")  # 进度条
for step in progressive:
    if done:  # done marks the end of an episode, so the environment needs a reset
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # build the state from the 5-observation queue (only the last 4 obs are used)
    action = agent.run(state, training)  # get the current action from the policy network
    obs, reward, done = env.step(action)  # run one environment step
    obs_queue.append(obs)  # the oldest obs is popped; the queue keeps the last 4 plus the new one

    memory.store(env.make_folded_state(obs_queue), action, reward, done)  # folded_state[:4] is the state, folded_state[1:] is the next state

    if step % POLICY_UPDATE == 0 and training:  # when training, update the policy network every POLICY_UPDATE steps
        agent.learn(memory, step)

    if step % TARGET_UPDATE == 0:  # update the target network every TARGET_UPDATE steps
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # evaluate every EVALUATE_FREQ steps
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
Example #7
        progressive = tqdm(range(MAX_STEPS),
                           total=MAX_STEPS,
                           ncols=50,
                           leave=False,
                           unit="b")
        for step in progressive:
            if done:
                observations, _, _ = env.reset()
                for obs in observations:
                    obs_queue.append(obs)

            training = len(memory) > WARM_STEPS
            state = env.make_state(obs_queue).to(device).float()  # current state S
            action, value_this = agent.run(state, training)  # from the state and the DQN, get the Q-value and the action A
            if stable and COUNT_SHADE:
                action_queue.append(action)
            obs, reward, done = env.step(action)  # execute the action, getting R and S'
            obs_queue.append(obs)
            if version.find('PER') != -1:
                state_next = env.make_state(obs_queue).to(device).float()
                value_next = agent.get_target_value(state_next)
                td_error = GAMMA * value_next + reward - value_this
                memory.push(env.make_folded_state(obs_queue), action, reward, done, td_error)  # how should the TD error be encoded?
            else:
                memory.push(env.make_folded_state(obs_queue), action, reward,
                            done)
Example #8
            EPS_START,
            EPS_END,
            EPS_DECAY,
            "./model_weights_b"  ####*************
        )
        memory = choosememory(c)
        Reward = []
        for step in tqdm(range(MAX_STEPS), ncols=50, leave=False, unit="b"):
            if done:
                observations, _, _ = env.reset()
                for obs in observations:
                    obs_queue.append(obs)

            training = len(memory) > WARM_STEPS
            state = env.make_state(obs_queue).to(device).float()
            action, value_this = agent.run(state, training)  # the currently chosen action and its Q-value
            obs, reward, done = env.step(action)
            obs_queue.append(obs)
            #memory.push(env.make_folded_state(obs_queue), action, reward, done)
            #pynvml.nvmlInit()
            #handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            #meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            #print(meminfo.used)
            if c == 1:  # with PER, use the TD error to set the priority when pushing into memory
                state_next = env.make_state(obs_queue).to(device).float()
                value_next = agent.get_target_value(state_next)
                td_error = abs(GAMMA * value_next + reward - value_this)  # use the absolute value of the loss as the TD error
                memory.push(env.make_folded_state(obs_queue), action, reward,
                            done, td_error)
            else:
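Examples #7 and #8 push an extra td_error argument so that prioritized experience replay (PER) can weight sampling by it. A minimal sketch of a proportional prioritized buffer that would accept such a push; the alpha exponent, the eviction policy, and the class name are assumptions, not the memory implementation chosen by choosememory:

import random


class SimplePrioritizedMemory:
    def __init__(self, capacity: int, alpha: float = 0.6, eps: float = 1e-2):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.data, self.priorities = [], []

    def push(self, folded_state, action, reward, done, td_error):
        # a larger |TD error| gives a larger sampling priority
        priority = (abs(float(td_error)) + self.eps) ** self.alpha
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append((folded_state, action, reward, done))
        self.priorities.append(priority)

    def sample(self, batch_size: int):
        # sample transitions with probability proportional to their priority
        return random.choices(self.data, weights=self.priorities, k=batch_size)

    def __len__(self):
        return len(self.data)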
Example #9
progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,  # the total width of the progress bar can be customized
    leave=False,
    unit="b")
for step in progressive:  # step is an int
    if done:  # start a new episode?
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)  # put the observations into obs_queue
#         print(len(obs_queue))
    training = len(memory) > WARM_STEPS  # check whether the memory size exceeds WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # convert the observations into a state
    action = agent.run(state, training)  # choose an action, random with probability epsilon
    obs, reward, done = env.step(action)  # execute the action to get the reward and the next state
    #     print(len(obs))  (running this shows that obs has length 1)
    obs_queue.append(obs)  # append the new observation to obs_queue
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store the experience

    if step % POLICY_UPDATE == 0 and training:  # train every POLICY_UPDATE = 4 steps, provided the memory holds more than WARM_STEPS = 50_000 transitions
        agent.learn(memory, BATCH_SIZE)  # randomly sample BATCH_SIZE = 32 transitions from memory for the update

    if step % TARGET_UPDATE == 0:  # every TARGET_UPDATE = 10_000 steps, copy the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # record the current performance
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n"