def __init__(self, number, num_actions, trainer, model_name):

        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network.
        self.local_ac = ACNet(num_actions, self.name, trainer)
        # Op that pulls the latest parameters from the global AC network.
        self.update_target_graph = self.update_target(global_scope_name,
                                                      self.name)
        # The environment this worker interacts with.
        self.env = EnvLab(width=out_size_width,
                          height=out_size_height,
                          fps=out_fps,
                          level=level)
Example #2
    def __init__(self, number, num_actions, trainer, model_name):

        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network.
        self.local_ac = ACNet(num_actions, self.name, trainer)
        self.update_target_graph = self.update_target(global_scope_name,
                                                      self.name)

        if (lab):
            self.env = EnvLab(80, 80, 60, "seekavoid_arena_01")
        else:
            self.env = EnvVizDoom(vizdoom_scenario)
Example #3
            reward = env.Act(action, 1)
            reward_total += reward

            if (not env.IsRunning()):
                break

            state_raw = env.Observation()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()

    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())

    if (train):
        agent.Train()

    Test(agent)
class Worker(object):
    def __init__(self, number, num_actions, trainer, model_name):

        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network.
        self.local_ac = ACNet(num_actions, self.name, trainer)
        # Op that pulls the latest parameters from the global AC network.
        self.update_target_graph = self.update_target(global_scope_name,
                                                      self.name)
        # The environment this worker interacts with.
        self.env = EnvLab(width=out_size_width,
                          height=out_size_height,
                          fps=out_fps,
                          level=level)

    # Copies one set of variables to another.
    # Used to set worker network parameters to those of global network.
    def update_target(self, from_scope, to_scope):
        # Look up the trainable variables under the global and local scopes.
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        # The global and local networks create the same variables in the same order, so we can
        # zip the two lists and assign each global variable to its local counterpart.
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder
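
    # Usage note: sess.run(self.update_target_graph) is executed at the start of every rollout
    # (see Process below) so the local network mirrors the current global parameters before
    # new experience is collected.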

    # Calculate discounted returns.
    def Discount(self, x, gamma):
        # Generic in-place discounted-sum helper; whether x holds rewards or TD errors depends on the call site.
        for idx in reversed(range(len(x) - 1)):
            x[idx] += x[idx + 1] * gamma
        return x
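
    # Worked example: Discount([1.0, 1.0, 1.0], 0.9) returns [2.71, 1.9, 1.0] (and mutates its
    # argument in place), i.e. element i becomes sum_k 0.9**k * x[i + k].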

    def Start(self, session, saver, coord):
        worker_process = lambda: self.Process(session, saver, coord)
        thread = threading.Thread(target=worker_process)
        thread.start()

        global start_time
        start_time = time.time()
        return thread

    def Train(self, episode_buffer, sess, bootstrap_value):
        """
        训练数据集
        :param episode_buffer: local 缓存的数据
        :param sess: 会话
        :param bootstrap_value: 自举值,总是计算得到的当前最新的状态的
        :return:
        """
        # 定义的 local_ac 保存的计算的数据
        episode_buffer = np.array(episode_buffer)
        states = episode_buffer[:, 0]
        actions = episode_buffer[:, 1]
        rewards = episode_buffer[:, 2]
        values = episode_buffer[:, 3]

        # Here we take the rewards and values from the episode_buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation".
        # Append the bootstrap value to the rewards so the return of every buffered step can
        # look one step past the end of the rollout.
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # This matches the pseudocode's update of R: Discount computes, in place,
        # x[i] = sum_k gamma^k * x[i + k] starting from t_start, so each entry ends up holding
        # its full discounted return. The appended bootstrap value itself is dropped with [:-1].
        discounted_rewards = self.Discount(rewards_plus, gamma)[:-1]
        # Advantage definition: A(a_t, s_t) = Q(a_t, s_t) - V(s_t).
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # One-step TD error: delta_t = r_{t+1} + gamma * V(s_{t+1}) - V(s_t).
        advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
        # Discounting the TD errors gives the backward-view (eligibility-trace style) advantage estimate.
        advantages = self.Discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        self.local_ac.Train(sess, discounted_rewards, states, actions,
                            advantages)
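
    # Worked example for a two-step rollout with rewards [r0, r1], values [v0, v1] and
    # bootstrap value vb (writing gamma as g):
    #   discounted_rewards = [r0 + g*r1 + g^2*vb,   r1 + g*vb]
    #   TD errors delta    = [r0 + g*v1 - v0,       r1 + g*vb - v1]
    #   advantages         = [delta_0 + g*delta_1,  delta_1]
    # which is Generalized Advantage Estimation with lambda = 1.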

    def Process(self, sess, saver, coord):
        global step, train_scores, start_time, lock

        print("Starting worker " + str(self.number))
        while (not coord.should_stop()):
            sess.run(self.update_target_graph)
            episode_buffer = []
            episode_reward = 0
            # Reset the environment and grab the first observation.
            self.env.Reset()
            s = self.env.Observation()
            # When channels == 1, the RGBD frame returned by the environment is reduced to a
            # single channel; the frame is then resized.
            s = Preprocess(s)
            self.local_ac.ResetLstm()

            while (self.env.IsRunning()):
                # Take an action using probabilities from policy network output.
                a, v = self.local_ac.GetAction(sess, s)
                r = self.env.Act(a, frame_repeat)
                finished = not self.env.IsRunning()
                if (not finished):
                    s1 = self.env.Observation()
                    s1 = Preprocess(s1)
                else:
                    s1 = None

                episode_buffer.append([s, a, r, v])

                episode_reward += r
                s = s1

                lock.acquire()

                step += 1

                if (step % save_each == 0):
                    model_name_curr = self.model_name + "_{:04}".format(
                        int(step / save_each))
                    print("\nSaving the network weigths to:",
                          model_name_curr,
                          file=sys.stderr)
                    saver.save(sess, model_name_curr)

                    PrintStat(time.time() - start_time, step, step_num,
                              train_scores)

                    train_scores = []

                if (step == step_num):
                    coord.request_stop()

                lock.release()

                # If the episode hasn't ended, but the experience buffer is full, then we
                # make an update step using that experience rollout.
                if (len(episode_buffer) == t_max
                        or (finished and len(episode_buffer) > 0)):
                    # Since we don't know what the true final return is,
                    # we "bootstrap" from our current value estimation.
                    if (not finished):
                        v1 = self.local_ac.GetValue(sess, s)
                        self.Train(episode_buffer, sess, v1)
                        episode_buffer = []
                        # The global network was just updated; sync the local copy back to it.
                        sess.run(self.update_target_graph)
                    else:
                        self.Train(episode_buffer, sess, 0.0)

            lock.acquire()
            train_scores.append(episode_reward)
            lock.release()

    # Release resources at the end of the Test() helper: close the video writer and any OpenCV windows.
    if (test_write_video):
        out_video.release()
    cv2.destroyAllWindows()
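

# A minimal driver sketch (not shown on this page) of how these workers are typically wired
# together, assuming TF1-style APIs and the names used above (ACNet, Worker, global_scope_name,
# num_actions, model_name). The optimizer, learning rate, worker count and the way the global
# network is constructed are illustrative assumptions, not taken from the original script.
def _train_workers_sketch():
    import multiprocessing
    import tensorflow as tf

    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)  # assumed optimizer / learning rate
    # Assumed: the global network is built first so its variables live under global_scope_name.
    global_network = ACNet(num_actions, global_scope_name, trainer)
    workers = [Worker(i, num_actions, trainer, model_name)
               for i in range(multiprocessing.cpu_count())]

    with tf.Session() as sess:
        saver = tf.train.Saver(max_to_keep=100)
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        # Each Start() call launches Process() on its own thread and returns the thread handle.
        threads = [worker.Start(sess, saver, coord) for worker in workers]
        coord.join(threads)  # blocks until a worker calls coord.request_stop()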


if __name__ == '__main__':
    # Actions available in seekavoid_arena_01:
    """
    [{'max': 512, 'min': -512, 'name': 'LOOK_LEFT_RIGHT_PIXELS_PER_FRAME'},
     {'max': 512, 'min': -512, 'name': 'LOOK_DOWN_UP_PIXELS_PER_FRAME'},
     {'max': 1, 'min': -1, 'name': 'STRAFE_LEFT_RIGHT'},
     {'max': 1, 'min': -1, 'name': 'MOVE_BACK_FORWARD'},
     {'max': 1, 'min': 0, 'name': 'FIRE'},
     {'max': 1, 'min': 0, 'name': 'JUMP'},
     {'max': 1, 'min': 0, 'name': 'CROUCH'}]
    """
    if (lab):
        env = EnvLab(width=out_size_width,
                     height=out_size_height,
                     fps=out_fps,
                     level=level)

    agent = Agent()

    Test(agent)
Example #6
        state = Preprocess(state_raw)
        action = agent.Act(state)

        for _ in range(frame_repeat):
            if (test_display):
                cv2.imshow("frame-test", state_raw)
                cv2.waitKey(20)

            if (test_write_video):
                out_video.write(state_raw)

            reward = env.Act(action, 1)
            reward_total += reward

            if (not env.IsRunning()):
                break

            state_raw = env.Observation()


if __name__ == '__main__':

    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent()

    Test(agent)
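

# A minimal sketch of the Preprocess helper called above (its definition is not shown on this
# page): resize the raw observation and, when a single channel is expected, convert it to
# grayscale. The target resolution, channel count and scaling to [0, 1] are assumptions.
def Preprocess(frame, width=80, height=80, channels=1):
    import cv2
    import numpy as np

    img = cv2.resize(frame, (width, height))
    if (channels == 1):
        if (img.ndim == 3 and img.shape[-1] == 4):
            img = img[:, :, :3]  # drop the extra (depth) channel, if any
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    return img.astype(np.float32) / 255.0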