Example #1
class Worker(object):
    def __init__(self, number, num_actions, trainer, model_name):

        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
        self.local_ac = ACNet(num_actions, self.name, trainer)
        self.update_target_graph = self.update_target(global_scope_name,
                                                      self.name)

        self.env = EnvVizDoom(vizdoom_scenario)
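The copy op created here depends on the local and global networks building their variables under distinct TensorFlow variable scopes (see update_target in Example #4). Below is a minimal sketch, purely an assumption and not taken from the source, of how ACNet could be scoped so that collecting variables by scope name works; the placeholder shape and layer sizes are illustrative only.

import tensorflow as tf

class ACNet(object):
    def __init__(self, num_actions, scope, trainer):
        # Every variable is created under `scope`, so the Worker can later fetch
        # them with tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope).
        with tf.variable_scope(scope):
            self.inputs = tf.placeholder(tf.float32, [None, 80, 80, 1])
            hidden = tf.layers.dense(tf.layers.flatten(self.inputs), 256,
                                     activation=tf.nn.relu)
            self.policy = tf.layers.dense(hidden, num_actions,
                                          activation=tf.nn.softmax)
            self.value = tf.layers.dense(hidden, 1)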
Example #2
            # Evaluation-loop fragment: act, accumulate the reward, stop when the episode ends.
            reward = env.Act(action, 1)
            reward_total += reward

            if (not env.IsRunning()):
                break

            state_raw = env.Observation()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()

    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())

    if (train):
        agent.Train()

    Test(agent)
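For context, a minimal sketch of the evaluation routine that the loop fragment above appears to belong to. The function name and signature, Preprocess, episodes_to_watch, and agent.GetAction are assumptions introduced for illustration; only the environment calls already shown in the snippets are used.

def Test(agent, episodes_to_watch=10):     # name and signature are assumptions
    for _ in range(episodes_to_watch):
        env.Reset()
        state_raw = env.Observation()
        reward_total = 0
        while env.IsRunning():
            state = Preprocess(state_raw)      # assumed preprocessing step, as in Example #4
            action = agent.GetAction(state)    # hypothetical agent API
            reward = env.Act(action, 1)
            reward_total += reward
            if (not env.IsRunning()):
                break
            state_raw = env.Observation()
        print("Episode reward:", reward_total)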
Example #3
            # Evaluation-loop fragment: optionally record the frame, act, and stop when the episode ends.
            if (test_write_video):
                out_video.write(state_raw)

            reward = env.Act(action, 1)
            reward_total += reward

            if (not env.IsRunning()):
                break

            state_raw = env.Observation()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()

    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())

    if (train):
        agent.Train()

    Test(agent)
Example #4
class Worker(object):
    def __init__(self, number, num_actions, trainer, model_name):

        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
        self.local_ac = ACNet(num_actions, self.name, trainer)
        self.update_target_graph = self.update_target(global_scope_name,
                                                      self.name)

        self.env = EnvVizDoom(vizdoom_scenario)

    # Copies one set of variables to another.
    # Used to set worker network parameters to those of global network.
    def update_target(self, from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    # Calculate discounted returns.
    def Discount(self, x, gamma):
        for idx in reversed(range(len(x) - 1)):
            x[idx] += x[idx + 1] * gamma
        return x

    def Start(self, session, saver, coord):
        worker_process = lambda: self.Process(session, saver, coord)
        thread = threading.Thread(target=worker_process)
        thread.start()

        global start_time
        start_time = time.time()
        return thread

    def Train(self, episode_buffer, sess, bootstrap_value):
        episode_buffer = np.array(episode_buffer)
        states = episode_buffer[:, 0]
        actions = episode_buffer[:, 1]
        rewards = episode_buffer[:, 2]
        values = episode_buffer[:, 3]

        # Here we take the rewards and values from the episode_buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = self.Discount(rewards_plus, gamma)[:-1]

        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
        advantages = self.Discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        self.local_ac.Train(sess, discounted_rewards, states, actions,
                            advantages)

    def Process(self, sess, saver, coord):
        global step, train_scores, start_time, lock

        print("Starting worker " + str(self.number))
        while (not coord.should_stop()):
            sess.run(self.update_target_graph)
            episode_buffer = []
            episode_reward = 0

            self.env.Reset()
            s = self.env.Observation()
            s = Preprocess(s)
            self.local_ac.ResetLstm()

            while (self.env.IsRunning()):
                # Take an action using probabilities from policy network output.
                a, v = self.local_ac.GetAction(sess, s)
                r = self.env.Act(a, frame_repeat)
                finished = not self.env.IsRunning()
                if (not finished):
                    s1 = self.env.Observation()
                    s1 = Preprocess(s1)
                else:
                    s1 = None

                episode_buffer.append([s, a, r, v])

                episode_reward += r
                s = s1

                lock.acquire()

                step += 1

                if (step % save_each == 0):
                    model_name_curr = self.model_name + "_{:04}".format(
                        int(step / save_each))
                    print("\nSaving the network weigths to:",
                          model_name_curr,
                          file=sys.stderr)
                    saver.save(sess, model_name_curr)

                    PrintStat(time.time() - start_time, step, step_num,
                              train_scores)

                    train_scores = []

                if (step == step_num):
                    coord.request_stop()

                lock.release()

                # When the experience buffer is full, or the episode has ended with
                # experience still in the buffer, make an update step using that rollout.
                if (len(episode_buffer) == t_max
                        or (finished and len(episode_buffer) > 0)):
                    # Since we don't know what the true final return is,
                    # we "bootstrap" from our current value estimation.
                    if (not finished):
                        v1 = self.local_ac.GetValue(sess, s)
                        self.Train(episode_buffer, sess, v1)
                        episode_buffer = []
                        sess.run(self.update_target_graph)
                    else:
                        self.Train(episode_buffer, sess, 0.0)

            lock.acquire()
            train_scores.append(episode_reward)
            lock.release()
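Finally, for orientation, a minimal sketch (not from the source) of how Worker instances like the one above are typically driven in A3C: build the shared global ACNet, create one Worker per thread, then start and join them through a tf.train.Coordinator. num_workers and the optimizer choice are assumptions; num_actions, global_scope_name, and model_name are the names used in the snippets, and the globals that Process relies on (step, lock, train_scores) are assumed to be initialized elsewhere.

import tensorflow as tf

num_workers = 4  # assumption; often one worker per CPU core

with tf.device("/cpu:0"):
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)  # optimizer choice is an assumption
    # The global network only holds the shared parameters; each Worker builds a local copy.
    global_network = ACNet(num_actions, global_scope_name, trainer)
    workers = [Worker(i, num_actions, trainer, model_name)
               for i in range(num_workers)]

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = [worker.Start(sess, saver, coord) for worker in workers]
    coord.join(threads)  # blocks until a worker calls coord.request_stop() at step_num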