Example #1
    def __init__(self, env):
        logger.info("SchedNet trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._agent_profile = self._env.get_agent_profile()
        self._n_predator = self._agent_profile['predator']['n_agent']
        self._n_prey = self._agent_profile['prey']['n_agent']

        # State and obs additionally include history information
        self._state_dim = self._env.get_info()[0]['state'].shape[0] + self._n_predator
        self._obs_dim = self._agent_profile['predator']['obs_dim'][0] + 1

        # Predator agent
        self._predator_agent = PredatorAgent(
            n_agent=self._agent_profile['predator']['n_agent'],
            action_dim=self._agent_profile['predator']['act_dim'],
            state_dim=self._state_dim,
            obs_dim=self._obs_dim)
        # Prey agent (randomly moving)
        self._prey_agent = []
        for _ in range(self._n_prey):
            self._prey_agent.append(RandomAgent(5))

        self.epsilon = 0.5  # Init value for epsilon

        if FLAGS.gui:  # Enable GUI
            self.canvas = canvas.Canvas(self._n_predator, 1, FLAGS.map_size)
            self.canvas.setup()
Example #2
    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey
        self.action_dim = self._env.call_action_dim()
        self.state_dim = self._env.call_state_dim()

        self._agent = Agent(self.action_dim, self.state_dim)

        self.epsilon = 1.0
Example #3
    def __init__(self, n_agent, action_dim, state_dim, obs_dim, name=""):
        logger.info("Predator Agent is created")

        self._n_agent = n_agent
        self._state_dim = state_dim
        self._action_dim_per_unit = action_dim
        self._obs_dim_per_unit = obs_dim
        self._obs_dim = self._obs_dim_per_unit * self._n_agent

        self._name = name
        self.update_cnt = 0

        # Make Networks
        tf.reset_default_graph()
        my_graph = tf.Graph()

        with my_graph.as_default():
            self.sess = tf.Session(graph=my_graph, config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))

            self.action_selector = ActionSelectorNetwork(self.sess, self._n_agent, self._obs_dim_per_unit, self._action_dim_per_unit, self._name)
            self.weight_generator = WeightGeneratorNetwork(self.sess, self._n_agent, self._obs_dim)
            self.critic = CriticNetwork(self.sess, self._n_agent, self._state_dim, self._name)

            self.sess.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()

            if FLAGS.load_nn:
                if FLAGS.nn_file == "":
                    logger.error("No file for loading Neural Network parameter")
                    exit()
                self.saver.restore(self.sess, FLAGS.nn_file)

        self.replay_buffer = ReplayBuffer()
        self._eval = Evaluation()
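
The block above targets TensorFlow 1.x (tf.reset_default_graph, tf.Session, tf.ConfigProto). For reference, a minimal sketch of the same graph/session setup through the v1 compatibility layer of TensorFlow 2.x; the variable w is only a placeholder so the Saver has something to track, and the actual network construction is elided:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # keep graph-mode semantics, as in the original

graph = tf.Graph()
with graph.as_default():
    config = tf.compat.v1.ConfigProto(
        gpu_options=tf.compat.v1.GPUOptions(allow_growth=True))
    sess = tf.compat.v1.Session(graph=graph, config=config)

    w = tf.compat.v1.get_variable("w", shape=[1])  # placeholder variable (assumption)

    sess.run(tf.compat.v1.global_variables_initializer())
    saver = tf.compat.v1.train.Saver()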
Example #4
    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey
        self._agent_profile = self._env.get_agent_profile()
        self._agent_precedence = self._env.agent_precedence

        self._agent = Agent(self._agent_profile["predator"]["act_dim"],
                            self._agent_profile["predator"]["obs_dim"][0])
        self._prey_agent = AcAgent(5)

        self.epsilon = 1.0
        if FLAGS.load_nn:
            self.epsilon = epsilon_min

        if FLAGS.gui:
            self.canvas = canvas.Canvas(self._n_predator, self._n_prey,
                                        FLAGS.map_size)
            self.canvas.setup()
Example #5
    def __init__(self, action_dim, obs_dim, name=""):
        logger.info("Centralized DQN Agent")

        self._obs_dim = obs_dim
        self._n_player = FLAGS.n_predator
        self._action_dim = action_dim * self._n_player
        self._action_dim_single = action_dim
        self._state_dim = obs_dim

        self._name = name
        self.update_cnt = 0
        self.target_update_period = 3000

        self.df = FLAGS.df
        self.lr = FLAGS.lr

        # Make Q-network
        tf.reset_default_graph()
        my_graph = tf.Graph()

        with my_graph.as_default():
            self.sess = tf.Session(
                graph=my_graph,
                config=tf.ConfigProto(gpu_options=tf.GPUOptions(
                    allow_growth=True)))
            self.q_network = DQNetwork(self.sess, self._state_dim,
                                       self._action_dim_single, self._n_player)
            self.sess.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            if FLAGS.load_nn:
                print "LOAD!"
                self.saver.restore(self.sess, FLAGS.nn_file)

        self.replay_buffer = ReplayBuffer()

        self._eval = Evaluation()
        self.q_prev = None

        self.ims = []
        plt.clf()
        self.fig = plt.figure()
        self.axes = plt.gca()
        plt.xticks(list(range(0, 25, 5)))
        plt.yticks(list(range(0, 25, 5)))
        self.axes.tick_params(axis='both', labelsize=15)
Example #6
class Trainer(object):
    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey
        self._agent_profile = self._env.get_agent_profile()
        self._agent_precedence = self._env.agent_precedence

        self._agent = Agent(self._agent_profile["predator"]["act_dim"],
                            self._agent_profile["predator"]["obs_dim"][0])
        self._prey_agent = AcAgent(5)

        self.epsilon = 1.0
        if FLAGS.load_nn:
            self.epsilon = epsilon_min

        if FLAGS.gui:
            self.canvas = canvas.Canvas(self._n_predator, self._n_prey,
                                        FLAGS.map_size)
            self.canvas.setup()

    def learn(self):

        step = 0
        episode = 0
        print_flag = False
        count = 1

        while step < training_step:
            episode += 1
            ep_step = 0
            obs = self._env.reset()
            state = self._env.get_full_encoding()[:, :, 2]
            total_reward = 0
            total_reward_pos = 0
            total_reward_neg = 0
            self.random_action_generator()
            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state)
                obs_n, reward, done, info = self._env.step(action)
                state_n = self._env.get_full_encoding()[:, :, 2]
                done_single = sum(done) > 0

                self.train_agents(state, action, reward, state_n, done_single)
                obs = obs_n
                state = state_n
                total_reward += np.sum(reward)
                if np.sum(reward) >= 0:
                    total_reward_pos += np.sum(reward)
                else:
                    total_reward_neg += np.sum(reward)

                if is_episode_done(done, step) or ep_step >= FLAGS.max_step:
                    # print step, ep_step, total_reward
                    if print_flag and episode % FLAGS.eval_step == 1:
                        print "[train_ep %d]" % (
                            episode
                        ), "\treward", total_reward_pos, total_reward_neg
                    break

            if episode % FLAGS.eval_step == 0:
                self.test(episode)

        self._eval.summarize()

    def random_action_generator(self):
        rand_unit = np.random.uniform(size=(FLAGS.n_predator, 5))
        self.rand = rand_unit / np.sum(rand_unit, axis=1, keepdims=True)

    def get_action(self, obs, step, state, train=True):
        act_n = []
        if train:
            self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator

        action_list = self._agent.act(state)
        for i in range(self._n_predator):
            if train and (step < FLAGS.m_size * FLAGS.pre_train_step
                          or np.random.rand() < self.epsilon):
                action = np.random.choice(5)
                act_n.append(action)
            else:
                act_n.append(action_list[i])

        # Action of prey
        for i in range(FLAGS.n_prey):
            act_n.append(self._prey_agent.act(state, i))
        # act_n[1] = 2

        return np.array(act_n, dtype=np.int32)

    def train_agents(self, state, action, reward, state_n, done):
        self._agent.train(state, action, reward, state_n, done)

    def test(self, curr_ep=None):

        step = 0
        episode = 0

        test_flag = FLAGS.kt
        sum_reward = 0
        sum_reward_pos = 0
        sum_reward_neg = 0
        while step < testing_step:
            episode += 1
            obs = self._env.reset()
            state = self._env.get_full_encoding()[:, :, 2]
            if test_flag:
                print "\nInit\n", state
            total_reward = 0
            total_reward_pos = 0
            total_reward_neg = 0

            ep_step = 0

            while True:

                step += 1
                ep_step += 1

                action = self.get_action(obs, step, state, False)
                obs_n, reward, done, info = self._env.step(action)
                state_n = self._env.get_full_encoding()[:, :, 2]
                state_next = state_to_index(state_n)
                if FLAGS.gui:
                    self.canvas.draw(
                        state_next, done, "Score:" + str(total_reward) +
                        ", Step:" + str(ep_step))

                if test_flag:
                    aa = raw_input('>')
                    if aa == 'c':
                        test_flag = False
                    print action
                    print state_n
                    print reward

                obs = obs_n
                state = state_n
                r = np.sum(reward)
                # if r == 0.1:
                #     r = r * (-1.) * FLAGS.penalty
                total_reward += r  # * (FLAGS.df ** (ep_step-1))
                if r > 0:
                    total_reward_pos += r
                else:
                    total_reward_neg -= r

                if is_episode_done(done, step,
                                   "test") or ep_step >= FLAGS.max_step:

                    if FLAGS.gui:
                        self.canvas.draw(
                            state_next, done, "Hello", "Score:" +
                            str(total_reward) + ", Step:" + str(ep_step))

                    break
            sum_reward += total_reward
            sum_reward_pos += total_reward_pos
            sum_reward_neg += total_reward_neg
        if FLAGS.scenario == "pursuit":
            print "Test result: Average steps to capture: ", curr_ep, float(
                step) / episode
            self._eval.update_value("training result: ",
                                    float(step) / episode, curr_ep)
        elif FLAGS.scenario == "endless" or FLAGS.scenario == "endless2" or FLAGS.scenario == "endless3":
            print "Average reward:", FLAGS.penalty, curr_ep, sum_reward / episode, sum_reward_pos / episode, sum_reward_neg / episode
            self._eval.update_value("training result: ", sum_reward / episode,
                                    curr_ep)
            self._agent.logging(sum_reward / episode, curr_ep * 100)
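
The get_action methods above all follow the same epsilon-greedy pattern: decay epsilon linearly toward a floor, then either sample a random action or act greedily. A self-contained sketch of that pattern (numpy only; EPSILON_DEC, EPSILON_MIN and the dummy Q-values are illustrative placeholders, the originals come from FLAGS and the trained network):

import numpy as np

EPSILON_DEC = 1e-5   # assumed per-step decay; FLAGS-driven in the original
EPSILON_MIN = 0.05   # assumed floor for epsilon
N_ACTIONS = 5        # five discrete moves, as in the examples above

def epsilon_greedy(q_values, epsilon):
    # Linear decay toward the floor, then explore with probability epsilon.
    epsilon = max(epsilon - EPSILON_DEC, EPSILON_MIN)
    if np.random.rand() < epsilon:
        action = np.random.choice(N_ACTIONS)   # explore: uniform random action
    else:
        action = int(np.argmax(q_values))      # exploit: greedy w.r.t. Q-values
    return action, epsilon

action, eps = epsilon_greedy(np.array([0.1, 0.4, 0.2, 0.0, 0.3]), epsilon=1.0)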
Example #7
class Trainer(object):
    def __init__(self, env):
        logger.info("SchedNet trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._agent_profile = self._env.get_agent_profile()
        self._n_predator = self._agent_profile['predator']['n_agent']
        self._n_prey = self._agent_profile['prey']['n_agent']

        # State and obs additionally include history information
        self._state_dim = self._env.get_info()[0]['state'].shape[0] + self._n_predator
        self._obs_dim = self._agent_profile['predator']['obs_dim'][0] + 1

        # Predator agent
        self._predator_agent = PredatorAgent(
            n_agent=self._agent_profile['predator']['n_agent'],
            action_dim=self._agent_profile['predator']['act_dim'],
            state_dim=self._state_dim,
            obs_dim=self._obs_dim)
        # Prey agent (randomly moving)
        self._prey_agent = []
        for _ in range(self._n_prey):
            self._prey_agent.append(RandomAgent(5))

        self.epsilon = 0.5  # Init value for epsilon

        if FLAGS.gui:  # Enable GUI
            self.canvas = canvas.Canvas(self._n_predator, 1, FLAGS.map_size)
            self.canvas.setup()

    def learn(self):

        global_step = 0
        episode_num = 0
        print_flag = True

        while global_step < training_step:
            episode_num += 1
            step_in_ep = 0

            obs_n = self._env.reset()
            info_n = self._env.get_info()
            h_schedule_n = np.zeros(self._n_predator)  # schedule history
            obs_n, state, _ = self.get_obs_state_with_schedule(obs_n,
                                                               info_n,
                                                               h_schedule_n,
                                                               init=True)

            total_reward = 0
            done = False

            while not done:
                global_step += 1
                step_in_ep += 1

                schedule_n, priority = self.get_schedule(
                    obs_n, global_step, FLAGS.sched)
                action_n = self.get_action(obs_n, schedule_n, global_step)
                obs_n_without_schedule, reward_n, done_n, info_n = self._env.step(
                    action_n)
                obs_n_next, state_next, h_schedule_n = self.get_obs_state_with_schedule(
                    obs_n_without_schedule, info_n, h_schedule_n, schedule_n)

                if FLAGS.gui:
                    self.canvas.draw(state_next * FLAGS.map_size,
                                     [0] * self._n_predator, "Train")

                done_single = sum(done_n) > 0
                self.train_agents(state, obs_n, action_n, reward_n, state_next,
                                  obs_n_next, schedule_n, priority,
                                  done_single)

                obs_n = obs_n_next
                state = state_next
                total_reward += np.sum(reward_n)

                if is_episode_done(done_n, global_step):
                    if FLAGS.gui:
                        self.canvas.draw(state_next * FLAGS.map_size,
                                         [0] * self._n_predator, "Train", True)
                    if print_flag:
                        print("[train_ep %d]" % (episode_num), "\tstep:",
                              global_step, "\tstep_per_ep:", step_in_ep,
                              "\treward", total_reward)
                    done = True

                if FLAGS.eval_on_train and global_step % FLAGS.eval_step == 0:
                    self.test(global_step)
                    break

        self._predator_agent.save_nn(global_step)
        self._eval.summarize()

    def get_action(self, obs_n, schedule_n, global_step, train=True):

        act_n = [0] * len(obs_n)
        self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator
        if train and (global_step < FLAGS.m_size * FLAGS.pre_train_step or
                      np.random.rand() < self.epsilon):  # with prob. epsilon
            # Exploration
            predator_action = self._predator_agent.explore()
        else:
            # Exploitation
            predator_obs = [
                obs_n[i] for i in self._agent_profile['predator']['idx']
            ]
            predator_action = self._predator_agent.act(predator_obs,
                                                       schedule_n)

        for i, idx in enumerate(self._agent_profile['predator']['idx']):
            act_n[idx] = predator_action[i]

        # Action of prey
        for i, idx in enumerate(self._agent_profile['prey']['idx']):
            act_n[idx] = self._prey_agent[i].act(None)

        return np.array(act_n, dtype=np.int32)

    def get_schedule(self, obs_n, global_step, type, train=True):

        predator_obs = [
            obs_n[i] for i in self._agent_profile['predator']['idx']
        ]

        if train and (global_step < FLAGS.m_size * FLAGS.pre_train_step
                      or np.random.rand() < self.epsilon):
            # Exploration: Schedule k random agent
            priority = np.random.rand(self._n_predator)
            i = np.argsort(-priority)[:FLAGS.s_num]
            ret = np.full(self._n_predator, 0.0)
            ret[i] = 1.0
            return ret, priority
        else:
            # Exploitation
            return self._predator_agent.schedule(predator_obs)

    def train_agents(self, state, obs_n, action_n, reward_n, state_next,
                     obs_n_next, schedule_n, priority, done):

        predator_obs = [
            obs_n[i] for i in self._agent_profile['predator']['idx']
        ]
        predator_action = [
            action_n[i] for i in self._agent_profile['predator']['idx']
        ]
        predator_reward = [
            reward_n[i] for i in self._agent_profile['predator']['idx']
        ]
        predator_obs_next = [
            obs_n_next[i] for i in self._agent_profile['predator']['idx']
        ]
        self._predator_agent.train(state, predator_obs, predator_action,
                                   predator_reward, state_next,
                                   predator_obs_next, schedule_n, priority,
                                   done)

    def get_h_obs_state(self, obs_n, state, h_schedule):
        obs_n_h = np.concatenate((obs_n[0:self._n_predator],
                                  h_schedule.reshape((self._n_predator, 1))),
                                 axis=1)
        obs_final = list()
        for i in range(self._n_predator):
            obs_final.append(obs_n_h[i])
        for i in range(self._n_prey):
            obs_final.append(obs_n[self._n_predator + i])
        obs_n = np.array(obs_final)
        state = np.concatenate((state, h_schedule), axis=-1)

        return obs_n, state

    def get_obs_state_with_schedule(self,
                                    obs_n_ws,
                                    info_n,
                                    h_schedule_n,
                                    schedule_n=None,
                                    init=False):
        if not init:
            h_schedule_n = self.update_h_schedule(h_schedule_n, schedule_n)

        obs_n_h = np.concatenate((obs_n_ws[0:self._n_predator],
                                  h_schedule_n.reshape((self._n_predator, 1))),
                                 axis=1)
        obs_final = list()
        for i in range(self._n_predator):
            obs_final.append(obs_n_h[i])
        for i in range(self._n_prey):
            obs_final.append(obs_n_ws[self._n_predator + i])
        obs_n = np.array(obs_final)
        state = np.concatenate((info_n[0]['state'], h_schedule_n), axis=-1)

        return obs_n, state, h_schedule_n

    def update_h_schedule(self, h_schedule, schedule_n):

        ret = h_schedule * 0.5 + schedule_n * 0.5
        return ret

    def print_obs(self, obs):
        for i in range(FLAGS.n_predator):
            print(obs[i])
        print("")

    def check_obs(self, obs):

        check_list = []
        for i in range(FLAGS.n_predator):
            check_list.append(obs[i][2])

        return np.array(check_list)

    def test(self, curr_ep=None):

        global_step = 0
        episode_num = 0

        total_reward = 0
        obs_cnt = np.zeros(self._n_predator)

        while global_step < testing_step:

            episode_num += 1
            step_in_ep = 0
            obs_n = self._env.reset()
            info_n = self._env.get_info()
            h_schedule_n = np.zeros(self._n_predator)
            obs_n, state, _ = self.get_obs_state_with_schedule(obs_n,
                                                               info_n,
                                                               h_schedule_n,
                                                               init=True)

            while True:

                global_step += 1
                step_in_ep += 1

                schedule_n, priority = self.get_schedule(
                    obs_n, global_step, FLAGS.sched)
                action_n = self.get_action(obs_n, schedule_n, global_step,
                                           False)
                obs_n_without_schedule, reward_n, done_n, info_n = self._env.step(
                    action_n)
                obs_n_next, state_next, h_schedule_n = self.get_obs_state_with_schedule(
                    obs_n_without_schedule, info_n, h_schedule_n, schedule_n)

                obs_cnt += self.check_obs(obs_n_next)

                if FLAGS.gui:
                    self.canvas.draw(state_next * FLAGS.map_size,
                                     [0] * self._n_predator, "Test")

                obs_n = obs_n_next
                state = state_next
                total_reward += np.sum(reward_n)

                if is_episode_done(done_n, global_step,
                                   "test") or step_in_ep > FLAGS.max_step:
                    if FLAGS.gui:
                        self.canvas.draw(state_next * FLAGS.map_size,
                                         [0] * self._n_predator, "Test", True)
                    break

        print("Test result: Average steps to capture: ", curr_ep,
              float(global_step) / episode_num, "\t",
              float(total_reward) / episode_num, obs_cnt / episode_num)
        self._eval.update_value("test_result",
                                float(global_step) / episode_num, curr_ep)
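
Two small mechanisms in the trainer above are easy to miss: the exploration branch of get_schedule schedules the s_num agents with the highest random priority, and update_h_schedule keeps an exponential moving average of past schedules that is then appended to each observation. A standalone sketch of both (N_PREDATOR and S_NUM are stand-ins for FLAGS.n_predator and FLAGS.s_num):

import numpy as np

N_PREDATOR = 4   # stand-in for FLAGS.n_predator
S_NUM = 2        # stand-in for FLAGS.s_num

def random_topk_schedule(n_agent, k):
    # Draw random priorities and schedule the k highest-priority agents.
    priority = np.random.rand(n_agent)
    schedule = np.zeros(n_agent)
    schedule[np.argsort(-priority)[:k]] = 1.0
    return schedule, priority

def update_h_schedule(h_schedule, schedule):
    # Exponential moving average of past schedules (0.5 / 0.5 weights, as above).
    return 0.5 * h_schedule + 0.5 * schedule

h = np.zeros(N_PREDATOR)
schedule, priority = random_topk_schedule(N_PREDATOR, S_NUM)
h = update_h_schedule(h, schedule)   # fed back into the next observation/state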
Example #8
class Trainer(object):
    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey
        self.action_dim = self._env.call_action_dim()
        self.state_dim = self._env.call_state_dim()

        self._agent = Agent(self.action_dim, self.state_dim)

        self.epsilon = 1.0

    def learn(self):

        step = 0
        episode = 0
        print_flag = False
        array = np.zeros([FLAGS.training_step / FLAGS.eval_step, 4])
        while step < training_step:
            episode += 1
            ep_step = 0
            obs = self._env.reset()
            state = obs
            total_reward = 0

            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state)
                obs_n, reward, done, info = self._env.step(action)
                state_n = obs_n

                done_single = sum(done) > 0
                if ep_step >= FLAGS.max_step:
                    done_single = True
                self.train_agents(state, action, reward, state_n, done_single)

                obs = obs_n
                state = state_n
                total_reward += np.sum(reward) * (FLAGS.df**(ep_step - 1))
                # if step % 100 ==0:
                #     print step, self._agent.q()
                if is_episode_done(done, step) or ep_step >= FLAGS.max_step:
                    if print_flag:
                        print "[train_ep %d]" % (
                            episode
                        ), "\tstep:", step, "\tep_step:", ep_step, "\treward", total_reward
                    break

            if episode % FLAGS.eval_step == 0:

                self.test(episode)

        self._eval.summarize()

    def get_action(self, obs, step, state, train=True):
        act_n = []
        self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator
        action_list = self._agent.act(state)

        for i in range(self._n_predator):
            if train and (
                    step < FLAGS.m_size * FLAGS.pre_train_step
                    or np.random.rand() < self.epsilon):  # with prob. epsilon
                action = np.random.choice(self.action_dim)
                act_n.append(action)
            else:
                act_n.append(action_list[i])

        return np.array(act_n, dtype=np.int32)

    def train_agents(self, state, action, reward, state_n, done):
        self._agent.train(state, action, reward, state_n, done)

    def test(self, curr_ep=None):

        step = 0
        episode = 0

        test_flag = FLAGS.kt
        sum_reward = 0
        while step < testing_step:
            episode += 1
            obs = self._env.reset()
            state = obs
            if test_flag:
                print "\nInit\n", state
            total_reward = 0

            ep_step = 0

            while True:

                step += 1
                ep_step += 1

                action = self.get_action(obs, step, state, False)

                obs_n, reward, done, info = self._env.step(action)
                state_n = obs_n

                if test_flag:
                    aa = raw_input('>')
                    if aa == 'c':
                        test_flag = False
                    print action
                    print state_n
                    print reward

                obs = obs_n
                state = state_n
                total_reward += np.sum(reward) * (FLAGS.df**(ep_step - 1))

                if is_episode_done(done, step,
                                   "test") or ep_step >= FLAGS.max_step:
                    break
            sum_reward += total_reward

        print "Algorithm ", FLAGS.algorithm, ",Average reward: ", curr_ep, sum_reward / episode
        self._eval.update_value("test_result", sum_reward / episode, curr_ep)
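
The reward bookkeeping in learn and test above accumulates a discounted episode return, total_reward += np.sum(reward) * (FLAGS.df ** (ep_step - 1)). A one-function sketch of the same computation (the 0.99 discount factor is an assumed value; the original reads FLAGS.df):

def discounted_return(rewards, df=0.99):
    # The step index starts at 1, so the first reward is undiscounted,
    # matching (ep_step - 1) in the loops above.
    return sum(r * df ** (t - 1) for t, r in enumerate(rewards, start=1))

print(discounted_return([1.0, 0.0, 0.0, 1.0]))  # 1.0 + 0.99 ** 3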