Example #1
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2 * info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my = []
            self.cur_ball = []

            self.state_dim = 2  # relative ball
            self.history_size = 2  # frame history size
            self.action_dim = 2  # 2

            self.arglist = Argument()
            self.state_shape = (self.state_dim * self.history_size,
                                )  # state dimension
            self.act_space = [Discrete(self.action_dim * 2 + 1)]
            self.trainers = MADDPGAgentTrainer('agent_moving',
                                               self.mlp_model,
                                               self.state_shape,
                                               self.act_space,
                                               0,
                                               self.arglist,
                                               local_q_func=False)

            # for tensorboard
            self.summary_placeholders, self.update_ops, self.summary_op = \
                                                            self.setup_summary()
            self.summary_writer = \
                tf.summary.FileWriter('summary/moving_test', U.get_session().graph)

            U.initialize()

            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.restore:
                print('Loading previous state... %s' % self.arglist.load_dir)
                U.load_state(self.arglist.load_dir)

            self.saver = tf.train.Saver(max_to_keep=1100)

            self.state = np.zeros([self.state_dim * self.history_size
                                   ])  # histories
            self.train_step = 216000
            self.wheels = np.zeros(self.number_of_robots * 2)
            self.action = np.zeros(self.action_dim * 2 + 1)  # not np.zeros(2)

            self.stats_steps = 6000  # for tensorboard
            self.rwd_sum = 0

            self.done = False
            self.control_idx = 0
            return
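Example #1 hands self.mlp_model to MADDPGAgentTrainer without showing it. Below is a minimal sketch modeled on the two-layer MLP in the reference openai/maddpg experiments/train.py; the layer sizes and the free-function form (example #1 binds it as a method) are assumptions, since the example itself never defines it.

import tensorflow as tf
import tensorflow.contrib.layers as layers

def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # Two hidden ReLU layers followed by a linear output layer; MADDPGAgentTrainer
    # reuses this factory for both the policy and the centralized Q-network.
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)
        return out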
Example #2
    def train(self):
        print("==========================================================")
        print("Initializing constraint model training...")
        print("==========================================================")
        U.initialize()

        for epoch in range(self.epochs):
            # Just sample episodes for the whole epoch
            self._sample_steps(self.steps_per_epoch)

            # Do the update from memory
            losses = np.mean(np.concatenate([self._update_batch(batch) for batch in \
                                             self.replay_buffer.get_sequential(self.batch_size)]).reshape(-1, self.num_constraints),
                             axis=0)

            self.replay_buffer.clear()
            self._train_global_step += 1

            print(
                f"Finished epoch {epoch} with losses: {losses}. Running validation ..."
            )
            self.evaluate()
            print("----------------------------------------------------------")

        print("==========================================================")
Example #3
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        #if not (arglist.display or arglist.restore or arglist.benchmark):
        #    U.save_state(arglist.save_dir, saver=saver)
        #    print("Saved first checkpoint")

        current_game_experiences = []
        t0 = time.time()

        print('Starting iterations...')
        while True:

            new_experiences = load_new_experiences()
            for exp in new_experiences:
                obs_n, action_n, rew_n, new_obs_n, done_n, terminal = exp
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            U.save_state(arglist.save_dir, saver=saver)
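Examples #3, #5, #8, #10, #11, #12, #14 and #15 all call get_trainers without defining it. A minimal sketch following the reference openai/maddpg experiments/train.py is shown here; the 'ddpg' switch for local_q_func is that implementation's convention and may differ in the forks quoted above.

from maddpg.trainer.maddpg import MADDPGAgentTrainer

def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    # Adversary agents first, then the remaining "good" agents; each trainer owns its
    # own actor/critic built from the mlp_model factory sketched after example #1.
    trainers = []
    for i in range(num_adversaries):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers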
Example #4
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2*info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my_posture = []
            self.cur_op_posture = []
            self.cur_ball = []
            self.pre_ball = [0, 0]

            self.state_dim = 2 # 3*my robots, relative to the ball position
            self.history_size = 2 # frame history size
            self.action_dim = 2 # 2                    
            
            self.arglist = Argument()
            self.obs_shape_n = [(self.state_dim * self.history_size,) for _ in range(1)] # state dimension
            self.action_space = [spaces.Discrete(self.action_dim * 2 + 1) for _ in range(1)]
            self.trainers = self.get_trainers(1, self.obs_shape_n, self.action_space, self.arglist)

            # for tensorboard
            self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
            self.summary_writer = tf.summary.FileWriter('summary/aiwc_maddpg', U.get_session().graph)

            U.initialize()
            
            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
                print('Loading previous state...')
                U.load_state(self.arglist.load_dir)

            self.final_ep_rewards = []  # sum of rewards for training curve
            self.final_ep_ag_rewards = []  # agent rewards for training curve
            self.agent_info = [[[]]]  # placeholder for benchmarking info
            self.saver = tf.train.Saver()
            self.obs_n = [np.zeros([self.state_dim * self.history_size]) for _ in range(1)] # histories
            self.train_step = 0
            self.wheels = np.zeros(self.number_of_robots*2)
            self.action_n = [np.zeros(self.action_dim * 2 + 1) for _ in range(1)]
                   
            self.save_every_steps = 12000 # save the model every 10 minutes
            self.stats_steps = 6000 # for tensorboard
            self.reward_sum = 0
            self.score_sum = 0 
            self.active_flag = [[False for _ in range(5)], [False for _ in range(5)]]   
            self.inner_step = 0

            self.done = False
            self.control_idx = 0
            return
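Examples #1 and #4 call a setup_summary helper as a method, and example #15 further down calls it as a module-level function, then feeds floats through the returned update_ops before writing summary_op. The helper itself is never shown; a plausible TF1-style sketch follows, with the scalar labels being assumptions.

import tensorflow as tf

def setup_summary():
    # One variable per logged statistic, each mirrored by a placeholder and an assign
    # op, plus a merged summary op for the FileWriter (see the usage in example #15).
    stat_names = ['Adversary Reward/Episode', 'Steps/Episode', 'Good Agent Reward/Episode']  # assumed labels
    summary_vars = [tf.Variable(0.) for _ in stat_names]
    for name, var in zip(stat_names, summary_vars):
        tf.summary.scalar(name, var)
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [var.assign(ph) for var, ph in zip(summary_vars, summary_placeholders)]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op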
Example #5
File: train.py  Project: baradist/maddpg
def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = get_num_adversaries(env)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        agent_info = [[[]]]  # placeholder for benchmarking info
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # # collect experience
            # for i, agent in enumerate(trainers):
            #     agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                print("train step: {}, episode reward: {}, time: {}".format(
                    train_step, np.mean(episode_rewards[-1:]), round(time.time() - t_start, 3)))
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for displaying learned policies
            time.sleep(0.1)
            env.render()
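Every example reads hyperparameters, paths and mode flags from arglist (the AIWC examples wrap the same kind of defaults in an Argument class). For orientation, a trimmed sketch of the argparse setup from the reference openai/maddpg experiments/train.py is given below, restricted to the attributes these snippets read; fork-specific options such as seed, graph, tracking, actor_lstm/critic_lstm or buffer_size are not part of it.

import argparse

def parse_args():
    # Only the flags that the examples above and below actually consume.
    parser = argparse.ArgumentParser("Reinforcement learning experiments for multiagent environments")
    parser.add_argument("--scenario", type=str, default="simple")
    parser.add_argument("--max-episode-len", type=int, default=25)
    parser.add_argument("--num-episodes", type=int, default=60000)
    parser.add_argument("--num-adversaries", type=int, default=0)
    parser.add_argument("--good-policy", type=str, default="maddpg")
    parser.add_argument("--adv-policy", type=str, default="maddpg")
    parser.add_argument("--exp-name", type=str, default=None)
    parser.add_argument("--save-dir", type=str, default="/tmp/policy/")
    parser.add_argument("--save-rate", type=int, default=1000)
    parser.add_argument("--load-dir", type=str, default="")
    parser.add_argument("--restore", action="store_true", default=False)
    parser.add_argument("--display", action="store_true", default=False)
    parser.add_argument("--benchmark", action="store_true", default=False)
    parser.add_argument("--benchmark-iters", type=int, default=100000)
    parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/")
    return parser.parse_args()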
Example #6
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2 * info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my_posture = []
            self.cur_op_posture = []
            self.cur_ball = []
            self.pre_ball = [0, 0]

            self.state_dim = 2  # relative ball
            self.history_size = 2  # frame history size
            self.action_dim = 2  # 2

            self.arglist = Argument()
            self.obs_shape_n = [(self.state_dim * self.history_size, )
                                for _ in range(1)]  # state dimension
            self.action_space = [
                Discrete(self.action_dim * 2 + 1) for _ in range(1)
            ]
            self.trainers = self.get_trainers(1, self.obs_shape_n,
                                              self.action_space, self.arglist)

            U.initialize()

            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
                print('Loading previous state...')
                U.load_state(self.arglist.load_dir)

            self.obs_n = [
                np.zeros([self.state_dim * self.history_size])
                for _ in range(self.number_of_robots)
            ]  # histories
            self.wheels = np.zeros(self.number_of_robots * 2)
            self.action_n = [
                np.zeros(self.action_dim * 2 + 1)
                for _ in range(self.number_of_robots)
            ]  # not np.zeros(2)

            self.distances = [[i for i in range(5)],
                              [i for i in range(5)]]  # distances to the ball
            self.idxs = [[i for i in range(5)], [i for i in range(5)]]
            self.shoot_plan = [0 for _ in range(self.number_of_robots)]
            self.deadlock_cnt = 0
            self.avoid_deadlock_cnt = 0
            self.global_step = 0
            return
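Examples #1, #4, #6 and #7 pair a Discrete(2 * action_dim + 1) action space (five actions for action_dim = 2) with a wheels array of left/right velocities, but the mapping between the two lies outside the quoted code. The sketch below is purely hypothetical and only illustrates one way such a one-hot action could be decoded into differential-drive wheel speeds; the speed table is not taken from these projects.

import numpy as np

def discrete_action_to_wheels(action_onehot, max_linear_velocity):
    # Hypothetical decoding: 0 = stop, 1 = forward, 2 = backward, 3 = turn left, 4 = turn right.
    v = max_linear_velocity
    table = np.array([[0.0, 0.0],    # stop
                      [v, v],        # forward
                      [-v, -v],      # backward
                      [-v, v],       # turn left
                      [v, -v]])      # turn right
    left_wheel, right_wheel = table[int(np.argmax(action_onehot))]
    return left_wheel, right_wheel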
Example #7
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2 * info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my_posture = []
            self.cur_op_posture = []
            self.cur_ball = []
            self.pre_ball = [0, 0]

            self.state_dim = 5  # ball, goal, theta
            self.history_size = 2  # frame history size
            self.action_dim = 2  # 2

            self.arglist = Argument()
            self.obs_shape_n = [(self.state_dim * self.history_size, )
                                for _ in range(1)]  # state dimension
            self.action_space = [
                Discrete(self.action_dim * 2 + 1) for _ in range(1)
            ]
            self.trainers = self.get_trainers(1, self.obs_shape_n,
                                              self.action_space, self.arglist)

            U.initialize()

            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
                print('Loading previous state...')
                U.load_state(self.arglist.load_dir)

            self.episode_rewards = [0.0]  # sum of rewards for all agents
            self.agent_rewards = [[0.0] for _ in range(self.number_of_robots)
                                  ]  # individual agent reward
            self.final_ep_rewards = []  # sum of rewards for training curve
            self.final_ep_ag_rewards = []  # agent rewards for training curve
            self.agent_info = [[[]]]  # placeholder for benchmarking info
            self.obs_n = [
                np.zeros([self.state_dim * self.history_size])
                for _ in range(self.number_of_robots)
            ]  # histories
            self.wheels = np.zeros(self.number_of_robots * 2)
            self.action_n = [
                np.zeros(self.action_dim * 2 + 1)
                for _ in range(self.number_of_robots)
            ]  # not np.zeros(2)
            return
Example #8
File: train.py  Project: chshong/rmaddpg
    def init(self, arglist, env):
        num_thread = 1
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_thread,
                                   intra_op_parallelism_threads=num_thread)
        self.sess = tf.InteractiveSession(config=tf_config)

        # To make sure that training and testing are based on diff seeds
        if arglist.restore:
            create_seed(np.random.randint(2))
        else:
            create_seed(arglist.seed)

        # Create agent trainers
        self.obs_shape_n = [
            env.observation_space[i].shape for i in range(env.n)
        ]
        self.num_adversaries = min(env.n, arglist.num_adversaries)
        self.trainers = get_trainers(env, self.num_adversaries,
                                     self.obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        self.episode_rewards = [0.0]  # sum of rewards for all agents
        self.agent_rewards = [[0.0]
                              for _ in range(env.n)]  # individual agent reward
        self.final_ep_rewards = []  # sum of rewards for training curve
        self.final_ep_ag_rewards = []  # agent rewards for training curve
        self.agent_info = [[[]]]  # placeholder for benchmarking info
        self.saver = tf.train.Saver()
        self.obs_n = env.reset()
        self.train_step = 0
        self.t_start = time.time()
        self.new_episode = True  # start of a new episode (used for replay buffer)
        self.start_saving_comm = False

        if arglist.graph:
            print("Setting up graph writer!")
            self.writer = tf.summary.FileWriter("learning_curves/graph",
                                                self.sess.graph)

        if arglist.analysis:
            print("Starting analysis on {}...".format(arglist.analysis))
            if arglist.analysis != 'video':
                analyze.run_analysis(arglist, env, self.trainers)
            return  # should be a single run
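create_seed in examples #8 and #10 is another undefined helper. A plausible sketch that seeds every RNG the training loop touches is shown below; the actual implementation in chshong/rmaddpg may differ.

import random
import numpy as np
import tensorflow as tf

def create_seed(seed):
    # Seed Python, NumPy and TensorFlow so training/testing runs are reproducible;
    # example #8 deliberately passes a random seed when restoring a trained model.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)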
Example #9
    def train(self):
        print("==========================================================")
        print("Initializing constraint model training...")
        print("==========================================================")
        U.initialize()

        for epoch in range(self.epochs):
            # Just sample episodes for the whole epoch
            self._sample_steps(self.steps_per_epoch)

            # Do the update from memory
            '''if len(self.replay_buffer) < self.max_replay_buffer:  # replay buffer is not large enough
                continue
            if not epoch % 100 == 0:  # only update every 100 steps
                continue'''
            self.replay_sample_index = self.replay_buffer.make_index(
                self.batch_size)
            # collect replay sample from all agents
            index = self.replay_sample_index

            action, obs, c, c_next = self.replay_buffer.sample_index(index)
            obs = np.squeeze(obs, axis=1)
            action = np.squeeze(action, axis=1)
            c = np.squeeze(c, axis=1)
            c_next = np.squeeze(c_next, axis=1)

            # train the c_next network
            c_next_loss = [
                self.c_next_train[_](*([obs] + [action] + [c] + [c_next]))
                for _ in range(self.num_constraints)
            ]

            self.replay_buffer.clear()
            self._train_global_step += 1

            print(
                f"Finished epoch {epoch} with losses: {c_next_loss}. Running validation ..."
            )
            print("----------------------------------------------------------")

        print("==========================================================")
Example #10
def train(arglist):
    # To make sure that training and testing are based on diff seeds
    if arglist.restore:
        create_seed(np.random.randint(2))
    else:
        create_seed(arglist.seed)

    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        if arglist.analysis:
            print("Starting analysis on {}...".format(arglist.analysis))
            if arglist.analysis != 'video':
                analyze.run_analysis(arglist, env, trainers)
            return # should be a single run

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        new_episode = True # start of a new episode (used for replay buffer)
        start_saving_comm = False

        if arglist.graph:
            print("Setting up graph writer!")
            writer = tf.summary.FileWriter("learning_curves/graph",sess.graph)

        print('Starting iterations...')
        while True:
            if arglist.actor_lstm:
                # get critic input states
                p_in_c_n, p_in_h_n = get_lstm_states('p', trainers) # num_trainers x 1 x 1 x 64
            if arglist.critic_lstm:
                q_in_c_n, q_in_h_n = get_lstm_states('q', trainers) # num_trainers x 1 x 1 x 64

            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            if arglist.critic_lstm:
                # get critic output states
                p_states = [p_in_c_n, p_in_h_n] if arglist.actor_lstm else []
                update_critic_lstm(trainers, obs_n, action_n, p_states)
                q_out_c_n, q_out_h_n = get_lstm_states('q', trainers) # num_trainers x 1 x 1 x 64
            if arglist.actor_lstm:
                p_out_c_n, p_out_h_n = get_lstm_states('p', trainers) # num_trainers x 1 x 1 x 64

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                num_episodes = len(episode_rewards)
                # do this every iteration
                if arglist.critic_lstm and arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    p_in_c_n[i][0], p_in_h_n[i][0],
                                    p_out_c_n[i][0], p_out_h_n[i][0],
                                    q_in_c_n[i][0], q_in_h_n[i][0],
                                    q_out_c_n[i][0], q_out_h_n[i][0], new_episode)
                elif arglist.critic_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    q_in_c_n[i][0], q_in_h_n[i][0],
                                    q_out_c_n[i][0], q_out_h_n[i][0],new_episode)
                elif arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    p_in_c_n[i][0], p_in_h_n[i][0],
                                    p_out_c_n[i][0], p_out_h_n[i][0],
                                    new_episode)
                else:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    new_episode)

                obs_n = new_obs_n

            # Adding rewards
            if arglist.tracking:
                for i, a in enumerate(trainers):
                    if arglist.num_episodes - len(episode_rewards) <= 1000:
                        a.tracker.record_information("goal", np.array(env.world.landmarks[0].state.p_pos))
                        a.tracker.record_information("position",np.array(env.world.agents[i].state.p_pos))
                    a.tracker.record_information("ag_reward", rew_n[i])
                    a.tracker.record_information("team_dist_reward", info_n["team_dist"][i])
                    a.tracker.record_information("team_diff_reward", info_n["team_diff"][i])

            # Closing graph writer
            if arglist.graph:
                writer.close()
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                new_episode = True
                num_episodes = len(episode_rewards)
                obs_n = env.reset()
                # reset trainers
                if arglist.actor_lstm or arglist.critic_lstm:
                    for agent in trainers:
                        agent.reset_lstm()
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            else:
                new_episode=False

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None

            # get same episode sampling
            if arglist.sync_sampling:
                inds = [random.randint(0, len(trainers[0].replay_buffer._storage)-1) for i in range(arglist.batch_size)]
            else:
                inds = None

            for agent in trainers:
                # if arglist.lstm:
                #     agent.preupdate(inds=inds)
                # else:
                agent.preupdate(inds)
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                if loss is None: continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                # continue

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # U.save_state(arglist.save_dir, saver=saver)
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.save()

                rew_file_name = "rewards/" + arglist.commit_num + "_rewards.pkl"
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = "rewards/" + arglist.commit_num + "_agrewards.pkl"
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Example #11
def train(arglist):
    with U.single_threaded_session():
        # [Initialization]
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize (Tensorflow initialization procedure)
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        # Parameters initialization
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]  # Get Action from Policy training.
            # environment step according to actions
            new_obs_n, rew_n, done_n, info_n = env.step(
                action_n
            )  # Receive the observation, the reward, the done and the information from the simulation environment.
            episode_step += 1
            done = all(done_n)  # Check if all tasks have been done.
            terminal = (episode_step >= arglist.max_episode_len
                        )  # Check the timeout.
            # record experience to agents
            for i, agent in enumerate(
                    trainers
            ):  # The "done" may be the actions which has been executed at the past.
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i],
                                 terminal)  # Record for the experience replay.
            obs_n = new_obs_n  # Update the current observation

            for i, rew in enumerate(
                    rew_n
            ):  # Update the total rewards and each agent's rewards
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:  # Task finished or timeout, restart the simulation environment again.
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:  # Save the agents' information.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)  # Delay.
                env.render()  # Displaying the environment if necessary.
                continue

            # update all trainers, if not in display or benchmark mode [Important]
            loss = None
            for agent in trainers:
                agent.preupdate(
                )  # Clear the indices randomly chosen by 'make_index' --> 'agent.replay_sample_index = None'
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #12
def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for i in range(env.n)]
        obs_map_shape_n =[[56*86] for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n,arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000
        train_step = 0
        t_start = time.time()
Example #13
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], )
                       for i in range(num_adversaries)]
        action_space_n = [
            env_info["n_actions"] for i in range(num_adversaries)
        ]
        buffer_size = arglist.buffer_size

        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"

        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None,
                     output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()

        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        for agent_id in range(num_agents):  # first pass collects the initial observations and health values
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0

        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif (avail_actions[0] == 1):
                    action_set_execute.append(
                        0)  # if the action is unavailable and the agent is already dead, substitute NO_OP
                else:
                    action_set_execute.append(1)  # if the action is unavailable, substitute STOP

                if (len(avail_actions_ind) == 1
                        and avail_actions_ind[0] == 0):  # check whether this agent is already dead
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base
            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []

            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

            for agent_id in range(num_agents):
                if (agent_id in dead_unit):
                    reward = 0
                elif (action_set_execute[agent_id] !=
                      action_set_actual[agent_id]
                      ):  # the chosen action could not be executed, so a substitute ran; keep the chosen action and give a negative reward
                    reward = -2

                elif (action_set_execute[agent_id] > 5):
                    target_id = action_set_execute[
                        agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[
                        target_id] - reward_hl_en_new[target_id]
                    if (health_reduce_en > 0):
                        if (rew_base > 0):
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done)

            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s" %
                      (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                for agent_id in range(num_agents):  # first pass collects the initial observations and health values
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if (step == arglist.buffer_size):
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(
                    step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".
                          format(step, len(episode_rewards),
                                 np.mean(
                                     episode_rewards[-arglist.save_rate:])))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}"
                        .format(step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
Example #14
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # print("[action] " + ", ".join(["agent {i}: {action}".format(i=i, action=list(action_n[i])) for i in range(len(action_n))]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(
                            len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(
                        input_file_name, output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))

                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(
                                len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir,
                                             'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(
                            len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                             saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or
                             (len(episode_rewards) % arglist.save_rate == 0)):
                # print statement depends on whether or not there are adversaries
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                if num_adversaries == 0:
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                    pass
                else:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index]
                                    [-arglist.save_rate:]))
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    #     [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir,
                                               'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #15
def train(arglist):
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        saver = tf.train.Saver()
        # Initialize
        U.initialize()
        summary_writer = tf.summary.FileWriter(arglist.summary_dir, sess.graph)
        summary_placeholders, update_ops, summary_op = setup_summary()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            #saver.restore(sess, "/home/sugon/Peixian/maddpg_peixian/maddpg/experiments/tmp/policy/simple_comm_-4166440")
            #print("successfully restored")

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=3)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        adversary_rewards = 0.0
        goodagent_rewards = 0.0

        print('Starting iterations...')
        while True:
            #input('...')
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                #print (i,":",rew_n[i])
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
                if i < num_adversaries:
                    adversary_rewards += rew
                else:
                    goodagent_rewards += rew

            if done or terminal:
                if done:
                    print("*" * 20)
                    print("done:", episode_step)

                stats = [adversary_rewards, episode_step, goodagent_rewards]
                for i in range(len(stats)):
                    sess.run(
                        update_ops[i],
                        feed_dict={summary_placeholders[i]: float(stats[i])})
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str,
                                           len(episode_rewards) + 1)

                obs_n = env.reset()
                episode_step = 0
                adversary_rewards = 0.0
                goodagent_rewards = 0.0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if (done or terminal) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                U.save_state(arglist.save_dir, train_step, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
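Example #15 depends on a setup_summary() helper that it never defines. Judging from how the returned placeholders, update ops and merged summary op are fed in the loop above (adversary reward, episode length, good-agent reward, in that order), a TF1-style sketch could look like the following; the summary names are assumptions:

import tensorflow as tf

def setup_summary():
    # One TF1 scalar summary per statistic fed from the training loop above.
    # Sketch only; the real helper may use different names.
    adversary_reward = tf.Variable(0., trainable=False)
    episode_length = tf.Variable(0., trainable=False)
    good_agent_reward = tf.Variable(0., trainable=False)

    tf.summary.scalar('Adversary_Reward/Episode', adversary_reward)
    tf.summary.scalar('Episode_Length', episode_length)
    tf.summary.scalar('Good_Agent_Reward/Episode', good_agent_reward)

    summary_vars = [adversary_reward, episode_length, good_agent_reward]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [
        summary_vars[i].assign(summary_placeholders[i])
        for i in range(len(summary_vars))
    ]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op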
Example #16
def train(arglist):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset(
        )  # so that env.observation_space is initialized so trainers can be initialized
        # Create agent trainers
        num_adversaries = arglist.num_adversaries
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        print("env.observation_space:", env.observation_space)
        print("num adversaries: ", num_adversaries, ", env.n (num agents): ",
              env.n)

        #need to ensure that the trainer is in correct order. pacman in front
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir + ("{}".format(
                arglist.load_episode))
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if arglist.display and arglist.load:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for i in range(env.n)
                               ]  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        episode_step = 0
        train_step = 0
        total_win = [0]
        final_win = []
        total_lose = [0]
        final_lose = []
        t_start = time.time()
        loss_list = {}
        for i in range(env.n):
            loss_list[i] = [[] for _ in range(6)]

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done, info_n, win, lose = env.step(action_n)
            episode_step += 1
            terminal = (episode_step >= arglist.max_episode_len)
            # print("obs_n", obs_n)
            # print("new_obs_n", new_obs_n)
            #print("action_n", action_n)
            # print("rew_n",episode_step, rew_n)
            # print("done", done)
            # print("terminal", terminal)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done, terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                if arglist.display:
                    env.render()
                obs_n = env.reset()
                episode_step = 0
                if win:
                    total_win[-1] += 1
                if lose:
                    total_lose[-1] += 1
                total_win.append(0)
                total_lose.append(0)
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            # if train_step % 1000 == 0:
            #     print(train_step)
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None

            for agent in trainers:
                agent.preupdate()
            for ind, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if train_step % 10000 == 0 and loss is not None:
                    for i in range(len(loss)):
                        loss_list[ind][i].append(loss[i])

            # save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                # The checkpoint path is tagged with the episode count so that
                # every save is kept separately (the Saver uses max_to_keep=None).
                saving = arglist.save_dir + "{}".format(len(episode_rewards))
                U.save_state(saving, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, number of wins {}, number of lose {}, "
                        "time: {}".format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]), [
                                np.mean(rew[-arglist.save_rate:])
                                for rew in agent_rewards
                            ], np.sum(total_win[-arglist.save_rate:]),
                            np.sum(total_lose[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                final_win.append(np.sum(total_win[-arglist.save_rate:]))
                final_lose.append(np.sum(total_lose[-arglist.save_rate:]))

                for i, rew in enumerate(agent_rewards):
                    final_ep_ag_rewards[i].append(
                        np.mean(rew[-arglist.save_rate:]))

                ep_reward_df = pd.DataFrame(final_ep_rewards)
                ep_ag_reward_df = pd.DataFrame(final_ep_ag_rewards)
                win_df = pd.DataFrame(final_win)
                lose_df = pd.DataFrame(final_lose)
                for i in range(env.n):
                    trainer_loss_df = pd.DataFrame(loss_list[i]).transpose()
                    trainer_loss_df.to_csv(arglist.plots_dir +
                                           arglist.exp_name +
                                           '_trainer_loss_df_{}.csv'.format(i))

                ep_reward_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                    '_rewards.csv')
                ep_ag_reward_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                       '_agrewards.csv')
                win_df.to_csv(arglist.plots_dir + arglist.exp_name +
                              '_win_df.csv')
                lose_df.to_csv(arglist.plots_dir + arglist.exp_name +
                               '_lose_df.csv')

            # saves final episode reward for plotting training curve later

            if len(episode_rewards) > arglist.num_episodes:
                # rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                # with open(rew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_rewards, fp)
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                # with open(agrew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
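Every snippet here calls a get_trainers(...) helper without showing it. The sketch below follows the reference openai/maddpg experiments script: one MADDPGAgentTrainer per agent, adversaries first, with local_q_func switched on when a plain-DDPG baseline policy is requested. The import path and the mlp_model builder are assumptions, and some examples above use a different signature (extra summary writer or map-observation shape), so treat this only as the common shape of the helper:

# Assumed repo layout, following the reference implementation.
from maddpg.trainer.maddpg import MADDPGAgentTrainer

def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    # mlp_model is assumed to be the MLP policy/critic builder defined in the
    # same script, as in the reference implementation.
    trainers = []
    # Adversaries come first (indices 0 .. num_adversaries - 1) ...
    for i in range(num_adversaries):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i,
            arglist, local_q_func=(arglist.adv_policy == 'ddpg')))
    # ... followed by the "good" agents.
    for i in range(num_adversaries, env.n):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i,
            arglist, local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers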
Example #17
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        board_write_path = './board/' + datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(board_write_path)
        board_writer = tf.summary.FileWriter(board_write_path)

        trainers = get_trainers(env, obs_shape_n, arglist, board_writer)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        evaluate_rewards = []
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            action_n_saved = deepcopy(action_n)

            if arglist.display:
                for idx, (agent, obs) in enumerate(zip(trainers, obs_n)):
                    action_result = agent.p_debug['p_values'](obs[None])[0]
                    print("agent_%d" % idx, action_result)

            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            action_n = action_n_saved
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            if arglist.display:
                continue

            # update all trainers, if not in display or benchmark mode
            if train_step % 100 == 0 and len(trainers[0].replay_buffer) >= trainers[0].max_replay_buffer_len:
                loss = None
                replay_sample_index = trainers[0].get_memory_index()

                obs_n_sampled = []
                obs_next_n_sampled = []
                act_n_sampled = []
                for agent in trainers:
                    agent.set_memory_index(replay_sample_index)
                    obs_sampled, act_sampled, _, obs_next_sampled, _ = agent.get_replay_data()
                    obs_n_sampled.append(obs_sampled)
                    obs_next_n_sampled.append(obs_next_sampled)
                    act_n_sampled.append(act_sampled)
                target_act_next_n = []
                for agent in trainers:
                    target_act_next_n.append(agent.get_target_act(obs_next_n_sampled))

                for agent in trainers:
                    loss = agent.update(train_step, obs_n_sampled, act_n_sampled, obs_next_n_sampled,
                                        target_act_next_n)

            if np.isnan(episode_rewards[-1]):
                print("NaN occurred!")
                break

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

                evaluate_rewards.append(evaluate(arglist, trainers, is_toy=True))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                with open(arglist.plots_dir + arglist.exp_name + "_evaluate_rewards.pkl", 'wb') as fp:
                    pickle.dump(evaluate_rewards, fp)
                break
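The update block in Example #17 draws one replay index set from the first trainer and pushes it into every other trainer before sampling, so all agents train on the same joint transitions. The snippet below is a self-contained illustration of that shared-index idea with plain per-agent buffers; the names are illustrative, not the trainers' actual API:

import numpy as np

def sample_joint_batch(buffers, batch_size, rng=np.random):
    # Draw one common index set and reuse it for every agent's buffer, so
    # entry k of each returned batch refers to the same joint transition.
    idx = rng.randint(0, len(buffers[0]), size=batch_size)
    return [np.asarray(buf)[idx] for buf in buffers]

# Toy usage: three agents, 100 parallel (obs, next_obs) pairs each.
buffers = [[(o, o + 1) for o in range(100)] for _ in range(3)]
batches = sample_joint_batch(buffers, batch_size=4)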
Example #18
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        current_game_experiences = []
        t0 = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            #for i, agent in enumerate(trainers):
                #agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            current_game_experiences.append((obs_n, action_n, rew_n, new_obs_n, done_n, terminal))
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                #U.save_state(arglist.save_dir, saver=saver)
                #print("SAVED")
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

                if len(episode_rewards) % 200 == 0 and not arglist.display:
                    fname = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S.%f') + ".pkl"
                    with open("../../worker_experiences/" + fname, 'wb') as fp:
                        print("\n[%d] Finished 200 games in %.2f seconds" % (len(episode_rewards), time.time() - t0))
                        pickle.dump(current_game_experiences, fp)
                        print("Saved experience file " + fname)
                        print('Loading latest networks...')
                        t0 = time.time()
                        try:
                            U.load_state(arglist.load_dir)
                            print("Latest networks loaded in %.2f seconds" % (time.time() - t0))
                            t0 = time.time()
                        except tf.errors.DataLossError:
                            print("Couldn't read latest network, it's probably being written...")

                    current_game_experiences = []

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.05)
                env.render()
                if arglist.video:
                    video_maker.save_frame(episode_step)
                if terminal and len(episode_rewards) % 5 == 0:
                    if arglist.video:
                        video_maker.combine_frames_to_video("../../videos/test_video.mp4")
                        clear_folder("../../frames/")
                    t0 = time.time()
                    try:
                        U.load_state(arglist.load_dir)
                        print("Latest networks loaded in %.2f seconds" % (time.time() - t0))
                        t0 = time.time()
                    except tf.errors.DataLossError:
                        print("Couldn't read latest network, it's probably being written...")
                continue

            # update all trainers, if not in display or benchmark mode
            #loss = None
            #for agent in trainers:
            #    agent.preupdate()
            #for agent in trainers:
            #    loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                #U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode abs-reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(np.abs(episode_rewards[-arglist.save_rate:])), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode abs-reward: {}, agent episode abs-reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(np.abs(episode_rewards[-arglist.save_rate:])),
                        [np.mean(np.abs(rew[-arglist.save_rate:])) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
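Example #18 is the worker half of an asynchronous setup: it only collects experience, pickles it in 200-game chunks under ../../worker_experiences/, and periodically reloads networks written elsewhere. A hypothetical learner-side loop that drains those files into the trainers' replay buffers (not shown in the source) might look like this:

import glob
import os
import pickle

# Hypothetical learner-side counterpart to Example #18: replay the pickled
# transition tuples (obs_n, action_n, rew_n, new_obs_n, done_n, terminal)
# that the workers drop into worker_experiences/ into each trainer's buffer.
def ingest_worker_experiences(trainers, experience_dir="../../worker_experiences/"):
    for path in sorted(glob.glob(os.path.join(experience_dir, "*.pkl"))):
        with open(path, "rb") as fp:
            transitions = pickle.load(fp)
        for obs_n, action_n, rew_n, new_obs_n, done_n, terminal in transitions:
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
        os.remove(path)  # consume the file so it is not replayed twice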
Example #19
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None,
                              num_cpu=1,
                              make_default=False,
                              graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #         Create environment              #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #        Create agent trainers            #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #              Initialize                 #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')

            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name

            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)

        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]

        # This was changed so that a reward can be tracked for fixed policy agents as well as learning agents
        # Individual agent reward
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []

        # Agent rewards for training curve
        final_ep_ag_rewards = []

        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(
                    len(episode_rewards) + prev_ep_ct, episode_step, new_obs_n,
                    rew_n, done_n, info_n)

            # Update information
            episode_step += 1

            # Check if all agents are done
            # done = all(done_n)

            # Check if any agents are done
            done = any(done_n)

            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format(
                        [rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])

                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, "
                          "mean episode reward: {}, time: {}".format(
                              len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' +
                                    '_' + str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist,
                             ep_ct,
                             "agent_{}".format(i),
                             loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(
                    len(episode_rewards),
                    time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?

                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(reward[-arglist.save_rate:])
                                    for reward in agent_rewards
                                ], round(time.time() - t_start, 3)))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(reward[-arglist.save_rate:]))

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State",
                                arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
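Example #19 logs per-agent losses through a log_loss(arglist, ep_ct, name, ...) helper that is not included. A sketch that is call-compatible with the usage above (initialize=True to start a fresh file, then loss=<scalar> per update) is given below; the file naming and column layout are assumptions:

import csv
import os

def log_loss(arglist, ep_ct, agent_name, loss=None, initialize=False):
    # One CSV per agent and run; initialize=True writes the header,
    # subsequent calls append one loss value per row.
    path = os.path.join(
        arglist.plots_dir,
        "{}_{}_{}_loss.csv".format(arglist.exp_name, agent_name, ep_ct))
    if initialize:
        with open(path, "w", newline="") as f:
            csv.writer(f).writerow(["loss"])
        return
    with open(path, "a", newline="") as f:
        csv.writer(f).writerow([loss])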
Example #20
    strargs = ['--benchmark', '--deterministic'] + unknown_args
    arglist = parse_args(strargs)

    #tf.reset_default_graph()
    #tf.InteractiveSession().as_default()
    with tf.Session().as_default():
        # Create environment
        env = make_env('simple_spread', arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        #print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        os.path.splitext(args.input_file)

        print('Loading previous state...')
        U.load_state(args.policy_file)

        actions = trainers[0].act(states1[0])
        assert np.allclose(actions1[0], actions)
        actions = trainers[1].act(states2[0])
        assert np.allclose(actions2[0], actions)
        actions = trainers[2].act(states3[0])
        assert np.allclose(actions3[0], actions)

        h1_values = trainers[0].p_debug['h1_values']
        h2_values = trainers[0].p_debug['h2_values']
Example #21
def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for _ in range(env.n)]
        obs_map_shape_n = [[56 * 86] for _ in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            #action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            action_n = []
            for agent, obs in zip(trainers, obs_n):
                #print(obs)
                t = agent.action(obs)
                d = np.argmax(t)
                if d % 5 == 4:
                    rt = random.randint(0, 20)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d - rt - 1]
                        t[d - rt - 1] = swap
                else:
                    rt = random.randint(0, 80)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d // 5 * 5 + rt]
                        t[d // 5 * 5 + rt] = swap

                action_n.append(t)

            #print(action_n)
            # environment step
            new_obs_n, rew_n, done_n, info_n = env._step(action_n)
            
            #print(rew_n)

            episode_step += 1
            env.training_episode = episode_step

            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env._reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.online_display or arglist.display:
                time.sleep(0.01)
                #if rew_n[2]>0: pdb.set_trace()
                env._render(close=False)
                print(rew_n)
                # if (rew_n[2]>0) or (rew_n[0]>0) or (rew_n[1]>0):
                #     pdb.set_trace()
                #pdb.set_trace()

                if arglist.display:
                    continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
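The action loop in Example #21 perturbs the greedy action by occasionally swapping the arg-max entry with another entry of its 5-wide block. Factored out as a helper, the same trick reads as follows (the function name and factoring are mine; group size and swap probabilities mirror the inline code):

import random
import numpy as np

def perturb_action(t, group=5):
    # With a small probability, swap the highest-scoring entry with another
    # entry in the same 5-wide block, so the executed action is occasionally
    # perturbed for exploration.
    t = np.array(t, copy=True)
    d = int(np.argmax(t))
    if d % group == group - 1:          # last slot of its block
        rt = random.randint(0, 20)      # ~4/21 chance of swapping
        if rt < 4:
            t[d], t[d - rt - 1] = t[d - rt - 1], t[d]
    else:
        rt = random.randint(0, 80)      # ~4/81 chance of swapping
        if rt < 4:
            j = d // group * group + rt
            t[d], t[j] = t[j], t[d]
    return t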
Example #22
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        print("number of adversaries is: ", num_adversaries)
        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            print("path is: ", arglist.load_dir)
            print("restoring checkpoints")
            # added for selective training.
            # Make it general for other environments as well later.
            if arglist.scenario == "simple_tag":
                print("inside simple tag")
                # only build a restricted Saver when restoring a subset of agents
                saver = None
                if not arglist.train_adversaries:
                    print("loading only positive")
                    print("number of adversaries are: ", num_adversaries)
                    saver = tf.train.Saver(var_list=tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope="agent_" + str(num_adversaries)))
                    print(
                        "var list is: ",
                        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope="agent_" +
                                          str(num_adversaries)))
                if not arglist.train_positive_agent:
                    print("only loading adversaries")
                    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="agent_0")
                    print("var list is: ", var_list)
                    for l in range(1, arglist.num_adversaries):
                        var_list += tf.get_collection(
                            tf.GraphKeys.GLOBAL_VARIABLES,
                            scope="agent_" + str(l))

                    saver = tf.train.Saver(var_list=var_list)

                if saver is not None:
                    U.load_state(arglist.load_dir, saver=saver)
                else:
                    U.load_state(arglist.load_dir)

            else:
                U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward

        if arglist.restore:
            final_ep_rewards = list(
                np.load(arglist.plots_dir + arglist.exp_name +
                        '_episode_rewards.npy'))
            final_ep_ag_rewards = list(
                np.load(arglist.plots_dir + arglist.exp_name +
                        '_agent_rewards.npy'))
            final_ep_ag_rewards = [list(a) for a in final_ep_ag_rewards]
        else:
            final_ep_rewards = []  # sum of rewards for training curve
            # final_ep_ag_rewards = []  # agent rewards for training curve
            final_ep_ag_rewards = [[0.0] for _ in range(env.n)
                                   ]  # agent rewards for training curve

        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("number of agents in the environment are: ", env.n)
        episode_avg_rewards = [0.0]
        agent_avg_rewards = [[0.0] for _ in range(env.n)]
        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0

                # this should perhaps be done later.
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            # for agent in trainers:
            #     agent.preupdate()
            # for agent in trainers:
            #     loss = agent.update(trainers, train_step)

            for m in range(0, len(trainers)):
                agent = trainers[m]

                if not arglist.train_adversaries and m >= num_adversaries:
                    # print("updating positive")
                    agent.preupdate()

                if not arglist.train_positive_agent and m < num_adversaries:
                    # print("updating adversary")
                    agent.preupdate()

                if arglist.train_positive_agent and arglist.train_adversaries:
                    # print("updating both")
                    agent.preupdate()

            for m in range(0, len(trainers)):
                agent = trainers[m]

                if not arglist.train_adversaries and m >= num_adversaries:
                    loss = agent.update(trainers, train_step)

                if not arglist.train_positive_agent and m < num_adversaries:
                    loss = agent.update(trainers, train_step)

                if arglist.train_positive_agent and arglist.train_adversaries:
                    loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:-1]))
                # for rew in agent_rewards:
                #     final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

                # for rew in agent_rewards:
                for j in range(len(agent_rewards)):
                    rew = agent_rewards[j]
                    final_ep_ag_rewards[j].append(
                        np.mean(rew[-arglist.save_rate:-1]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))

                agent_rewards = np.array(final_ep_ag_rewards)
                episode_rewards = np.array(final_ep_rewards)

                np.save(
                    arglist.plots_dir + arglist.exp_name +
                    '_agent_rewards.npy', agent_rewards)
                np.save(
                    arglist.plots_dir + arglist.exp_name +
                    '_episode_rewards.npy', episode_rewards)

                fig, ax = plt.subplots()
                for k in range(len(agent_rewards)):
                    ax.plot(agent_rewards[k], label="agent_" + str(k))

                ax.plot(episode_rewards, label="total")

                ax.legend()
                plt.savefig(arglist.plots_dir + arglist.exp_name + '_plot.png')
                plt.show()

                break
Example #23
0
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    # global replay_buffer
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agents networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        ####changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

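        # This example appears to be an MPI communication smoke test (presumably
        # run with three ranks): dummy payloads travel around a ring
        # (0 -> 1 -> 2 -> 0) via non-blocking irecv/test polling, and rank 2
        # ships a slice of its trainable variables to rank 0, which loads them
        # with var.load(). No environment stepping happens in this example.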
        if comm_rank == 0:
            req = None
            wait_flag = False
            data = 0
            a = 0
            number = 0

            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)

            U.initialize()
            while True:
                if not wait_flag:
                    req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    wait_flag = True
                else:
                    if a >= 3:
                        break
                    data_recv = req.test()
                    if data_recv[0]:
                        a += 1
                        wait_flag = False
                        i = 0
                        j = 0
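                        # Load only the second half (indices 12-23) of each
                        # 24-variable block. Assumption: each agent owns 24
                        # trainable variables, and this slice matches the subset
                        # rank 2 extracts and sends below.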
                        for var in tf.trainable_variables():
                            if 11 < (i % 24) < 24:
                                var.load(data_recv[1][j], sess)
                                j += 1
                            i += 1
                        print("rank0 updata param:000000000000000000000, step:", a)
                    else:
                        if number<=2:
                            data = data + number * 100
                            comm.send(data, dest=(comm_rank + 1) % comm_size, tag=11)
                            number+=1
                            print("rank:{}, step:{}, send data:{}".format(comm_rank, a, data))

        if comm_rank == 1:
            wait_flag = False
            req = None
            sample = 0
            step = 0
            data = 0
            while True:
                if not wait_flag:
                    req = comm.irecv(source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        sample += 1
                        wait_flag = False
                        print("rank:{}, step:{}, recv data:{}".format(comm_rank, step, data_recv[1]))
                        if sample == 3:
                            break
                    else:
                        wait_flag = True
                        #if step >= 3:
                        #    break
                        if step <= 2:
                            data = data + step * 10000
                            comm.send(data, dest=(comm_rank + 1) % comm_size, tag=11)
                            step += 1
                            print("rank:{}, step:{}, send data:{}".format(comm_rank, step, data))

        if comm_rank == 2:
            step = 0
            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)
            U.initialize()

            while True:
                if step >= 3:
                    break
                else:
                    data_recv = comm.recv(source=(comm_rank - 1) % comm_size, tag=11)
                    print("rank:{}, step:{}, recv data:{}".format(comm_rank, step, data_recv))

                    param = []
                    i = 0
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    comm.send(param, dest=(comm_rank + 1) % comm_size, tag=11)
                    step += 1
                    print("rank2 send param:22222222222222222222, step:", step)
Example #24
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        state_shape_n = [(64, ) for i in range(env.n)]
        trainers = get_trainers(env, num_adversaries, obs_shape_n,
                                state_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        episode_begin_num = 0

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            fname = './learning_curves/' + arglist.exp_name + '_rewards.pkl'
            with open(fname, 'rb') as fp:
                final_ep_rewards = pickle.load(fp)
            fname = './learning_curves/' + arglist.exp_name + '_agrewards.pkl'
            with open(fname, 'rb') as fp:
                final_ep_ag_rewards = pickle.load(fp)
            # the loaded histories are only used to recover the episode count;
            # the running lists below are re-initialized
            episode_begin_num = arglist.save_rate * len(final_ep_rewards)

        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        obs_n = env.reset()
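        # These trainers appear to be recurrent: each one carries a policy hidden
        # state (p_init_state) and a prediction state (init_pred) across steps,
        # both of which are re-initialized on every environment reset.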
        state_n = [agent.p_init_state(1) for agent in trainers]
        pred_n = [agent.init_pred(1) for agent in trainers]

        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            ## get action
            temp = [
                agent.take_action(obs, state,
                                  pred) for agent, obs, state, pred in zip(
                                      trainers, obs_n, state_n, pred_n)
            ]
            action_n = [x[0] for x in temp]
            new_state_n = [x[1] for x in temp]
            gru_out_n = [x[2] for x in temp]
            new_pred_n = [
                agent.predict(act[None], gru_out)
                for agent, act, gru_out in zip(trainers, action_n, gru_out_n)
            ]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            ## need to be modified
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            state_n = new_state_n
            # pred_n = [x.eval() for x in new_pred_n]
            pred_n = new_pred_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                state_n = [agent.p_init_state(1) for agent in trainers]
                pred_n = [agent.init_pred(1) for agent in trainers]
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.05)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step, arglist.step_size,
                                    arglist.burn_in_step)
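            # step_size / burn_in_step are passed to update(), consistent with a
            # truncated-sequence update for the recurrent policies (an assumption;
            # the trainer implementation is not shown here).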

            # save model, display training output
            episode_num = len(episode_rewards) + episode_begin_num
            if terminal and (episode_num % arglist.save_rate == 0):
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, episode_num,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, episode_num,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(episode_num))

                U.save_state(arglist.save_dir, saver=saver)

            if episode_num > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #25
0
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    # global replay_buffer
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agents networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        ####changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

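        # Distributed layout in this example: every rank other than 0 and 1 runs
        # an environment actor, rank 1 hosts the shared replay buffer and batches
        # incoming transitions, and rank 0 runs the learners and broadcasts the
        # updated parameters back to the actor ranks.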
        if comm_rank != 0 and comm_rank != 1:
            req = None
            wait_flag = False

            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]
            # load the model
            var_list_n = []
            for actor in actors:
                var_list_n.extend(actor.get_variable_list())
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables(
                env)
            obs_n = env.reset()
            step = 0
            episode_step = 0
            sample_number = 0
            t_start = time.time()
            updata_time = 0
            print('Starting iterations...')

            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0

            while True:
                if not wait_flag:
                    #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    req = comm.irecv(350000, source=0, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        wait_flag = False
                        if data_recv[1] == 'finish':
                            #finish = True
                            comm.send('finish', dest=1, tag=11)
                            break
                        else:
                            update_start = time.time()
                            i = 0
                            j = 0
                            for var in tf.trainable_variables():
                                if 11 < (i % 24) < 24:
                                    var.load(data_recv[1][j], sess)
                                    j += 1
                                i += 1

                            #for var in var_list:
                            #    var.load(data_recv[1][i], sess)
                            #    i += 1
                            #print("111111111111111111111111,load param")
                            #for i, actor in enumerate(actors):
                            #    actor.load_weights(data_recv[1][i], sess)
                            update_end = time.time()
                            #print("step:{}, rank0_update_end_time:{}".format(step, update_end))
                            updata_time += (update_end - update_start)
                            step += 1
                    else:
                        wait_flag = True
                        # get action
                        action_n = [
                            agent.action(obs)
                            for agent, obs in zip(actors, obs_n)
                        ]
                        # environment step
                        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                        episode_step += 1
                        # changed by liyuan
                        done = any(done_n)
                        terminal = (episode_step >= arglist.max_episode_len)
                        ###liyuan: compute the average win rate
                        if green_leave_screen(env) or adversary_all_die(
                                env) or adversary_leave_screen(env):
                            terminal = True

                        if adversary_all_die(env):
                            green_win += 1
                        if green_leave_screen(env):
                            invalid_train += 1
                            green_leave += 1
                        if adversary_leave_screen(env):
                            red_leave += 1

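                        # Reward shaping for the adversary ("red") team: a penalty
                        # for running out of time or being wiped out, and a +200
                        # bonus plus a small time bonus when the episode ends with
                        # done (counted as a red win below).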
                        if episode_step >= arglist.max_episode_len:
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 50

                        if adversary_all_die(env):
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 100

                        if done:
                            red_win = red_win + 1
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] += 200
                                    rew_n[i] += (
                                        arglist.max_episode_len -
                                        episode_step) / arglist.max_episode_len

                        #send data
                        data = [obs_n, action_n, rew_n, new_obs_n, done_n]
                        comm.send(data, dest=1, tag=11)

                        sample_number += 1

                        #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                        obs_n = new_obs_n
                        for i, rew in enumerate(rew_n):
                            episode_rewards[-1] += rew
                            agent_rewards[i][-1] += rew

                        if done or terminal:
                            obs_n = env.reset()
                            episode_step = 0
                            episode_rewards.append(0)
                            for a in agent_rewards:
                                a.append(0)
                            agent_info.append([[]])

                        # save model, display training output
                        if (terminal or done) and (len(episode_rewards) %
                                                   arglist.save_rate == 0):
                            if red_win >= 0.8 * arglist.save_rate:
                                temp_dir = arglist.save_dir + "_" + str(
                                    len(episode_rewards)) + "_" + str(
                                        red_win) + "_{}".format(PID)
                                U.save_state(temp_dir, saver=saver)
                            # print statement depends on whether or not there are adversaries
                            if num_adversaries == 0:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        round(time.time() - t_start, 3)))
                            else:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        [
                                            np.mean(rew[-arglist.save_rate:])
                                            for rew in agent_rewards
                                        ], round(time.time() - t_start, 3)))
                                print(
                                    "Rank  {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}"
                                    .format(comm_rank, red_win, green_win,
                                            red_leave, green_leave))

                                middle_time = time.time()
                                print(
                                    "sample_number:{}, train_step:{}, update_time:{}, total_time:{}"
                                    .format(sample_number, step, updata_time,
                                            middle_time - start_time))
                                mydata = []
                                mydata.append(str(len(episode_rewards)))
                                mydata.append(
                                    str(
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[0]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[1]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[2]
                                                [-arglist.save_rate:])))
                                mydata.append(str(red_win))
                                mydata.append(
                                    str(round(time.time() - t_start, 3)))
                                # the with-block closes the csv file after each write
                                with open('1mydata_{}.csv'.format(comm_rank),
                                          'a', newline='') as out:
                                    csv_write = csv.writer(out, dialect='excel')
                                    csv_write.writerow(mydata)

                            if len(episode_rewards) > 3000:
                                U.save_state(arglist.save_dir, saver=saver)

                            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
                            t_start = time.time()
                            # Keep track of final episode reward
                            final_ep_rewards.append(
                                np.mean(episode_rewards[-arglist.save_rate:]))
                            for rew in agent_rewards:
                                final_ep_ag_rewards.append(
                                    np.mean(rew[-arglist.save_rate:]))

            end_time = time.time()
            print("rank{}_time:{}".format(comm_rank, end_time - start_time))
            print("rank{}_update_time:{}".format(comm_rank, updata_time))
            print("rank{}_step:{}".format(comm_rank, step))

        if comm_rank == 1:
            replay_buffer = ReplayBuffer(1e6)
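            # Rank 1 is the replay server: it polls non-blocking receives from the
            # three actor ranks (sources 2-4 are hard-coded, so this assumes five
            # MPI ranks in total), stores their transitions, and forwards a sampled
            # batch to the learner at rank 0 once enough samples have arrived.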

            wait_flag_1 = False
            wait_flag_2 = False
            wait_flag_3 = False
            req1 = None
            req2 = None
            req3 = None
            sample = 0
            step = 0
            req_list = []
            while True:
                if not wait_flag_1 or not wait_flag_2 or not wait_flag_3:
                    if not wait_flag_1:
                        req1 = comm.irecv(source=2, tag=11)
                        wait_flag_1 = True
                    if not wait_flag_2:
                        req2 = comm.irecv(source=3, tag=11)
                        wait_flag_2 = True
                    if not wait_flag_3:
                        req3 = comm.irecv(source=4, tag=11)
                        wait_flag_3 = True
                else:
                    data_recv_1 = req1.test()
                    data_recv_2 = req2.test()
                    data_recv_3 = req3.test()
                    if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]:
                        if data_recv_1[0]:
                            wait_flag_1 = False
                            if data_recv_1[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_2[0]:
                            wait_flag_2 = False
                            if data_recv_2[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_3[0]:
                            wait_flag_3 = False
                            if data_recv_3[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1
                        '''
                        # time how long it takes to collect 100 samples and then send a batch
                        if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                            start = time.time()
                            replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                            send_data = replay_buffer.sample_index(replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11)
                            sample = 0
                            step += 1
                            end = time.time()
                            print("rank1 send sample time:", end-start)
                        '''

                    else:
                        wait_flag_1 = True
                        wait_flag_2 = True
                        wait_flag_3 = True
                        if (sample // 100 > 0) and len(
                                replay_buffer
                        ) >= arglist.batch_size * arglist.max_episode_len:
                            replay_sample_index = replay_buffer.make_index(
                                arglist.batch_size)
                            send_data = replay_buffer.sample_index(
                                replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=0, tag=11)
                            sample = 0
                            step += 1

            end_time = time.time()
            print("rank1_time:", end_time - start_time)
            print("rank1_step", step)

        if comm_rank == 0:
            extract_time = 0
            step = 0

            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)

            var_list_n = []
            for learner in learners:
                var_list_n.extend(learner.get_variable_list())

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]

            # load the model
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            while True:
                if step >= STEP:
                    for i in range(comm_size - 2):
                        comm.send('finish', dest=(i + 2), tag=11)
                    break
                else:
                    start = time.time()
                    data_recv = comm.recv(source=1, tag=11)

                    for i, agent in enumerate(learners):
                        agent.update(learners, data_recv)

                    #dict_list = []
                    param = []
                    extract_start = time.time()
                    i = 0
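                    # Extract the same variable slice (second half of each
                    # 24-variable block) that the actor ranks load on receipt.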
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    #print("2222222222222222 load weights")
                    #for var in var_list:
                    #   param.append(sess.run(var))

                    extract_end = time.time()
                    extract_time += (extract_end - extract_start)

                    for i in range(comm_size - 2):
                        comm.send(param, dest=(i + 2), tag=11)
                    #print("222222222222222222222222,send param")

                    step += 1
                    end = time.time()
                    #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start))
            end_time = time.time()
            print("rank0_time:", end_time - start_time)
            print("rank0_extract_time:", extract_time)
            print("rank0_step:", step)
Example #26
0
File: train.py  Project: EcustBoy/IMAC
def train(arglist):
    # random.seed(arglist.random_seed)
    # np.random.seed(arglist.random_seed)
    # tf.set_random_seed(arglist.random_seed)

    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        savers = [
            tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers
        ]
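        # One Saver per trainer, scoped to that trainer's variables, so each
        # team's checkpoint can be saved and restored separately under
        # save_dir/team_<i> (see the load/save blocks below).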

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            # U.load_state(arglist.load_dir)
            [
                U.load_state(os.path.join(arglist.load_dir,
                                          'team_{}'.format(i)),
                             saver=saver) for i, saver in enumerate(savers)
            ]

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
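        # The message-passing trainers keep a 4-dimensional message per agent;
        # for ibmac_inter this message_n is fed back into the next action call
        # and stored alongside the transition.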
        if arglist.trainer == 'tarmac' or arglist.trainer == 'reuse_tarmac' or arglist.trainer == 'ibmac_inter':
            message_n = np.zeros([len(obs_n), 4])
        is_training = True

        t_start = time.time()

        writer = tf.summary.FileWriter("graph", U.get_session().graph)
        writer.close()

        writer = SummaryWriter(arglist.save_dir)

        print('Starting iterations...')
        while True:
            # get action
            if arglist.trainer == 'ibmac' or arglist.trainer == 'reuse_ibmac':
                is_inference = False
                if arglist.display or arglist.restore or arglist.benchmark:
                    is_inference = False
                if len(trainers) == 2:
                    action_n1 = trainers[0].action(obs_n[:num_adversaries],
                                                   is_inference=is_inference)
                    action_n2 = trainers[1].action(obs_n[num_adversaries:],
                                                   is_inference=is_inference)
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n = trainers[0].action(obs_n,
                                                  is_inference=is_inference)
                    action_n = [action[0] for action in action_n]
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    action_n1, message_action_n1 = trainers[0].action(
                        obs_n[:num_adversaries], message_n[:num_adversaries])
                    action_n2, message_action_n2 = trainers[1].action(
                        obs_n[num_adversaries:], message_n[num_adversaries:])
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n, message_action_n = trainers[0].action(
                        obs_n, message_n)
                    action_n = [action[0] for action in action_n]
                    message_n = [
                        message_action[0]
                        for message_action in message_action_n
                    ]
            else:
                action_n = [
                    agent.action(obs) for agent, obs in zip(trainers, obs_n)
                ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            if arglist.trainer == 'ibmac':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, action_n, rew_n, new_obs_n,
                                           done_n, terminal)
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           message_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           message_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, message_n, action_n, rew_n,
                                           new_obs_n, done_n, terminal)
            else:
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if loss:
                    if isinstance(agent, IBMACAgentTrainer) or isinstance(
                            agent, ReuseIBMACAgentTrainer):
                        q_loss, p_loss, _, _, _, _, kl_loss = loss
                        writer.add_scalar('agent_{}/loss_kl'.format(i),
                                          kl_loss, train_step)
                    else:
                        q_loss, p_loss, _, _, _, _ = loss
                    writer.add_scalar('agent_{}/loss_policy'.format(i), p_loss,
                                      train_step)
                    writer.add_scalar('agent_{}/loss_critic'.format(i), q_loss,
                                      train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                [
                    U.save_state(os.path.join(arglist.save_dir,
                                              'team_{}'.format(i)),
                                 saver=saver) for i, saver in enumerate(savers)
                ]
                # print statement depends on whether or not there are adversaries

                for i in range(len(agent_rewards)):
                    writer.add_scalar(
                        'agent_{}/mean_episode_reward'.format(i),
                        np.mean(agent_rewards[i][-arglist.save_rate:]),
                        len(episode_rewards))

                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #27
0
def train(arglist):
    with U.single_threaded_session():
        if not os.path.isdir(arglist.save_dir):
            os.makedirs(arglist.save_dir)
        if not os.path.isdir(arglist.benchmark_dir):
            os.makedirs(arglist.benchmark_dir)
        if not os.path.isdir(arglist.plots_dir):
            os.makedirs(arglist.plots_dir)

        #tensorboard
        summary_writer = tf.summary.FileWriter(
            "./" + arglist.exp_name + "_graph/",
            U.get_session().graph)
        reward_plot = None
        reward_summary = tf.Summary()
        reward_summary.value.add(tag='reward', simple_value=reward_plot)

        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        """
        #### USE RVO 
        """
        use_rvo_range = -1  # to enable RVO, set this to e.g. 0.28
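        # When use_rvo_range is non-negative, each agent checks whether any other
        # agent or obstacle (relative positions read from the tail of its
        # observation) lies within that range, and env.step() is told to apply
        # RVO collision avoidance for those agents (use_rvo=total_rvo_list).
        # With the default -1 the environment is stepped with use_rvo=None.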

        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            if use_rvo_range < 0:
                new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                            use_rvo=None)
            else:
                # use_rvo list
                total_rvo_list = []
                for obs in obs_n:
                    agent_pos = obs[-2 * (env.world.num_agents - 1)::]
                    obst_pos = obs[-2 * (env.world.num_agents +
                                         env.world.num_obstacles)::]
                    agent_rvo_list = []
                    for i in range(0, len(agent_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                agent_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)
                    for i in range(0, len(obst_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                obst_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)

                    if any(agent_rvo_list):
                        total_rvo_list.append(True)
                    else:
                        total_rvo_list.append(False)
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(
                    action_n, use_rvo=total_rvo_list)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # add reward to tensorboard
            reward_summary.value[0].simple_value = np.mean(
                episode_rewards[-arglist.save_rate:])
            summary_writer.add_summary(reward_summary, len(episode_rewards))

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))

                t_start = time.time()
            if terminal:
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) % 1000 == 0:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' + str(
                    len(episode_rewards))
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('saved')
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #28
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Pretrain the safety_layer
        safety_layer = None
        if arglist.use_safety_layer:
            safety_layer = SafetyLayer(env,
                                       len(env.world.landmarks) - 1,
                                       mlp_model_safety_layer,
                                       env.observation_space[0].shape,
                                       env.action_space, trainers[0].action)
            # set safety_layer for trainer[0]
            trainers[0].set_safety_layer(safety_layer)
        if arglist.use_mpc_layer:
            safety_layer = MpcLayer(env)
            # set safety_layer for trainer[0]
            trainers[0].set_safety_layer(safety_layer)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        episode_step = 0
        train_step = 0
        cumulative_constraint_violations = 0
        t_start = time.time()
        data_save = []
        num_done = 0

        # pickle env
        # env0 = copy.deepcopy(env)
        '''file_path = open('env.pkl', 'rb')
        import pickle
        for i in range(len(env.world.landmarks)):
            env.world.landmarks[i] = pickle.load(file_path)
        for i in range(len(env.world.agents)):
            env.world.agents[i] = pickle.load(file_path)
        obs_n = []
        agents = env.world.agents
        for agent in agents:
            obs_n.append(env._get_obs(agent))'''

        print('Starting iterations...')
        while True:
            # get constraint_values
            c_n = env.get_constraint_values()
            is_any_collision = env.is_any_collision()
            if is_any_collision[0]:
                cumulative_constraint_violations = cumulative_constraint_violations + 1
            '''if c_n[0][0] > 0:
                print("there is a c_n > 0")'''
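            # From the unpacking below, action_real(obs, c, env) returns a 3-tuple
            # per agent; the second element is the action actually passed to
            # env.step(), and the third (if_call) presumably flags whether the
            # safety/MPC layer was invoked. Only the first agent's tuple is used.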
            # get action
            action_n = [
                agent.action_real(obs, c, env)
                for agent, obs, c in zip(trainers, obs_n, c_n)
            ]
            action_real = [action_n[0][0]]
            if_call = [action_n[0][2]]
            action_n = [action_n[0][1]]
            data_save.append(
                np.concatenate([obs_n[0], action_n[0], action_n[0]]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                        if_call=if_call)
            '''is_any_collision_new = env.is_any_collision()
            if is_any_collision_new[0]:
                env.is_any_collision()
                dist = np.sqrt(np.sum(np.square(env.agents[0].state.p_pos - env.world.landmarks[0].state.p_pos))) -\
                       (env.agents[0].size + env.world.landmarks[0].size)
                # print("aaa", env.agents[0].state.p_pos, dist)'''

            # new c_n
            # new_c_n = env.get_constraint_values()
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len) or \
                       (env.agents[0].state.p_pos[0] - env.world.landmarks[-1].state.p_pos[0]) > 1.5
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if done:
                    num_done = num_done + 1

                data_save.append(
                    np.concatenate([obs_n[0], action_n[0], action_n[0]]))
                data_save = np.array(data_save)
                '''np.savetxt("data_save.txt", data_save)'''  # by default saves with '%.18e' format, space-delimited

                # plot x, y, v, theta
                a = data_save
                V = a[:, 1]
                x = a[:, 2]
                y = a[:, 3]
                theta = a[:, 4]
                omega = a[:, 5]
                # action_n = a[:, 26] - a[:, 27]
                # action_real = a[:, 31] - a[:, 32]
                fig, ax0 = plt.subplots()
                for i, landmark in enumerate(env.world.landmarks[:-1]):
                    p_pos = landmark.state.p_pos
                    r = landmark.size
                    circle = mpathes.Circle(p_pos,
                                            r,
                                            facecolor='w',
                                            edgecolor='forestgreen',
                                            linestyle='-.')
                    ax0.add_patch(circle)
                for i, landmark in enumerate(env.world.landmarks):
                    p_pos = landmark.state.p_pos
                    r = (landmark.size -
                         0.09) if landmark is not env.world.landmarks[
                             -1] else landmark.size
                    circle = mpathes.Circle(p_pos, r, facecolor='forestgreen')
                    ax0.add_patch(circle)
                for i in range(len(x)):
                    p_pos = np.array([x[i], y[i]])
                    r = env.world.agents[0].size
                    circle = mpathes.Circle(p_pos, r, facecolor='darkgreen')
                    ax0.add_patch(circle)
                ax0.set_xlim((-1, 40))
                ax0.set_ylim((-10, 10))
                ax0.axis('equal')
                ax0.set_title("x-y")
                x1 = [-1, 40]
                y1 = [10, 10]
                y2 = [-10, -10]
                ax0.plot(x1, y1, color='forestgreen', linestyle='-.')
                ax0.plot(x1, y2, color='forestgreen', linestyle='-.')
                plt.show()
                '''fig, ax = plt.subplots(ncols=2, nrows=2)
                for i, landmark in enumerate(env.world.landmarks):
                    p_pos = landmark.state.p_pos
                    r = landmark.size
                    circle = mpathes.Circle(p_pos, r)
                    ax[0, 0].add_patch(circle)
                for i in range(len(x)):
                    p_pos = np.array([x[i], y[i]])
                    r = env.world.agents[0].size
                    circle = mpathes.Circle(p_pos, r)
                    ax[0, 0].add_patch(circle)
                ax[0, 0].set_xlim((-1, 20))
                ax[0, 0].set_ylim((-10.3, 10.3))
                ax[0, 0].set_title("x-y")
                ax[0, 0].axis('equal')
                ax[0, 1].plot(theta)
                ax[0, 1].set_title("theta")
                ax[1, 0].plot(omega)
                ax[1, 0].set_title("omega")
                # ax[1, 1].plot(action_n * 0.12)
                # ax[1, 1].set_title("action_n")
                plt.show()'''

                # reset and continue
                data_save = []
                obs_n = env.reset()
                # env0 = copy.deepcopy(env)
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            '''for agent in trainers:
                loss = agent.update(trainers, train_step)'''

            # save model, display training output
            if (done or terminal) and ((len(episode_rewards) - 1) %
                                       arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # the adversary and no-adversary branches printed the same
                # summary, so a single print suffices
                print(
                    "steps: {}, episodes: {}, mean episode reward: {}, num_cumulative_constraints: {}, num_done: {}, time: {}"
                    .format(train_step,
                            len(episode_rewards) - 1,
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            cumulative_constraint_violations, num_done,
                            round(time.time() - t_start, 3)))
                # print(trainers[0].safety_layer.num_call)
                t_start = time.time()
                num_done = 0
                cumulative_constraint_violations = 0
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
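
The loop above reads env.get_constraint_values() every step and routes actions through agent.action_real(obs, c, env), which points to a safety-layer correction sitting on top of the MADDPG policy. The helper below is only a minimal sketch of such a correction, in the closed-form style of Dalal et al. (2018); safe_action, constraint_models and the linear constraint models themselves are illustrative assumptions, not taken from this example.

import numpy as np

def safe_action(action, obs, c, constraint_models, eps=1e-8):
    """Correct `action` for the most-violated linearized constraint
    g_i(obs) . a + c_i <= 0 (hypothetical helper, not part of the example)."""
    a = np.asarray(action, dtype=np.float64)
    g = [np.asarray(m(obs), dtype=np.float64) for m in constraint_models]
    # closed-form multipliers of the projection QP, one per constraint
    lam = [max((c_i + g_i.dot(a)) / (g_i.dot(g_i) + eps), 0.0)
           for c_i, g_i in zip(c, g)]
    i = int(np.argmax(lam))
    return a - lam[i] * g[i]  # unchanged when every multiplier is zero

In the example's terms, the corrected action would presumably correspond to action_n[0][1] and the uncorrected one to action_n[0][0].
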
Example #29
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark or arglist.plot:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        plot_data = []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            plot_d = env.get_plot_data()

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            plot_data.append(plot_d)

            if done or terminal:
                if arglist.plot:
                    if arglist.scenario in ("simple_spread", "simple_spread_obstacles"):
                        plot_spread(plot_data)
                    elif arglist.scenario in ("simple_formation", "simple_formation_obstacles"):
                        plot_formation(plot_data)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                plot_data = []

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(episode_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(agent_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
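
Example #29 builds its environment through make_env(arglist.scenario, arglist, arglist.benchmark), a helper that is not shown in this listing. In the upstream MADDPG training script the helper loads a particle-world scenario and wraps it in a MultiAgentEnv roughly as below; the extra arglist parameter used here is a local modification, so treat this as a sketch of the reference helper rather than the exact one these examples call.

def make_env(scenario_name, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    # load the scenario module and build its world
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    # the benchmark variant also wires in the scenario's benchmark_data callback
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    return env
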
Example #30
def train(arglist):
    with U.single_threaded_session():
        # create world
        world = World()

        # Create environment
        env = MultiAgentTorcsEnv(world,
                                 0,
                                 world.reset_world,
                                 world.reward,
                                 world.observation,
                                 done_callback=world.done)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = env.adv  #min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        # TODO: call the reset function here
        os.system("pkill torcs")
        # use the location of the TORCS installation on your system
        os.system("cd ~/vtorcs3 && ./torcs &")
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(1)

        obs_n = []
        world.initialize_agents()
        for agent in env.agents:
            obs_n.append(world.observation(agent))
        #obs_n = env.reset()

        episode_step = 0
        train_step = 0
        t_start = time.time()
        episode_count = 0
        epsilon = 1  # exploration parameter, decayed by 1.0 / EXPLORE after every episode
        EXPLORE = 100000.
        train_indicator = 1  # passed to env.step together with epsilon
        print('Starting iterations...')
        while True:
            print("Episode number: " + str(episode_count) + " ")
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(
                action_n, epsilon, train_indicator)
            episode_step += 1

            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                epsilon -= 1.0 / EXPLORE
                episode_step = 0
                episode_rewards.append(0)
                episode_count += 1
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            world.step = train_step
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # not applicable for the TORCS environment
            # for displaying learned policies
            '''if arglist.display:
                time.sleep(0.1)
                env.render()
                continue'''

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            l2 = "Loss is " + str(loss) + "\n"
            with open("log2.txt", "a") as f:
                f.write(l2)
            print(l2)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
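
All three examples rely on get_trainers(env, num_adversaries, obs_shape_n, arglist) to instantiate one MADDPGAgentTrainer per agent. The sketch below follows the upstream MADDPG experiments script, with a shared two-hidden-layer MLP for the policy and Q networks; the safety-layer and TORCS variants above presumably wrap or extend these trainers, so this is a reference sketch rather than their exact helper.

import tensorflow as tf
import tensorflow.contrib.layers as layers
from maddpg.trainer.maddpg import MADDPGAgentTrainer

def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # two-hidden-layer MLP shared by the policy and Q-function heads
    with tf.variable_scope(scope, reuse=reuse):
        out = layers.fully_connected(input, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        return layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)

def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    trainers = []
    # adversary agents first, then the cooperative ("good") agents
    for i in range(num_adversaries):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers

With these helpers in place, trainers[i].action(obs_n[i]) and trainers[i].update(trainers, train_step) behave as used in the training loops above.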