Example #1
# Imports used by this snippet; the project-level modules (utils, learner, eval)
# follow the import paths shown in example #6.
import datetime

import numpy as np
import torch
from tensorboardX import SummaryWriter  # later examples note issues importing tensorboardX

import utils
from learner import setup_master
from eval import evaluate


def train(args, return_early=False):
    writer = SummaryWriter(args.log_dir)    
    envs = utils.make_parallel_envs(args) 
    master = setup_master(args) 
    # used during evaluation only
    eval_master, eval_env = setup_master(args, return_env=True) 
    obs = envs.reset() # shape - num_processes x num_agents x obs_dim
    master.initialize_obs(obs)
    n = len(master.all_agents)
    episode_rewards = torch.zeros([args.num_processes, n], device=args.device)
    final_rewards = torch.zeros([args.num_processes, n], device=args.device)

    # start simulations
    start = datetime.datetime.now()
    for j in range(args.num_updates):
        for step in range(args.num_steps):
            with torch.no_grad():
                actions_list = master.act(step)
            agent_actions = np.transpose(np.array(actions_list),(1,0,2))
            obs, reward, done, info = envs.step(agent_actions)
            reward = torch.from_numpy(np.stack(reward)).float().to(args.device)
            episode_rewards += reward
            masks = torch.FloatTensor(1-1.0*done).to(args.device)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            master.update_rollout(obs, reward, masks)
          
        master.wrap_horizon()
        return_vals = master.update()
        value_loss = return_vals[:, 0]
        action_loss = return_vals[:, 1]
        dist_entropy = return_vals[:, 2]
        master.after_update()

        if j%args.save_interval == 0 and not args.test:
            savedict = {'models': [agent.actor_critic.state_dict() for agent in master.all_agents]}
            ob_rms = (None, None) if envs.ob_rms is None else (envs.ob_rms[0].mean, envs.ob_rms[0].var)
            savedict['ob_rms'] = ob_rms
            savedir = args.save_dir+'/ep'+str(j)+'.pt'
            torch.save(savedict, savedir)

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j%args.log_interval == 0:
            end = datetime.datetime.now()
            seconds = (end-start).total_seconds()
            mean_reward = final_rewards.mean(dim=0).cpu().numpy()
            print("Updates {} | Num timesteps {} | Time {} | FPS {}\nMean reward {}\nEntropy {:.4f} Value loss {:.4f} Policy loss {:.4f}\n".
                  format(j, total_num_steps, str(end-start), int(total_num_steps / seconds), 
                  mean_reward, dist_entropy[0], value_loss[0], action_loss[0]))
            if not args.test:
                for idx in range(n):
                    writer.add_scalar('agent'+str(idx)+'/training_reward', mean_reward[idx], j)

                writer.add_scalar('all/value_loss', value_loss[0], j)
                writer.add_scalar('all/action_loss', action_loss[0], j)
                writer.add_scalar('all/dist_entropy', dist_entropy[0], j)

        if args.eval_interval is not None and j%args.eval_interval==0:
            ob_rms = (None, None) if envs.ob_rms is None else (envs.ob_rms[0].mean, envs.ob_rms[0].var)
            print('===========================================================================================')
            _, eval_perstep_rewards, final_min_dists, num_success, eval_episode_len = evaluate(args, None, master.all_policies,
                                                                                               ob_rms=ob_rms, env=eval_env,
                                                                                               master=eval_master)
            print('Evaluation {:d} | Mean per-step reward {:.2f}'.format(j//args.eval_interval, eval_perstep_rewards.mean()))
            print('Num success {:d}/{:d} | Episode Length {:.2f}'.format(num_success, args.num_eval_episodes, eval_episode_len))
            if final_min_dists:
                print('Final_dists_mean {}'.format(np.stack(final_min_dists).mean(0)))
                print('Final_dists_var {}'.format(np.stack(final_min_dists).var(0)))
            print('===========================================================================================\n')

            if not args.test:
                writer.add_scalar('all/eval_success', 100.0*num_success/args.num_eval_episodes, j)
                writer.add_scalar('all/episode_length', eval_episode_len, j)
                for idx in range(n):
                    writer.add_scalar('agent'+str(idx)+'/eval_per_step_reward', eval_perstep_rewards.mean(0)[idx], j)
                    if final_min_dists:
                        writer.add_scalar('agent'+str(idx)+'/eval_min_dist', np.stack(final_min_dists).mean(0)[idx], j)

            curriculum_success_thres = 0.9
            if return_early and num_success*1./args.num_eval_episodes > curriculum_success_thres:
                savedict = {'models': [agent.actor_critic.state_dict() for agent in master.all_agents]}
                ob_rms = (None, None) if envs.ob_rms is None else (envs.ob_rms[0].mean, envs.ob_rms[0].var)
                savedict['ob_rms'] = ob_rms
                savedir = args.save_dir+'/ep'+str(j)+'.pt'
                torch.save(savedict, savedir)
                print('===========================================================================================\n')
                print('{} agents: training complete. Breaking.\n'.format(args.num_agents))
                print('===========================================================================================\n')
                break

    writer.close()
    if return_early:
        return savedir
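
A minimal launch sketch for the training loop above. It assumes the project's arguments.get_args() helper (imported in example #6) supplies log_dir, num_updates, num_steps, and the other fields used here, and that args.device is left for the caller to set:

import torch
from arguments import get_args  # assumed project helper, as imported in example #6

if __name__ == '__main__':
    args = get_args()
    # assumption: get_args exposes no_cuda but does not set args.device itself
    args.device = torch.device('cpu' if args.no_cuda else 'cuda')
    train(args)
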
Example #2
def evaluate(args,
             seed,
             policies_list,
             ob_rms=None,
             render=False,
             env=None,
             master=None,
             render_attn=True):
    """
    RL evaluation: can be called from the training code or run standalone.
    policies_list should be a list of the policies of all agents;
    len(policies_list) = num agents
    """
    if env is None or master is None:  # if any one of them is None, generate both of them
        master, env = setup_master(args, return_env=True)

    if seed is None:  # ensure env eval seed is different from training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None
    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    # TODO: provide support for recurrent policies and mask
    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0
        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t +
                          info['n'][0]['world_steps']) / (t + 1)

        # for simple spread env only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print(
                "Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}"
                .format(t, info['n'][0]['is_success'], per_step_rewards[t][0],
                        info['n'][0]['world_steps']))
        all_episode_rewards[
            t, :] = episode_rewards  # all_episode_rewards shape: num_eval_episodes x num agents

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
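
For standalone use, a checkpoint written by the training loop in example #1 can be unpacked and passed straight to evaluate(). A sketch assuming the same savedict layout ('models', 'ob_rms'); args.load_dir is a hypothetical field pointing at one of the ep<j>.pt files saved above:

import torch
from arguments import get_args  # assumed project helper, as imported in example #6

args = get_args()
checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
_, per_step_rewards, final_min_dists, num_success, ep_len = evaluate(
    args, seed=None, policies_list=checkpoint['models'],
    ob_rms=checkpoint['ob_rms'], render=True)
print('Success {}/{} | Mean per-step reward {:.2f}'.format(
    num_success, args.num_eval_episodes, per_step_rewards.mean()))
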
Example #3
def evaluate(args,
             seed,
             policies_list,
             ob_rms=None,
             render=True,
             env=None,
             master=None,
             render_attn=True):
    """
    RL evaluation: can be called from the training code or run standalone.
    policies_list should be a list of the policies of all agents;
    len(policies_list) = num agents
    """
    env = utils.make_single_env(args)
    master = setup_master(args, env)
    # if env is None or master is None: # if any one of them is None, generate both of them
    #     master, env = setup_master(args, return_env=True)

    if seed is None:  # ensure env eval seed is different from training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)
    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None
    # print('eval flag 2')
    master.load_models(policies_list)
    master.set_eval_mode()
    # print('eval flag 3')

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    # TODO: provide support for recurrent policies and mask
    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0
    # print('eval flag 4')
    for t in range(num_eval_episodes):
        print('t', t)
        obs = env.reset()
        # obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0

        # print('eval flag 5')
        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render()  ##attn=attn)
        # print('eval flag 6')
        # while not np.all(done):
        for i in range(args.num_env_steps):
            actions = []
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()
            with torch.no_grad():
                actions = master.act(obs, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            # obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                time.sleep(0.1)
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render()  #attn=attn)

                if args.record_video:
                    time.sleep(0.08)
            if done:
                break
        # print('eval flag 7')
        per_step_rewards[t] = episode_rewards / episode_steps
        # print('info[n][0]', info['n'][0])
        num_success += 0  ##* info['n'][0]['is_success']
        episode_length = 0  ##* (episode_length*t + info['n'][0]['world_steps'])/(t+1)

        # for simple spread env only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)
        # print('can you see that i am stuck')
        # if render:
        #     print("Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}".format(t,info['n'][0]['is_success'],
        #         per_step_rewards[t][0],info['n'][0]['world_steps']))
        all_episode_rewards[
            t, :] = episode_rewards  # all_episode_rewards shape: num_eval_episodes x num agents

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
Example #4
def test_fortattack(args, seed, policies_list, ob_rms):
    writer = SummaryWriter(
        args.log_dir)  # some issue in importing tensorboardX

    if seed is None:  # ensure env eval seed is different from training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env = utils.make_single_env(args)
    master = setup_master(args, env)
    obs = env.reset()  # shape - num_agents (total) x obs_dim
    all_obs = [obs]
    # if ob_rms is not None:
    #     obs_mean, obs_std = ob_rms
    # else:
    #     obs_mean = None
    #     obs_std = None

    master.load_models(policies_list)
    master.set_eval_mode()
    n = len(master.all_agents)
    # final_rewards = torch.zeros([args.num_processes, n], device=args.device)

    # start simulations
    start = datetime.datetime.now()
    for j in range(args.num_eval_episodes + 100):  # iterations
        end_pts = []  # time end points of episodes, used by master.wrap_horizon; each entry is (the step at which the episode ended) + 1
        episode_rewards = torch.zeros([args.num_processes, n],
                                      device=args.device)

        print('j (num update)', j)

        ## training
        print('training')
        print('end_pts at the beginning', end_pts, 'train_fortattack.py')

        # print('obs', obs[:,0])
        master.initialize_obs(obs)
        done = False
        step = 0

        # if args.render():
        #     video_path = 'out_files/videos'
        #     self.eval_env.startRecording(video_path)
        #     if self.args.record_video:
        #             self.eval_env.recordFrame()

        while not done:  # data collection steps in each iteration
            # print('step', step, 'train_fortattack')
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()

            with torch.no_grad():
                actions_list, attn_list = master.act(step, masks)  ## IMPORTANT
            agent_actions = np.array(actions_list).reshape(-1)
            obs, reward, done, info = env.step(agent_actions)

            all_obs.append(obs)

            # obs = normalize_obs(obs, obs_mean, obs_std)
            # print('reward')
            # print(reward)

            env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]

            path = 'out_files/Frames/generalize_5_{}_{}.png'.format(j, step)
            # env.saveFrame(attn_list, path)
            # time.sleep(0.06)

            # obs, newdead = obs_newdead
            # print('obs', obs.shape, 'masks', masks.shape)
            # print('done', done)
            reward = torch.from_numpy(np.stack(reward)).float().to(args.device)

            ##* Don't know what final_reward means
            # print(masks.shape, episode_rewards.shape)
            # print(masks)
            # print(episode_rewards)
            # print(masks.dtype, episode_rewards.dtype)
            # print(((1 - masks) * episode_rewards).dtype, (1-masks).dtype,masks.dtype, episode_rewards.dtype)
            # episode_rewards *= masks
            # final_rewards += episode_rewards  # it is (1-masks)*.., but i think it should be masks*...

            # print('episode_rewards')
            # print(episode_rewards)
            # final_rewards *= masks
            master.update_rollout(obs, reward, masks)  ## adds the data point to the rollout storage
            episode_rewards += reward * masks
            # print('step reward', reward)
            # print('done', done)
            ## once masks is used, we can update it
            ## it's just easier to use masks
            # masks = torch.FloatTensor(1-1.0*done).to(args.device)      # mask is to not use rewards for agents

            step += 1
            ##* need to confirm this
            if done:
                end_pts.append(step)
                # time.sleep(1)
                obs = env.reset()
                masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                if not args.no_cuda:
                    masks = masks.cuda()
                master.initialize_new_episode(step, obs, masks)
                time.sleep(1)
                if args.out_file is not None:
                    all_obs = np.array(all_obs)
                    print('all_obs', all_obs.shape)
                    print('path', args.out_file)
                    np.save(args.out_file, all_obs)
                    break

        # print('')
        # if end_pts[-1] != args.num_steps:
        #     end_pts.append(args.num_steps)
        # master.wrap_horizon(end_pts) ## computes the return = (reward + gamam*value), IMPORTANT
        # # master.before_update()
        # vals = master.update()   ## PPO update, IMPORTANT. Multiple iterations of PPO on the last episode
        # value_loss = vals[:, 0]
        # action_loss = vals[:, 1]
        # dist_entropy = vals[:, 2]
        # master.after_update() ## IMPORTANT

        # for agent in master.all_agents:
        #     print('after_update', agent.rollouts.obs.shape, 'train_fortattack.py')

        print('episode_rewards')
        print(episode_rewards)
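
The observation trace dumped to args.out_file above can be reloaded for offline inspection; a small sketch, assuming the array keeps the (num steps, num agents, obs_dim) layout and that obs[:, 0] is the alive flag, as the comments above suggest (the file path is hypothetical):

import numpy as np

all_obs = np.load('out_files/generalize_5_trace.npy')  # hypothetical path; use the same value as args.out_file
print('steps x agents x obs_dim:', all_obs.shape)
alive = all_obs[:, :, 0]  # column 0 is read as the alive/dead mask above
print('mean fraction of agents alive per step: {:.2f}'.format(alive.mean()))
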
Example #5
def train(args, policies_list, return_early=False):
    writer = SummaryWriter(
        args.log_dir)  # some issue in importing tensorboardX
    # env = utils.make_parallel_envs(args)
    env = utils.make_single_env(args)
    master = setup_master(args, env)
    # used during evaluation only
    eval_master, eval_env = setup_master(args, return_env=True)
    obs = env.reset()  # shape - num_agents (total) x obs_dim

    if args.continue_training:
        master.load_models(policies_list)

    n = len(master.all_agents)
    # final_rewards = torch.zeros([args.num_processes, n], device=args.device)

    # start simulations
    start = datetime.datetime.now()
    shift = int(args.ckpt) + 1 if args.continue_training else 0
    for j in range(shift, args.num_updates + shift):  # iterations
        end_pts = []  # time end points of episodes, used by master.wrap_horizon; each entry is (the step at which the episode ended) + 1
        episode_rewards = torch.zeros([args.num_processes, n],
                                      device=args.device)

        print('j (num update)', j)

        ## training
        print('training')

        master.initialize_obs(obs)
        step = 0
        while step < args.num_steps:  # data collection steps in each iteration
            # print('step', step, 'train_fortattack')
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()

            with torch.no_grad():
                actions_list, attn_list = master.act(step, masks)  ## IMPORTANT
            agent_actions = np.array(actions_list).reshape(-1)
            obs, reward, done, info = env.step(agent_actions)

            # print('reward')
            # print(reward)

            # env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]
            # time.sleep(0.01)

            # obs, newdead = obs_newdead
            # print('obs', obs.shape, 'masks', masks.shape)
            # print('done', done)
            reward = torch.from_numpy(np.stack(reward)).float().to(args.device)

            ##* Don't know what final_reward means
            # print(masks.shape, episode_rewards.shape)
            # print(masks)
            # print(episode_rewards)
            # print(masks.dtype, episode_rewards.dtype)
            # print(((1 - masks) * episode_rewards).dtype, (1-masks).dtype,masks.dtype, episode_rewards.dtype)
            # episode_rewards *= masks
            # final_rewards += episode_rewards  # it is (1-masks)*.., but i think it should be masks*...

            # print('episode_rewards')
            # print(episode_rewards)
            # final_rewards *= masks
            master.update_rollout(obs, reward, masks)  ## adds the data point to the rollout storage
            episode_rewards += reward * masks
            # print('step reward', reward)
            # print('done', done)
            ## once masks is used, we can update it
            ## it's just easier to use masks
            # masks = torch.FloatTensor(1-1.0*done).to(args.device)      # mask is to not use rewards for agents

            step += 1
            ##* need to confirm this
            if done:
                end_pts.append(step)
                # time.sleep(1)
                obs = env.reset()
                masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                if not args.no_cuda:
                    masks = masks.cuda()
                master.initialize_new_episode(step, obs, masks)
                # break

        # print('')
        if end_pts[-1] != args.num_steps:
            end_pts.append(args.num_steps)
        master.wrap_horizon(end_pts)  ## computes the return = reward + gamma*value, IMPORTANT
        # master.before_update()
        vals = master.update()  ## PPO update, IMPORTANT. Multiple iterations of PPO on the last episode
        value_loss = vals[:, 0]
        action_loss = vals[:, 1]
        dist_entropy = vals[:, 2]
        master.after_update()  ## IMPORTANT

        ## Saving trained model
        if j % args.save_interval == 0 and not args.test:
            print('saving')
            savedict = {
                'models': [
                    agent.actor_critic.state_dict()
                    for agent in master.all_agents
                ]
            }
            ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
            savedict['ob_rms'] = ob_rms
            savedir = args.save_dir + '/ep' + str(j) + '.pt'
            print(savedir)
            torch.save(savedict, savedir)

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # Logging data to tensorboardX
        if j % args.log_interval == 0:
            print('logging')
            end = datetime.datetime.now()
            seconds = (end - start).total_seconds()
            total_reward = episode_rewards.sum(dim=0).cpu().numpy()
            print('total_reward')
            print(total_reward)
            print(
                "Updates {} | Num timesteps {} | Time {} | FPS {}\nTotal reward {}\nEntropy {:.4f} Value loss {:.4f} Policy loss {:.4f}\n"
                .format(j, total_num_steps, str(end - start),
                        int(total_num_steps / seconds), total_reward,
                        dist_entropy[0], value_loss[0], action_loss[0]))
            if not args.test:
                for idx in range(n):
                    writer.add_scalar('agent' + str(idx) + '/training_reward',
                                      total_reward[idx], j)
                    print('idx', idx, 'total_reward[idx]', total_reward[idx])

                writer.add_scalar('all/value_loss', value_loss[0], j)
                writer.add_scalar('all/action_loss', action_loss[0], j)
                writer.add_scalar('all/dist_entropy', dist_entropy[0], j)

        # ## evaluation/validation
        # if args.eval_interval is not None and j%args.eval_interval==0:
        #     print('evaluating')
        #     ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
        #     print('===========================================================================================')
        #     _, eval_perstep_rewards, final_min_dists, num_success, eval_episode_len = evaluate(args, None, master.all_policies,
        #                                                                                        ob_rms=ob_rms, env=eval_env,
        #                                                                                        render = args.render,
        #                                                                                        master=eval_master)
        #     print('Evaluation {:d} | Mean per-step reward {:.2f}'.format(j//args.eval_interval, eval_perstep_rewards.mean()))
        #     print('Num success {:d}/{:d} | Episode Length {:.2f}'.format(num_success, args.num_eval_episodes, eval_episode_len))
        #     if final_min_dists:
        #         print('Final_dists_mean {}'.format(np.stack(final_min_dists).mean(0)))
        #         print('Final_dists_var {}'.format(np.stack(final_min_dists).var(0)))
        #     print('===========================================================================================\n')

        #     if not args.test:
        #         writer.add_scalar('all/eval_success', 100.0*num_success/args.num_eval_episodes, j)
        #         writer.add_scalar('all/episode_length', eval_episode_len, j)
        #         for idx in range(n):
        #             writer.add_scalar('agent'+str(idx)+'/eval_per_step_reward', eval_perstep_rewards.mean(0)[idx], j)
        #             if final_min_dists:
        #                 writer.add_scalar('agent'+str(idx)+'/eval_min_dist', np.stack(final_min_dists).mean(0)[idx], j)
        #     # print('flag3')
        #     curriculum_success_thres = 0.9
        #     if return_early and num_success*1./args.num_eval_episodes > curriculum_success_thres:
        #         savedict = {'models': [agent.actor_critic.state_dict() for agent in master.all_agents]}
        #         ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
        #         savedict['ob_rms'] = ob_rms
        #         savedir = args.save_dir+'/ep'+str(j)+'.pt'
        #         torch.save(savedict, savedir)
        #         print('===========================================================================================\n')
        #         print('{} agents: training complete. Breaking.\n'.format(args.num_agents))
        #         print('===========================================================================================\n')
        #         break

    writer.close()
    if return_early:
        return savedir
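
To resume this loop from a saved checkpoint, the continue_training and ckpt fields referenced above can be combined with the savedict written at each save_interval; a hedged sketch:

import torch
from arguments import get_args  # assumed project helper, as imported in example #6

args = get_args()
args.continue_training = True
args.ckpt = 100  # hypothetical: update index of the checkpoint to resume from
checkpoint = torch.load(args.save_dir + '/ep' + str(args.ckpt) + '.pt',
                        map_location=lambda storage, loc: storage)
train(args, policies_list=checkpoint['models'])
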
Example #6
File: eval.py  Project: goodbyeearth/mpnn
def evaluate(args,
             seed,
             policies_list,
             ob_rms=None,
             render=False,
             env=None,
             master=None,
             render_attn=True):
    """
    RL evaluation: 训练时或者单独使用均可
    policies_list 是所有agent策略的list;
    len(policies_list) = 智能体数量
    """
    import numpy as np
    import torch
    from arguments import get_args
    from utils import normalize_obs
    from learner import setup_master
    import time
    if env is None or master is None:  # if either is None, create both
        master, env = setup_master(args, return_env=True)

    if seed is None:
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None
    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of the episode for simple_spread
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        obs = env.reset()
        obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0
        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render(attn=attn)

        while not np.all(done):
            actions = []
            with torch.no_grad():
                actions = master.eval_act(obs, recurrent_hidden_states, mask)
            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)
            if render:
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render(attn=attn)
                if args.record_video:
                    time.sleep(0.08)

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += info['n'][0]['is_success']
        episode_length = (episode_length * t +
                          info['n'][0]['world_steps']) / (t + 1)

        # simple spread
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        if render:
            print(
                "Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}"
                .format(t, info['n'][0]['is_success'], per_step_rewards[t][0],
                        info['n'][0]['world_steps']))
        all_episode_rewards[
            t, :] = episode_rewards  # all_episode_rewards shape: num_eval_episodes x num agents

        if args.record_video:
            # print(attn)
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
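
normalize_obs is imported from utils but not shown in these examples. Since the checkpoints store (mean, var) from the training environments' ob_rms, it presumably standardizes observations with those running statistics; a minimal sketch of that assumption (the clipping threshold is also an assumption):

import numpy as np

def normalize_obs(obs, obs_mean, obs_std, eps=1e-8, clip=10.0):
    # Sketch only: the real utils.normalize_obs may differ.
    if obs_mean is None or obs_std is None:
        return obs
    # the ob_rms tuple built above puts the variance in the second slot
    return np.clip((obs - obs_mean) / (np.sqrt(obs_std) + eps), -clip, clip)
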
def test_fortattack(args, seed):
    writer = SummaryWriter(args.log_dir)      # some issue in importing tensorboardX
    
    if seed is None: # ensure env eval seed is different from training seed
        seed = np.random.randint(0,100000)
    print("Evaluation Seed: ",seed)
    env = utils.make_single_env(args)
    master = setup_master(args, env) 
    obs = env.reset() # shape - num_agents (total) x obs_dim
    all_obs = [obs]


    # if ob_rms is not None:
    #     obs_mean, obs_std = ob_rms
    # else:
    #     obs_mean = None
    #     obs_std = None


    
    # stats = np.zeros((num_attacker_ckpts, 4+2+2))    # gameResult, num_alive for guards/attackers, rewards for guards/attackers

    guard_load_dir = os.path.join('./marlsave/save_new', args.guard_load_dir)
    guard_ckpt_files = [file for file in os.listdir(guard_load_dir) if os.path.isfile(os.path.join(guard_load_dir,file)) and file.endswith('.pt')]
    guard_ckpt_names = np.sort([int(file[2:-3]) for file in guard_ckpt_files])
    guard_ckpt_files = ['ep'+str(name)+'.pt' for name in guard_ckpt_names]
    
    num_attacker_ckpts = len(args.attacker_ckpts)
    num_episodes = args.num_eval_episodes

    data = np.zeros((len(guard_ckpt_files), num_attacker_ckpts*num_episodes, 11))
    # start simulations
    start = datetime.datetime.now()
    


    for k,ckpt in enumerate(guard_ckpt_files):
        checkpoint = torch.load(os.path.join(guard_load_dir, ckpt), map_location=lambda storage, loc: storage)
        policies_list = checkpoint['models']
        ob_rms = checkpoint['ob_rms']

        master.load_models(policies_list)       ## we are done setting the guards
        master.set_eval_mode()    

        n, n_guards, n_attackers = len(master.all_agents), master.env.world.numGuards, master.env.world.numAliveAttackers

        for i,attacker_ckpt in enumerate(args.attacker_ckpts):      # iterate through attacker strategies
            print('Playing against attacker strategy {}, ckpt {}'.format(i, attacker_ckpt))
            master.select_attacker(attacker_ckpt)

            # data = np.zeros((args.num_eval_episodes, 4+2+2))    # gameResult, num_alive for guards/attackers, rewards for guards/attackers
        
            for j in range(num_episodes):                                      # 3 test runs for each strategy
                episode_rewards = torch.zeros([args.num_processes, n], device=args.device)
                
                master.initialize_obs(obs)
                step = 0

                done = False
                while not done:  # while episode is not over
                    masks = torch.FloatTensor(obs[:,0])		##* check the values of masks, agent alive or dead
                    if not args.no_cuda:
                        masks = masks.cuda()
                    
                    with torch.no_grad():
                        actions_list, attn_list = master.act(step, masks) ## IMPORTANT
                    agent_actions = np.array(actions_list).reshape(-1)
                    obs, reward, done, info = env.step(agent_actions)

                    all_obs.append(obs)

                    
                    if args.render:
                        env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]
                        time.sleep(0.06)

                    
                    reward = torch.from_numpy(np.stack(reward)).float().to(args.device)

                    master.update_rollout(obs, reward, masks)   ## adds the data point to the rolloutstorage
                    episode_rewards += reward*masks
                    
                    step += 1
                    if done:
                        # print(master.env.world.gameResult)
                        # data[j,:2] = master.env.world.gameResult[:2]
                        # data[j,2] = data[j,0] + data[j,1]
                        # data[j,3] = master.env.world.gameResult[2]
                        # data[j,4] = master.env.world.numAliveGuards
                        # data[j,5] = master.env.world.numAliveAttackers 
                        # data[j,6] = np.average(episode_rewards[0,0:n_guards])
                        # data[j,7] = np.average(episode_rewards[0,n_guards:]) 
                        data[k,i*num_episodes+j,0] = guard_ckpt_names[k]
                        data[k,i*num_episodes+j,1:] = episode_rewards.cpu().numpy() 
                        obs = env.reset()
                        masks = torch.FloatTensor(obs[:,0])     ##* check the values of masks, agent alive or dead
                        if not args.no_cuda:
                            masks = masks.cuda()
                        if args.render:
                            time.sleep(2)
                        if args.out_file is not None:
                            all_obs = np.array(all_obs)
                            print('all_obs', all_obs.shape)
                            print('path', args.out_file)
                            np.save(args.out_file, all_obs)
                            break
                
                
                print('episode_rewards')
                print(episode_rewards)
                # time.sleep(2)
            # stats[i] = data.mean(axis = 0)
        # stats = stats.round(2)
        # print('stats')
        # print(stats)
        # np.savetxt("marlsave/stats/stats_ensemble_strategies.csv", stats, delimiter=",")
    np.save(os.path.join(guard_load_dir,'reward_data_ensemble'), data)
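
The reward_data_ensemble array saved above has shape (num guard checkpoints, num attacker strategies x num episodes, 11), with the guard checkpoint id in column 0 and the per-agent episode rewards in the remaining columns. A sketch for summarizing it offline (the run directory name is hypothetical):

import os
import numpy as np

guard_load_dir = os.path.join('./marlsave/save_new', 'my_guard_run')  # hypothetical run name
data = np.load(os.path.join(guard_load_dir, 'reward_data_ensemble.npy'))

ckpt_ids = data[:, 0, 0].astype(int)             # column 0 holds the guard checkpoint id
mean_rewards = data[:, :, 1:].mean(axis=(1, 2))  # average over runs and agents
for ckpt_id, mean_r in zip(ckpt_ids, mean_rewards):
    print('guard ckpt ep{} | mean per-agent episode reward {:.2f}'.format(ckpt_id, mean_r))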